Example #1
import unittest
from unittest import mock

# Import/patch paths assume Airflow 1.10.x, where the sensor lives in
# airflow.sensors.s3_prefix_sensor.
from airflow.sensors.s3_prefix_sensor import S3PrefixSensor


class TestS3PrefixSensor(unittest.TestCase):
    @mock.patch('airflow.sensors.s3_prefix_sensor.S3Hook')
    def test_poke(self, mock_hook):
        s = S3PrefixSensor(task_id='s3_prefix',
                           bucket_name='bucket',
                           prefix='prefix')

        # Not ready while the prefix does not exist yet
        mock_hook.return_value.check_for_prefix.return_value = False
        self.assertFalse(s.poke(None))
        mock_hook.return_value.check_for_prefix.assert_called_once_with(
            prefix='prefix', delimiter='/', bucket_name='bucket')

        # ...and ready once S3Hook reports the prefix present
        mock_hook.return_value.check_for_prefix.return_value = True
        self.assertTrue(s.poke(None))
Example #2
def get_check_cc_index_in_s3_sensor(dag, aws_conn_id):
    # Wait for the Common Crawl index to appear in the public "commoncrawl"
    # bucket. Reschedule mode frees the worker slot between pokes, and
    # soft_fail marks the task as skipped instead of failed on timeout.
    return S3PrefixSensor(
        task_id="check_for_cc_index",
        retries=0,
        aws_conn_id=aws_conn_id,
        bucket_name="commoncrawl",
        prefix=f"crawl-data/{_get_cc_index_template()}",
        poke_interval=60,          # poke once a minute
        timeout=60 * 60 * 24 * 3,  # give up after three days
        soft_fail=True,
        mode="reschedule",
    )
Example #3
default_args = {
    'retries': 0,
    'retry_delay': timedelta(minutes=2),
    'provide_context': True,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False
}

# Initialize the DAG
dag = DAG('data_pipeline',
          default_args=default_args,
          dagrun_timeout=timedelta(hours=2),
          schedule_interval='0 3 * * *')

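# Wait until at least one object with the 'raw/green' prefix exists in the bucket.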
s3_sensor = S3PrefixSensor(task_id='s3_sensor',
                           bucket_name=S3_BUCKET_NAME,
                           prefix='raw/green',
                           dag=dag)

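# Run the Glue job 'nyc_raw_to_transform' under the given IAM role.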
aws_glue_task = AWSGlueJobOperator(task_id="glue_task",
                                   job_name='nyc_raw_to_transform',
                                   iam_role_name='AWSGlueServiceRoleDefault',
                                   dag=dag)

S3_URI = "s3://{}/scripts/nyc_aggregations.py".format(S3_BUCKET_NAME)

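# EMR step: copy the PySpark aggregation script from S3 onto the master node.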
SPARK_TEST_STEPS = [{
    'Name': 'setup - copy files',
    'ActionOnFailure': 'CANCEL_AND_WAIT',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['aws', 's3', 'cp', S3_URI, '/home/hadoop/']
Example #4
            >
        )
        PARTITIONED BY (
            `dt` string,
            `hour` string)
        ROW FORMAT SERDE
            'org.openx.data.jsonserde.JsonSerDe'
        WITH SERDEPROPERTIES ("ignore.malformed.json"="true")
        LOCATION 's3a://opay-bi/obus_buried/obdm.client_event'
    """,
                                        schema='obus_dw_ods',
                                        dag=dag)

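# Wait for this execution hour's client-event data to land in S3; the prefix
# is templated with the execution date and hour.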
check_s3_obus_client_event = S3PrefixSensor(
    task_id='check_s3_obus_client_event',
    prefix='obus_buried/obdm.client_event/'
           'dt={{ ds }}/hour={{ execution_date.strftime("%H") }}',
    bucket_name='opay-bi',
    dag=dag)

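# Register the matching dt/hour partition on ods_log_client_event in the Hive metastore.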
add_partitions_obus_client_event = HiveOperator(
    task_id='add_partitions_obus_client_event',
    hql="""
            ALTER TABLE ods_log_client_event ADD IF NOT EXISTS PARTITION (dt = '{{ ds }}', hour = '{{ execution_date.strftime("%H") }}');
        """,
    schema='obus_dw_ods',
    dag=dag)

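# Create the Hive table for server events if it does not already exist.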
create_obus_server_event = HiveOperator(task_id='create_obus_server_event',
                                        hql="""
        CREATE TABLE IF NOT EXISTS `ods_log_server_event`(
            `ip` string,