    def test_execute_timeout(self, mock_hook):
        task = GoogleCloudStoragePrefixSensor(
            task_id="task-id",
            bucket=TEST_BUCKET,
            prefix=TEST_PREFIX,
            poke_interval=0,
            timeout=1)
        mock_hook.return_value.list.return_value = []
        with self.assertRaises(AirflowSensorTimeout):
            task.execute(mock.MagicMock)
        # The assertion must sit outside the assertRaises block; inside it,
        # the line was never reached once execute() raised.
        mock_hook.return_value.list.assert_called_with(
            TEST_BUCKET, prefix=TEST_PREFIX)
    def test_should_return_false_on_empty_list(self, mock_hook):
        task = GoogleCloudStoragePrefixSensor(
            task_id="task-id",
            bucket=TEST_BUCKET,
            prefix=TEST_PREFIX,
            google_cloud_conn_id=TEST_GCP_CONN_ID,
            delegate_to=TEST_DELEGATE_TO,
        )
        mock_hook.return_value.list.return_value = []
        result = task.poke(mock.MagicMock)

        self.assertEqual(False, result)
    def test_should_pass_arguments_to_hook(self, mock_hook):
        task = GoogleCloudStoragePrefixSensor(
            task_id="task-id",
            bucket=TEST_BUCKET,
            prefix=TEST_PREFIX,
            google_cloud_conn_id=TEST_GCP_CONN_ID,
            delegate_to=TEST_DELEGATE_TO,
        )
        mock_hook.return_value.list.return_value = ["NOT_EMPTY_LIST"]
        result = task.poke(mock.MagicMock)

        mock_hook.assert_called_once_with(
            delegate_to=TEST_DELEGATE_TO,
            google_cloud_storage_conn_id=TEST_GCP_CONN_ID
        )
        mock_hook.return_value.list.assert_called_once_with(TEST_BUCKET, prefix=TEST_PREFIX)
        self.assertEqual(True, result)
    def test_execute(self, mock_hook):
        task = GoogleCloudStoragePrefixSensor(
            task_id="task-id",
            bucket=TEST_BUCKET,
            prefix=TEST_PREFIX,
            google_cloud_conn_id=TEST_GCP_CONN_ID,
            delegate_to=TEST_DELEGATE_TO,
            poke_interval=0)
        generated_messages = ['test-prefix/obj%s' % i for i in range(5)]
        mock_hook.return_value.list.return_value = generated_messages

        response = task.execute(None)

        mock_hook.assert_called_once_with(
            delegate_to=TEST_DELEGATE_TO,
            google_cloud_storage_conn_id=TEST_GCP_CONN_ID)
        mock_hook.return_value.list.assert_called_once_with(TEST_BUCKET,
                                                            prefix=TEST_PREFIX)
        self.assertEqual(response, generated_messages)
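
For context, a minimal sketch of the poke() behaviour these tests pin down, assuming the Airflow 1.10 GCS hook API (hook.list(bucket, prefix=...)); this is an illustration, not the upstream source:

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from airflow.sensors.base_sensor_operator import BaseSensorOperator


class PrefixSensorSketch(BaseSensorOperator):
    """Succeeds as soon as any object in `bucket` starts with `prefix`."""

    def __init__(self, bucket, prefix, google_cloud_conn_id='google_cloud_default',
                 delegate_to=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.bucket = bucket
        self.prefix = prefix
        self.google_cloud_conn_id = google_cloud_conn_id
        self.delegate_to = delegate_to

    def poke(self, context):
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_conn_id,
            delegate_to=self.delegate_to)
        # An empty list means no matching blob yet, so the sensor keeps waiting
        # until a blob appears or the timeout raises AirflowSensorTimeout.
        return bool(hook.list(self.bucket, prefix=self.prefix))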
Example #5
    False,
    'start_date':
    datetime(2021, 2, 22, 0, 0, tzinfo=pendulum.timezone('Asia/Tokyo')),
    'email_on_failure':
    False,
    'email_on_retry':
    False,
    'retries':
    0,
}
dag = DAG(DAG_NAME,
          schedule_interval=None,
          default_args=default_args,
          catchup=False)

# Task configuration
sensor = GoogleCloudStoragePrefixSensor(
    task_id='sensor',
    # Bucket to watch
    bucket=consts.PROJECT_ID + '-' + consts.CSV_BUCKET,
    # Files under the watched bucket (prefix match)
    prefix='{}/{}'.format(consts.FOLDER_NAME, consts.DATA_NAME),
    # How long to keep polling, in seconds.
    # The default is 60 * 60 * 24 * 7.
    timeout=60 * 60,
    # Whether to fail this task when no file is detected before the timeout.
    # With soft_fail=True the task ends up Skipped instead of Failed.
    soft_fail=True,
    dag=dag,
)
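
Example #5 stops at the sensor definition; as a hedged sketch, here is how a downstream task could be wired after it (load_csv is an illustrative placeholder, not part of the original DAG):

from airflow.operators.dummy_operator import DummyOperator

# With soft_fail=True a sensor timeout marks `sensor` as Skipped, and downstream
# tasks with the default trigger_rule are then skipped rather than failed.
load_csv = DummyOperator(task_id='load_csv', dag=dag)
sensor >> load_csv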
Example #6
    'retries': 1,
    'retry_delay': timedelta(minutes=10),
}


def demo(argument, **context):
    print(argument)
    print('input parameters:')
    print(context['dag_run'].conf)


with DAG('demo_dag', schedule_interval=None, default_args=default_args) as dag:

    wait_task = GoogleCloudStoragePrefixSensor(
        task_id='filesensor',
        bucket='{{var.value.gcs_bucket}}',
        prefix='{{var.value.gcs_file}}',
        google_cloud_conn_id='google_cloud_default',
        dag=dag)

    transform_file = GCSFileTransformOperator(
        task_id="transform_file",
        source_bucket='{{var.value.gcs_bucket}}',
        source_object='devfest',
        destination_bucket='{{var.value.gcs_bucket}}',
        destination_object='new_file',
        transform_script=["cp"],
        google_cloud_conn_id='google_cloud_default',
    )

    wait_task >> transform_file
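
    # Hedged sketch, not in the original excerpt: the `demo` callable defined
    # above is never attached to the DAG, so one plausible wiring is shown
    # here. The PythonOperator import and the 'hello' value are assumptions.
    from airflow.operators.python_operator import PythonOperator

    demo_task = PythonOperator(
        task_id='demo_task',
        python_callable=demo,
        op_kwargs={'argument': 'hello'},
        provide_context=True,  # Airflow 1.10 needs this to pass **context
    )

    transform_file >> demo_task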
Example #7
        bash_command=f"mkdir -p {staging_directory}")
    # Deleting the working dir
    cleanup_staging_dir = BashOperator(
        task_id=f"cleanup-staging-dir",
        bash_command=f"rm -rf {staging_directory}")

    # The loop below iterates over each dict in the list, i.e. over the files to be ingested into BQ.
    for config in file_configurations:

        filename = f"{config['base_filename']}"

        # Out of the box Sensor for checking inside the gcs bucket against the prefix provided.
        file_sensor = GoogleCloudStoragePrefixSensor(
            task_id=f"wait-for-{filename}",
            bucket=variables["input_bucket"],
            prefix=filename,
            timeout=10000,
            poke_interval=2000,
            mode="reschedule",
        )
        """
        Once the above task is finished and sensor has verified the file, then below custom op will copy the files to 
        staging dir inside airflow bucket.
        """
        copy_file_to_staging = GcstoGcsWithDestFilename(
            task_id=f"copy-{filename}-to-staging",
            source_bucket=variables["input_bucket"],
            source_object=filename + ".csv",
            destination_bucket=variables["airflow_home_bucket"],
            destination_object="data/tempfiles/vehicle_options_ingest/" +
            filename + ".csv",
        )
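
        # Hedged sketch, not in the original excerpt: the dependency wiring is
        # cut off here; the docstring above implies the copy runs only after
        # the sensor has seen the file.
        file_sensor >> copy_file_to_staging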
Example #8
        date_offset) + ")).strftime(\"%Y\") }}"

    return {"date": date_config, "month": month_config, "year": year_config}


def gcs_prefix_check(date_offset):
    '''Return a string in YYYY/MM/DD format, emulating the sample directory structure in GCS.'''

    date_dict = dynamic_date(date_offset)
    return date_dict["year"] + "/" + date_dict["month"] + "/" + date_dict[
        "date"]


gcs_prefix_check = GoogleCloudStoragePrefixSensor(
    dag=dag,
    task_id="gcs_prefix_check",
    bucket="example-bucket",
    prefix="dir1/dir2" + gcs_prefix_check(3)
)  # GoogleCloudStoragePrefixSensor checks GCS for the existence of any BLOB which matches operator's prefix

start_cluster_example = DataprocClusterCreateOperator(
    dag=dag,
    task_id='start_cluster_example',
    cluster_name='example-{{ ds }}',
    project_id="your-project-id",
    num_workers=2,
    num_preemptible_workers=4,
    master_machine_type='n1-standard-4',
    worker_machine_type='n1-standard-4',
    worker_disk_size=300,
    master_disk_size=300,
    image_version='1.4-debian9',
)
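
# Hedged sketch, not in the original excerpt: the wiring is cut off here; the
# comment next to the sensor suggests the prefix check gates the cluster
# creation.
gcs_prefix_check >> start_cluster_example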
Example #9
params = params.generate_params(properties)

with DAG('bq_customer_loyalty',
         default_args=properties.default_args,
         schedule_interval=datetime.timedelta(days=1),
         catchup=False) as dag:

    inbound_bucket = params['inbound_bucket']
    inbound_dir = params['inbound_dir']
    google_cloud_conn_id = params['google_cloud_conn_id']

    inbound_full_path = 'gs://%s/%s' % (inbound_bucket, inbound_dir)

    files_sensor_task = GoogleCloudStoragePrefixSensor(
        task_id="gs_sensor",
        google_cloud_conn_id=google_cloud_conn_id,
        bucket=get_bucket_from_name(inbound_full_path),
        prefix=inbound_dir)

    generate_batch_id = PythonOperator(task_id='batch-id-gen',
                                       provide_context=True,
                                       python_callable=generate_batch_id)

    clear_xcom = PythonOperator(task_id='delete_xcom_bach_id',
                                provide_context=True,
                                python_callable=clear_xcom_bach_id,
                                trigger_rule=TriggerRule.ALL_DONE)

    # OFFER_EMAIL_LIST
    offer_and_email_list = dag_generator.generate_dag(
        branch_name='offer_email_list', params=params, dag=dag)
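
    # Hedged sketch, not in the original excerpt: the wiring is cut off here;
    # at minimum the sensor presumably gates the batch-id generation.
    files_sensor_task >> generate_batch_id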
Example #10
    # listen customers FTP server folder w/ sensor
    # define sftp-default connection in Airflow UI
    # t1 = SFTPSensor(
    #     task_id='listen-sftp-server',
    # )

    # TODO: not implemented
    # copy from sftp to gcp storage's incoming folder
    # scenario will start right from here !
    # t2 = SFTPOperator(
    #     task_id='transfer-to-incoming',
    # )

    # Listen to the incoming folder with a sensor.
    # Note: the prefix sensor does literal prefix matching, so the trailing
    # '*' wildcard is dropped here; wildcards only work in the GCS-to-GCS
    # copy operators below.
    t3 = GoogleCloudStoragePrefixSensor(task_id='listen-incoming-file',
                                        bucket='datalake-datasets-123',
                                        prefix='incoming/sales_transactions_')

    # TODO: better file structure can be defined, such as monthly aggregation datalake/sales/05/sales_transactions_*
    # copy from gcs to datalake for raw data storing
    t4 = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id='copy-to-datalake',
        source_bucket=INGESTION_BUCKET_NAME,
        source_object='incoming/sales_transactions_*',
        destination_bucket=INGESTION_BUCKET_NAME,
        destination_object='datalake/sales_transactions_',
        move_object=False)

    # copy from incoming to process for analytical calculations
    t5 = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id='move-to-processing',

Example #11
def cleanup_all_blobs(**context):
    blob_names = context['task_instance'].xcom_pull(task_ids='list_blobs_task')
    for blob_name in blob_names:
        logging.info('Deleting blob : %s', blob_name)
        delete_blob(bucket_name=watch_bucket, blob_name=blob_name)
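
# Hedged sketch of the `delete_blob` helper assumed above, backed directly by
# the google-cloud-storage client; the original helper module is not shown.
from google.cloud import storage


def delete_blob(bucket_name, blob_name):
    """Delete a single blob from the given bucket."""
    storage.Client().bucket(bucket_name).blob(blob_name).delete()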


start_task = DummyOperator(task_id="Start", retries=0, dag=dag)
end_task = DummyOperator(task_id="End", retries=0, dag=dag)

# Note: provide_context is a PythonOperator argument, not a sensor argument,
# so it is dropped here.
blob_exists_check = GoogleCloudStoragePrefixSensor(task_id='blob_exist_check',
                                                   bucket=watch_bucket,
                                                   prefix="rtf_",
                                                   poke_interval=30,
                                                   timeout=10,
                                                   soft_fail=True,
                                                   dag=dag)

blob_list_task = PythonOperator(task_id='list_blobs_task',
                                python_callable=fetch_blobs_list,
                                dag=dag,
                                provide_context=True)
blob_move_task = PythonOperator(task_id='copy_blobs_task',
                                python_callable=copy_all_blobs,
                                dag=dag,
                                provide_context=True)
blob_delete_task = PythonOperator(task_id='delete_blobs_task',
                                  python_callable=cleanup_all_blobs,
                                  dag=dag,
Example #12
    def __init__(self, prefixes: list = ['*'], *args, **kwargs):
        GoogleCloudStoragePrefixSensor.__init__(
            self, prefix='*', *args, **kwargs)
        self.prefixes = prefixes
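
    # Hedged sketch, not in the original excerpt: a matching poke() override so
    # the subclass actually consults self.prefixes. The hook class and the
    # attribute names mirror the base sensor and are assumptions here.
    def poke(self, context):
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_conn_id,
            delegate_to=self.delegate_to)
        return any(hook.list(self.bucket, prefix=prefix)
                   for prefix in self.prefixes)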