Example #1
    def test_execute(self, mock_hook):
        mock_hook.return_value.list.return_value = MOCK_FILES

        operator = GCSListObjectsOperator(
            task_id=TASK_ID, bucket=TEST_BUCKET, prefix=PREFIX, delimiter=DELIMITER
        )

        files = operator.execute(None)
        mock_hook.return_value.list.assert_called_once_with(
            bucket_name=TEST_BUCKET, prefix=PREFIX, delimiter=DELIMITER
        )
        self.assertEqual(sorted(files), sorted(MOCK_FILES))
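
The method above is excerpted from a larger test class; a minimal sketch of the scaffolding it assumes, with illustrative constant values and a hook patch target that assumes the Airflow 2.x google provider layout:

from unittest import TestCase, mock

from airflow.providers.google.cloud.operators.gcs import GCSListObjectsOperator

# Illustrative values; the original test defines its own constants.
TASK_ID = "test-gcs-list-operator"
TEST_BUCKET = "test-bucket"
PREFIX = "prefix"
DELIMITER = ".csv"
MOCK_FILES = ["prefix/file_1.csv", "prefix/file_2.csv"]


class TestGCSListObjectsOperator(TestCase):
    # Patching GCSHook keeps the test offline; the patch targets the name
    # where the operator module looks it up in the Airflow 2.x provider.
    @mock.patch("airflow.providers.google.cloud.operators.gcs.GCSHook")
    def test_execute(self, mock_hook):
        ...  # body as shown above
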
Example #2

BUCKET_FILE_LOCATION = PATH_TO_UPLOAD_FILE.rpartition("/")[-1]

with models.DAG(
    "example_gcs", default_args=default_args, schedule_interval=None, tags=['example'],
) as dag:
    create_bucket1 = GCSCreateBucketOperator(
        task_id="create_bucket1", bucket_name=BUCKET_1, project_id=PROJECT_ID
    )

    create_bucket2 = GCSCreateBucketOperator(
        task_id="create_bucket2", bucket_name=BUCKET_2, project_id=PROJECT_ID
    )

    list_buckets = GCSListObjectsOperator(
        task_id="list_buckets", bucket=BUCKET_1
    )

    list_buckets_result = BashOperator(
        task_id="list_buckets_result",
        bash_command="echo \"{{ task_instance.xcom_pull('list_buckets') }}\"",
    )

    upload_file = LocalFilesystemToGCSOperator(
        task_id="upload_file",
        src=PATH_TO_UPLOAD_FILE,
        dst=BUCKET_FILE_LOCATION,
        bucket=BUCKET_1,
    )

    transform_file = GCSFileTransformOperator(
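        # Hypothetical completion of this call (the excerpt ends here);
        # the destination object name and the script path below are
        # placeholders, not values from the original example.
        task_id="transform_file",
        source_bucket=BUCKET_1,
        source_object=BUCKET_FILE_LOCATION,
        # The destination defaults to the source object; this placeholder
        # writes the transformed copy under a new name instead.
        destination_object="transformed_" + BUCKET_FILE_LOCATION,
        # transform_script is the command to run: executable plus arguments.
        transform_script=["python", "/path/to/transform_script.py"],
    )
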
Example #3
# Imports assumed by this example (Airflow 2.x provider import paths)
import uuid

from airflow import models
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.providers.google.cloud.operators.gcs import (
    GCSCreateBucketOperator,
    GCSDeleteBucketOperator,
    GCSListObjectsOperator,
)
from airflow.utils.dates import days_ago
from airflow.utils.state import State

# Assumes an Airflow Variable set to the name of the GCP project
PROJECT_ID = models.Variable.get("gcp_project")

with models.DAG(
        "example_gcs",
        start_date=days_ago(1),
        schedule_interval=None,
) as dag:
    generate_uuid = PythonOperator(task_id='generate_uuid',
                                   python_callable=lambda: str(uuid.uuid4()))
    create_bucket = GCSCreateBucketOperator(
        task_id="create_bucket",
        bucket_name="{{ task_instance.xcom_pull('generate_uuid') }}",
        project_id=PROJECT_ID)
    list_objects = GCSListObjectsOperator(
        task_id="list_objects",
        bucket="{{ task_instance.xcom_pull('generate_uuid') }}")
    list_buckets_result = BashOperator(
        task_id="list_buckets_result",
        bash_command="echo \"{{ task_instance.xcom_pull('list_objects') }}\"",
    )
    delete_bucket = GCSDeleteBucketOperator(
        task_id="delete_bucket",
        bucket_name="{{ task_instance.xcom_pull('generate_uuid') }}")

    generate_uuid >> create_bucket >> list_objects >> list_buckets_result >> delete_bucket

if __name__ == "__main__":
    dag.clear(dag_run_state=State.NONE)
    dag.run()
Example #4
    if logs:
        return 'branch_a'
    return 'branch_b'


with DAG(
    'tacc_branch_test',
    default_args=default_args,
    description='Test task: branch',
    schedule_interval='@once',
) as dag:

    get_gcs_object_list = GCSListObjectsOperator(
        task_id='get_gcs_object_list',
        bucket='your-bucket-name', 
        prefix='your-path', 
        delimiter='.log'
    )

    branching = BranchPythonOperator(
        task_id='branching', 
        python_callable=_check_if_log_exists
    )

    branch_a = DummyOperator(
        task_id='branch_a'
    )

    branch_b = DummyOperator(
        task_id='branch_b'
    )
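
The excerpt omits the header of _check_if_log_exists and the task dependencies; a minimal sketch of both, assuming the callable branches on the XCom result of get_gcs_object_list (mirroring Example #5) and that branching fans out to the two dummy tasks:

def _check_if_log_exists(ti, **kwargs):
    # Assumption: branch on whether the listing task returned any objects.
    logs = ti.xcom_pull(task_ids='get_gcs_object_list')
    if logs:
        return 'branch_a'
    return 'branch_b'


# Assumed dependencies (placed inside the DAG context in the original layout):
get_gcs_object_list >> branching >> [branch_a, branch_b]
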
Example #5
    # xcom_pull with a list of task_ids returns a list of values, so [0]
    # selects the result of 'check_gcs_logs'.
    logs = ti.xcom_pull(task_ids=['check_gcs_logs'], key="return_value")[0]
    if logs:
        # Returning a list of task ids from a BranchPythonOperator callable
        # follows all of the named downstream tasks, not just one branch.
        # return ['create_dataproc', 'initial_ato_sn_calc_final', 'initial_ato_sn_calc_daily']
        return ['initial_ato_sn_calc_final', 'initial_ato_sn_calc_daily']

    return 'branch_b'


with DAG('etl',
         default_args=default_args,
         description='etl',
         schedule_interval='36 17 * * *') as dag:

    check_gcs_logs = GCSListObjectsOperator(
        task_id='check_gcs_logs',
        bucket=bucket_config['upload_data'],
        prefix='logs/{{ ds_nodash }}',
        delimiter='.log',
        gcp_conn_id=gcp_config['conn_id'])

    branching = BranchPythonOperator(task_id='branching',
                                     python_callable=_check_if_log_exists)

    branch_b = DummyOperator(task_id='branch_b')

    # create_dataproc = DataprocCreateClusterOperator(
    #     task_id="create_dataproc",
    #     project_id=gcp_config["project_id"],
    #     cluster_name=dataproc_config["cluster_name"],
    #     region=gcp_config["region"],
    #     cluster_config=cluster_config,
    #     gcp_conn_id=gcp_config["conn_id"]