def test_execute(self, mock_s3_hook, mock_transfer_hook):
    """Test the execute function when the run is successful."""
    operator = S3ToGoogleCloudStorageTransferOperator(
        task_id=TASK_ID,
        s3_bucket=S3_BUCKET,
        gcs_bucket=GCS_BUCKET,
        project_id=PROJECT_ID,
    )

    mock_s3_hook.return_value.get_credentials.return_value = Credentials(
        access_key=ACCESS_KEY,
        secret_key=SECRET_KEY,
    )

    operator.execute(None)

    mock_transfer_hook.return_value.create_transfer_job.assert_called_once_with(
        project_id=PROJECT_ID,
        transfer_spec={
            'awsS3DataSource': {
                'bucketName': S3_BUCKET,
                'awsAccessKey': {
                    'accessKeyId': ACCESS_KEY,
                    'secretAccessKey': SECRET_KEY,
                }
            },
            'gcsDataSink': {
                'bucketName': GCS_BUCKET,
            },
            'objectConditions': {},
            'transferOptions': {}
        })
def test_constructor(self):
    """Test S3ToGoogleCloudStorageTransferOperator instance is properly initialized."""
    operator = S3ToGoogleCloudStorageTransferOperator(
        task_id=TASK_ID,
        s3_bucket=S3_BUCKET,
        gcs_bucket=GCS_BUCKET,
        project_id=PROJECT_ID,
    )

    self.assertEqual(operator.task_id, TASK_ID)
    self.assertEqual(operator.s3_bucket, S3_BUCKET)
    self.assertEqual(operator.gcs_bucket, GCS_BUCKET)
    self.assertEqual(operator.project_id, PROJECT_ID)
def test_constructor(self):
    """Test S3ToGoogleCloudStorageTransferOperator instance is properly initialized."""
    operator = S3ToGoogleCloudStorageTransferOperator(
        task_id=TASK_ID,
        s3_bucket=S3_BUCKET,
        gcs_bucket=GCS_BUCKET,
        project_id=PROJECT_ID,
        description=DESCRIPTION,
        schedule=SCHEDULE,
    )

    self.assertEqual(operator.task_id, TASK_ID)
    self.assertEqual(operator.s3_bucket, S3_BUCKET)
    self.assertEqual(operator.gcs_bucket, GCS_BUCKET)
    self.assertEqual(operator.project_id, PROJECT_ID)
    self.assertEqual(operator.description, DESCRIPTION)
    self.assertEqual(operator.schedule, SCHEDULE)
def test_execute_skip_wait(self, mock_s3_hook, mock_transfer_hook):
    """Test the execute function without waiting for the transfer to complete."""
    operator = S3ToGoogleCloudStorageTransferOperator(
        task_id=TASK_ID,
        s3_bucket=S3_BUCKET,
        gcs_bucket=GCS_BUCKET,
        project_id=PROJECT_ID,
        description=DESCRIPTION,
        wait=False,
    )

    mock_s3_hook.return_value.get_credentials.return_value = Credentials(
        access_key=ACCESS_KEY,
        secret_key=SECRET_KEY,
    )

    operator.execute(None)

    mock_transfer_hook.return_value.create_transfer_job.assert_called_once_with(
        project_id=PROJECT_ID,
        description=DESCRIPTION,
        schedule=None,
        transfer_spec={
            'awsS3DataSource': {
                'bucketName': S3_BUCKET,
                'awsAccessKey': {
                    'accessKeyId': ACCESS_KEY,
                    'secretAccessKey': SECRET_KEY,
                }
            },
            'gcsDataSink': {
                'bucketName': GCS_BUCKET,
            },
            'objectConditions': {},
            'transferOptions': {}
        }
    )

    assert not mock_transfer_hook.return_value.wait_for_transfer_job.called
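The test methods above take `mock_s3_hook` and `mock_transfer_hook` arguments, which implies each is wrapped in `mock.patch` decorators targeting the S3 hook and the GCP transfer hook used by the operator, along with a set of module-level constants. A minimal sketch of that scaffolding is below; the patch paths, constant values, class name, and the `Credentials` stand-in are assumptions (the real tests may import `Credentials` from botocore), and must be adjusted to the Airflow version under test.

# Sketch of the assumed test scaffolding; values and patch targets are illustrative.
import unittest
from collections import namedtuple
from unittest import mock

TASK_ID = 'test-s3-gcs-transfer-operator'
S3_BUCKET = 'test-s3-bucket'
GCS_BUCKET = 'test-gcs-bucket'
PROJECT_ID = 'test-project'
DESCRIPTION = 'test-description'
SCHEDULE = {'scheduleStartDate': {'day': 1, 'month': 1, 'year': 2018}}  # illustrative
ACCESS_KEY = 'test-access-key'
SECRET_KEY = 'test-secret-key'

# Simple stand-in for the credentials object returned by S3Hook.get_credentials.
Credentials = namedtuple('Credentials', ['access_key', 'secret_key'])


class TestS3ToGoogleCloudStorageTransferOperator(unittest.TestCase):

    # The patch targets below are assumptions; they must point at the module
    # where the operator actually imports S3Hook and the transfer hook.
    # The bottom decorator maps to the first mock argument (mock_s3_hook).
    @mock.patch('airflow.contrib.operators.s3_to_gcs_transfer_operator.GCPTransferServiceHook')
    @mock.patch('airflow.contrib.operators.s3_to_gcs_transfer_operator.S3Hook')
    def test_execute(self, mock_s3_hook, mock_transfer_hook):
        ...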
    '--location=US',
    'load',
    '--source_format=CSV',
    '--skip_leading_rows=0',
    '--replace',
    "--field_delimiter=\001",
    'blpadi.adi_dimensional_by_date${{ ds_nodash }}',
    'gs://moz-fx-data-derived-datasets-blpadi/blpadi/{{ ds }}/*',
]

s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
    task_id='s3_to_gcs',
    s3_bucket='net-mozaws-data-us-west-2-data-analysis',
    gcs_bucket='moz-fx-data-derived-datasets-blpadi',
    description='blpadi copy from s3 to gcs',
    aws_conn_id='aws_data_iam_blpadi',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions=gcstj_object_conditions,
    transfer_options=gcstj_transfer_options,
    dag=blp_dag)

load_blpadi_to_bq = GKEPodOperator(task_id='bigquery_load',
                                   gcp_conn_id=gcp_conn_id,
                                   project_id=connection.project_id,
                                   location='us-central1-a',
                                   cluster_name='bq-load-gke-1',
                                   name='load-blpadi-to-bq',
                                   namespace='default',
                                   image='google/cloud-sdk:242.0.0-alpine',
                                   arguments=bq_args,
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20191017',  # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):
    """
    Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job to transfer the AWS S3 parquet data into a GCS bucket.
    Once that is completed, we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name: parent dag name
    :param dict default_args: dag configuration
    :param str dataset_s3_bucket: source S3 Bucket
    :param str dataset_gcs_bucket: destination GCS Bucket
    :param str aws_conn_id: airflow connection id for S3 access
    :param str gcp_conn_id: airflow connection id for GCP access
    :param str dataset: dataset name
    :param str dataset_version: dataset version
    :param str date_submission_col: dataset date submission column
    :param str ds_type: dataset format (ds or ds_nodash)
    :param str gke_location: GKE cluster zone
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use for GKE pod operations
    :param str bigquery_dataset: bigquery load destination dataset
    :param str p2b_concurrency: number of processes for parquet2bigquery load
    :param str p2b_table_alias: override p2b table name with alias
    :param bool p2b_resume: allow resume support, defaults to False
    :param bool reprocess: enable dataset reprocessing, defaults to False
    :param str objects_prefix: custom objects_prefix to override defaults
    :param str spark_gs_dataset_location: custom spark dataset load location to override defaults
    :param List[str] cluster_by: top level fields to cluster by when creating destination table
    :param List[str] drop: top level fields to exclude from destination table
    :param Dict[str, str] rename: top level fields to rename in destination table
    :param List[str] replace: top level field replacement expressions

    :return: airflow.models.DAG
    """
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)

    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes': _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
    ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]
    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

    bq_table_name = p2b_table_alias or normalize_table_id('_'.join([dataset,
                                                                    dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if dataset_s3_bucket is not None:
            s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
                task_id='s3_to_gcs',
                s3_bucket=dataset_s3_bucket,
                gcs_bucket=gcs_buckets['transfer'],
                description=_objects_prefix,
                aws_conn_id=aws_conn_id,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                object_conditions=gcstj_object_conditions,
                transfer_options=gcstj_transfer_options,
            )
        else:
            s3_to_gcs = DummyOperator(task_id='no_s3_to_gcs')

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bigquery_dataset, bq_table_name),  # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
        )

        s3_to_gcs >> reprocess >> remove_bq_table >> bulk_load

    return dag
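The docstring notes that load_to_bigquery is used with SubDagOperator: the function returns a DAG that a parent DAG wraps as a subdag task. A minimal sketch of such a caller is below; the task id, dataset names, bucket, and connection ids are illustrative assumptions, not values taken from the snippets above. Note that the SubDagOperator task_id must match the dag_name passed to load_to_bigquery so the subdag's id resolves to parent_dag_id.task_id.

# Hypothetical caller; `dag` and `default_args` are assumed to be defined by the parent DAG.
from airflow.operators.subdag_operator import SubDagOperator

load_example_to_bq = SubDagOperator(
    task_id='load_example_dataset_to_bigquery',
    dag=dag,
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name='load_example_dataset_to_bigquery',
        default_args=default_args,
        dataset_s3_bucket='example-source-s3-bucket',  # assumed bucket name
        aws_conn_id='aws_example_conn',                # assumed connection id
        dataset='example_dataset',
        dataset_version='v1',
        gke_cluster_name='bq-load-gke-1',
    ),
)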
project_id = 'my-gcp-project'
gcs_bucket = 's3-to-bq-' + s3_bucket  # temporary bucket to store file
dag_id = re.sub('[^0-9a-zA-Z]+', '_', include_prefix)  # use s3 prefix as dag name so it is unique

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': start_date,
    'end_date': end_date,
    'email': email_alert,
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with DAG(dag_id, schedule_interval=schedule_interval, default_args=default_args) as dag:
    s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
        task_id='s3_to_gcs',
        s3_bucket=s3_bucket,
        project_id=project_id,
        gcs_bucket=gcs_bucket,
        description='_'.join([gcs_bucket, include_prefix]),
        object_conditions={
            'include_prefixes': [
                include_prefix
            ]
        },
        replace=True
    )

    s3_to_gcs
gcs_data_bucket = 'moz-fx-data-prod-socorro-data'

dataset = 'socorro_crash'
dataset_version = 'v2'
date_submission_col = 'crash_date'

objects_prefix = '{}/{}/{}={}'.format(dataset,
                                      dataset_version,
                                      date_submission_col,
                                      "{{ ds_nodash }}")

# copy json crashstats from s3 to gcs
s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
    task_id='s3_to_gcs',
    s3_bucket='crashstats-telemetry-crashes-prod-us-west-2',
    gcs_bucket=gcs_data_bucket,
    description='socorro crash report copy from s3 to gcs',
    aws_conn_id=read_aws_conn_id,
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions={'includePrefixes': 'v1/crash_report/{{ ds_nodash }}'},
    transfer_options={'deleteObjectsUniqueInSink': True},
    dag=dag,
)

# Spark job reads gcs json and writes gcs parquet
crash_report_parquet = SubDagOperator(
    task_id="crash_report_parquet",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name='crash_report_parquet',
        default_args=default_args,
        cluster_name=cluster_name,