def test_delete_objects(self, mock_hook):
    operator = GoogleCloudStorageDeleteOperator(
        task_id=TASK_ID, bucket_name=TEST_BUCKET, objects=MOCK_FILES[0:2]
    )

    operator.execute(None)

    mock_hook.return_value.list.assert_not_called()
    mock_hook.return_value.delete.assert_has_calls(
        calls=[
            mock.call(bucket_name=TEST_BUCKET, object_name=MOCK_FILES[0]),
            mock.call(bucket_name=TEST_BUCKET, object_name=MOCK_FILES[1]),
        ],
        any_order=True,
    )
def test_delete_prefix(self, mock_hook):
    mock_hook.return_value.list.return_value = MOCK_FILES[1:3]
    operator = GoogleCloudStorageDeleteOperator(
        task_id=TASK_ID, bucket_name=TEST_BUCKET, prefix=PREFIX
    )

    operator.execute(None)

    mock_hook.return_value.list.assert_called_once_with(
        bucket_name=TEST_BUCKET, prefix=PREFIX
    )
    mock_hook.return_value.delete.assert_has_calls(
        calls=[
            mock.call(bucket_name=TEST_BUCKET, object_name=MOCK_FILES[1]),
            mock.call(bucket_name=TEST_BUCKET, object_name=MOCK_FILES[2]),
        ],
        any_order=True,
    )
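# The two tests above take a mock_hook argument but omit their fixtures. Below is a
# minimal sketch of the scaffolding they appear to rely on; the constants, the
# import path, and the mock.patch target are assumptions for illustration and may
# differ between Airflow versions.
import unittest
from unittest import mock

from airflow.contrib.operators.gcs_delete_operator import GoogleCloudStorageDeleteOperator

# Hypothetical test constants matching the names used in the tests above.
TASK_ID = "gcs-delete-operator"
TEST_BUCKET = "test-bucket"
PREFIX = "prefix"
MOCK_FILES = ["a.csv", "b.csv", "c.csv", "d.csv"]


class TestGoogleCloudStorageDeleteOperator(unittest.TestCase):
    # Patch the GCS hook where the operator imports it, so operator.execute()
    # talks to the mock instead of real GCS; each test receives it as mock_hook.
    @mock.patch("airflow.contrib.operators.gcs_delete_operator.GoogleCloudStorageHook")
    def test_delete_objects(self, mock_hook):
        ...  # body as shown above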
def extract_channel_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    schedule_interval,
    dataset_id,
    channel,
):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_extract_firefox_{}_v1".format(channel)
    etl_query = bigquery_etl_query(
        task_id="glam_client_probe_counts_{}_extract".format(channel),
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace",),
        sql_file_path="sql/moz-fx-data-shared-prod/{}/glam_client_probe_counts_extract_v1/query.sql".format(
            dataset_id
        ),
        parameters=("channel:STRING:{}".format(channel),),
        dag=dag,
    )

    gcs_delete = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_old_{}_extracts".format(channel),
        bucket_name=glam_bucket,
        prefix="aggs-desktop-{}".format(channel),
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{bucket}/aggs-desktop-{channel}-*.csv".format(
        bucket=glam_bucket, channel=channel
    )
    bq2gcs = BigQueryToCloudStorageOperator(
        task_id="glam_extract_{}_to_csv".format(channel),
        source_project_dataset_table="{}.{}.{}".format(
            project_id, dataset_id, bq_extract_table
        ),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    etl_query >> gcs_delete >> bq2gcs

    return dag
def extract_channel_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    schedule_interval,
    dataset_id,
    channel,
):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_client_probe_counts_{}_extract_v1".format(channel)
    glam_client_probe_counts_extract = bigquery_etl_query(
        task_id="glam_client_probe_counts_{}_extract".format(channel),
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace",),
        dag=dag,
    )

    glam_gcs_delete_old_extracts = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_old_{}_extracts".format(channel),
        bucket_name=glam_bucket,
        prefix="extract-desktop-{}".format(channel),
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{}/extract-desktop-{}-*.csv".format(glam_bucket, channel)
    glam_extract_to_csv = BigQueryToCloudStorageOperator(
        task_id="glam_extract_{}_to_csv".format(channel),
        source_project_dataset_table="{}.{}.{}".format(
            project_id, dataset_id, bq_extract_table
        ),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    glam_client_probe_counts_extract >> glam_gcs_delete_old_extracts >> glam_extract_to_csv

    return dag
def extract_user_counts(
    parent_dag_name, child_dag_name, default_args, schedule_interval, dataset_id
):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_user_counts_extract_v1"
    etl_query = bigquery_etl_query(
        task_id="glam_user_counts_extract",
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace",),
        dag=dag,
    )

    gcs_delete = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_count_extracts",
        bucket_name=glam_bucket,
        prefix="glam-extract-firefox-counts",
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{}/glam-extract-firefox-counts.csv".format(glam_bucket)
    bq2gcs = BigQueryToCloudStorageOperator(
        task_id="glam_extract_user_counts_to_csv",
        source_project_dataset_table="{}.{}.{}".format(
            project_id, dataset_id, bq_extract_table
        ),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    etl_query >> gcs_delete >> bq2gcs

    return dag
task_id="export_main_avro", cmds=["bash"], command=[ "bin/export-avro.sh", "moz-fx-data-shared-prod", "moz-fx-data-shared-prod:analysis", "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease", "main_v4", "'nightly', 'beta'", "{{ ds }}", ], docker_image="mozilla/python_mozaggregator:latest", dag=dag, ).set_downstream(prerelease_telemetry_aggregate_view_dataproc) # Delete the GCS data GoogleCloudStorageDeleteOperator( task_id="delete_main_avro", bucket_name="moz-fx-data-derived-datasets-parquet-tmp", prefix= "avro/mozaggregator/prerelease/moz-fx-data-shared-prod/{{ ds_nodash }}/main_v4", google_cloud_storage_conn_id=gcp_conn.gcp_conn_id, dag=dag, ).set_upstream(prerelease_telemetry_aggregate_view_dataproc) # copy over artifacts if we're running in dev if is_dev: copy_to_dev = copy_artifacts_dev(dag, project_id, artifact_bucket, storage_bucket) copy_to_dev.set_downstream(prerelease_telemetry_aggregate_view_dataproc)
)

export_csv = gke_command(
    task_id="export_csv",
    cmds=["bash"],
    env_vars={"DATASET": "glam_etl"},
    command=["script/glam/export_csv"],
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    dag=dag,
)

gcs_delete = GoogleCloudStorageDeleteOperator(
    task_id="gcs_delete",
    bucket_name=glam_bucket,
    prefix="glam-extract-fenix",
    gcp_conn_id="google_cloud_airflow_dataproc",
    dag=dag,
)

gcs_copy = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="gcs_copy",
    source_bucket="glam-fenix-dev",
    source_object="*.csv",
    destination_bucket=glam_bucket,
    gcp_conn_id="google_cloud_airflow_dataproc",
    dag=dag,
)

wait_for_copy_deduplicate >> run_sql >> export_csv >> gcs_delete >> gcs_copy
    object_name=BUCKET_FILE_LOCATION,
    entity=GCS_ACL_ENTITY,
    role=GCS_ACL_OBJECT_ROLE,
    task_id="gcs_object_create_acl_entry_task",
)
# [END howto_operator_gcs_object_create_acl_entry_task]

download_file = GoogleCloudStorageDownloadOperator(
    task_id="download_file",
    object_name=BUCKET_FILE_LOCATION,
    bucket=BUCKET_1,
    filename=PATH_TO_SAVED_FILE,
)

copy_file = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="copy_file",
    source_bucket=BUCKET_1,
    source_object=BUCKET_FILE_LOCATION,
    destination_bucket=BUCKET_2,
    destination_object=BUCKET_FILE_LOCATION,
)

delete_files = GoogleCloudStorageDeleteOperator(
    task_id="delete_files", bucket_name=BUCKET_1, prefix=""
)

[create_bucket1, create_bucket2] >> list_buckets >> list_buckets_result
[create_bucket1, create_bucket2] >> upload_file
upload_file >> [download_file, copy_file]
upload_file >> gcs_bucket_create_acl_entry_task >> gcs_object_create_acl_entry_task >> delete_files
def export_to_parquet(
    table,
    destination_table=None,
    static_partitions=[],
    arguments=[],
    use_storage_api=False,
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_workers=2,
    num_preemptible_workers=0,
    gcs_output_bucket="moz-fx-data-derived-datasets-parquet",
):
    """
    Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table: [Required] BigQuery table name
    :param Optional[str] destination_table: Output table name, defaults to table,
        will have r'_v[0-9]+$' replaced with r'/v[0-9]+'
    :param List[str] static_partitions: Static partitions, as "key=value" strings,
        appended to the output prefix
    :param List[str] arguments: Additional pyspark arguments
    :param bool use_storage_api: Whether to read from the BigQuery Storage API or
        an AVRO export
    :param str dag_name: Name of DAG
    :param Optional[str] parent_dag_name: Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str dataproc_storage_bucket: Dataproc staging GCS bucket
    :param str dataproc_zone: GCP zone to launch dataproc clusters
    :param int num_workers: Number of Dataproc workers
    :param int num_preemptible_workers: Number of Dataproc preemptible workers
    :param str gcs_output_bucket: GCS bucket that receives the exported data

    :return: airflow.models.DAG
    """
    # remove the dataset prefix and partition suffix from table
    table_id = table.rsplit(".", 1)[-1]
    unqualified_table, _, partition_id = table_id.partition("$")
    # limit cluster name to 35 characters plus suffix of -export-YYYYMMDD (51 total)
    cluster_name = unqualified_table.replace("_", "-")
    if len(cluster_name) > 35:
        # preserve version when truncating cluster name to 35 characters
        prefix, version = re.match(r"(.*?)(-v[0-9]+)?$", cluster_name).groups("")
        cluster_name = prefix[:35 - len(version)] + version
    cluster_name += "-export-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    if destination_table is None:
        destination_table = unqualified_table
    # separate version using "/" instead of "_"
    export_prefix = re.sub(r"_(v[0-9]+)$", r"/\1", destination_table) + "/"
    if static_partitions:
        export_prefix += "/".join(static_partitions) + "/"
    avro_prefix = "avro/" + export_prefix
    if not static_partitions and partition_id:
        avro_prefix += "partition_id=" + partition_id + "/"
    avro_path = "gs://" + gcs_output_bucket + "/" + avro_prefix + "*.avro"

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            num_workers=num_workers,
            image_version="1.4",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                "gs://dataproc-initialization-actions/python/pip-install.sh",
            ],
            metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"},
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
            ],
            dataproc_pyspark_properties={
                "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
            },
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table]
            + [
                "--" + key + "=" + value
                for key, value in {
                    "avro-path": (not use_storage_api) and avro_path,
                    "destination": "gs://" + gcs_output_bucket,
                    "destination-table": destination_table,
                }.items()
                if value
            ]
            + (["--static-partitions"] if static_partitions else [])
            + static_partitions
            + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        if not use_storage_api:
            avro_export = BigQueryToCloudStorageOperator(
                task_id="avro_export",
                source_project_dataset_table=table,
                destination_cloud_storage_uris=avro_path,
                compression=None,
                export_format="AVRO",
                bigquery_conn_id=gcp_conn_id,
            )
            avro_delete = GoogleCloudStorageDeleteOperator(
                task_id="avro_delete",
                bucket_name=gcs_output_bucket,
                prefix=avro_prefix,
                google_cloud_storage_conn_id=gcp_conn_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
            )
            avro_export >> run_dataproc_pyspark >> avro_delete

        create_dataproc_cluster >> run_dataproc_pyspark >> delete_dataproc_cluster

        return dag
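# export_to_parquet() returns a fully-formed DAG whose dag_id is
# "<parent_dag_name>.<dag_name>", so it is designed to be attached to a parent DAG
# via SubDagOperator. Below is a minimal sketch of that wiring; the parent DAG,
# schedule, and table name are assumptions for illustration, not taken from the
# snippet above.
from datetime import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

default_args = {"owner": "airflow", "start_date": datetime(2020, 1, 1)}

with DAG("example_parent", default_args=default_args,
         schedule_interval="@daily") as parent_dag:

    export_my_table = SubDagOperator(
        task_id="export_my_table",
        # SubDagOperator expects the subdag's dag_id to be
        # "<parent dag_id>.<task_id>", which the factory builds from
        # parent_dag_name and dag_name.
        subdag=export_to_parquet(
            table="my_dataset.my_table_v1",  # hypothetical table
            parent_dag_name=parent_dag.dag_id,
            dag_name="export_my_table",
            default_args=default_args,
        ),
    )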
def export_to_amplitude(
        parent_dag_name,
        dag_name,
        default_args,
        project,
        dataset,
        table_or_view,
        s3_prefix,
        gcs_bucket='moz-fx-data-derived-datasets-amplitude-export',
        gcp_conn_id='google_cloud_derived_datasets',
        amplitude_s3_conn='amplitude_s3_conn',
        amplitude_s3_bucket='com-amplitude-vacuum-mozilla-vacuum-wup'):
    """Export a bigquery table or view to Amplitude.

    This uses the BigQueryToCloudStorage operator to export the partition to GCS,
    then pushes that data to S3. It operates on a temporary table that is dropped
    after the job is finished.

    :param str parent_dag_name: Parent dag name
    :param str dag_name: This dag's name (appended to parent_dag_name)
    :param str default_args: DAG configuration
    :param str project: BigQuery project containing the table to be exported
    :param str dataset: BigQuery dataset
    :param str table_or_view: Table or view name
    :param str gcs_bucket: The bucket the data will be exported to
    :param str gcp_conn_id: GCP connection ID
    :param str amplitude_s3_conn: S3 connection ID
    :param str amplitude_s3_bucket: The bucket to export data to
    :param str s3_prefix: The prefix for the s3 objects
    """
    environment = environ['DEPLOY_ENVIRONMENT']
    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        # For now, we assume the view is already updated
        # See https://github.com/mozilla/bigquery-etl/issues/218
        exec_date = '{{ ds }}'

        # Check that we have data for this date
        check_sql = ('SELECT COUNT(*) '
                     'FROM `{}.{}.{}` '
                     'WHERE DATE(submission_timestamp) = "{}"').format(
                         project, dataset, table_or_view, exec_date)

        wait_for_data = BigQuerySQLSensorOperator(
            task_id='wait_for_data',
            sql=check_sql,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False)

        # Create the table with yesterday's data
        project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id
        temp_table_name = table_or_view + '_{{ ds_nodash }}'
        fully_qualified_table_name = '{}.{}.{}'.format(
            project_id, dataset, temp_table_name)

        sql = ('SELECT * EXCEPT (submission_timestamp) '
               'FROM `{}.{}.{}` '
               'WHERE DATE(submission_timestamp) = "{}"').format(
                   project, dataset, table_or_view, exec_date)

        create_table = BigQueryOperator(
            task_id='create_temporary_table',
            sql=sql,
            destination_dataset_table=fully_qualified_table_name,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False)

        directory = '/'.join((environment, s3_prefix, '{{ ds_nodash }}'))
        extension = '.tsv.gz'

        # Export from bq to gcs
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28 # noqa: E501
        gcs_uri = 'gs://{}/{}/*{}'.format(gcs_bucket, directory, extension)
        table_extract = BigQueryToCloudStorageOperator(
            task_id='bq_to_gcs',
            source_project_dataset_table=fully_qualified_table_name,
            destination_cloud_storage_uris=[gcs_uri],
            bigquery_conn_id=gcp_conn_id,
            compression='GZIP',
            export_format='CSV',
            field_delimiter='\t',
            print_header=True)

        # Push the data to S3
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/gcs_to_s3.py#L29 # noqa: E501
        s3_push = GoogleCloudStorageToS3Operator(
            task_id='gcs_to_s3',
            bucket=gcs_bucket,
            prefix=directory,
            delimiter=extension,
            google_cloud_storage_conn_id=gcp_conn_id,
            dest_aws_conn_id=amplitude_s3_conn,
            dest_s3_key='s3://{}/'.format(amplitude_s3_bucket),
            replace=True)

        # Drop the temporary table
        table_drop = BigQueryOperator(
            task_id='drop_temp_table',
            sql='DROP TABLE `{}`'.format(fully_qualified_table_name),
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False)

        # Delete the GCS data
        data_delete = GoogleCloudStorageDeleteOperator(
            task_id='delete_gcs_data',
            bucket_name=gcs_bucket,
            prefix=directory,
            google_cloud_storage_conn_id=gcp_conn_id)

        wait_for_data >> create_table >> table_extract >> s3_push
        s3_push >> table_drop
        s3_push >> data_delete

        return dag
def subdag_currency_exchange_to_bigquery(parent_dag_name, child_dag_name,
                                         execution_date, flow_name,
                                         raw_data_filepath,
                                         destination_project_dataset_table,
                                         schema_fields, bigquery_table_path,
                                         final_bigquery_table, args):
    """
    Subdag which does the following:
        - Parse the raw CSV file to get the dimension currency data & write
          results to a CSV
        - Upload the CSV to GCS
        - Copy data from GCS to BQ
        - Delete file from GCS

    :param parent_dag_name: Main DAG name
    :param child_dag_name: Child DAG name
    :param execution_date: (str) - Airflow execution date
    :param flow_name: (str) - Type of flow to execute:
        - dimension_currency
        - exchange_rate_history
    :param raw_data_filepath: (str) - Raw CSV filepath on Local
    :param destination_project_dataset_table: (str) - BQ table name, dataset.table
    :param schema_fields: (list) - BQ table schema
    :param bigquery_table_path: (str) - BQ table query path
    :param final_bigquery_table: (str) - BQ table name
    :param args: Airflow arguments
    :return: None

    Note:
        Modified date: 10-04-2021
        Author: TB
    """
    dag = DAG(
        f"{parent_dag_name}.{child_dag_name}",
        default_args=args,
        schedule_interval="@daily",
    )

    # create filename
    filename = f"{flow_name}_{execution_date}.csv"

    # 1. extract data from raw csv file & upload to GCS
    clean_data_to_gcs = FlowToGoogleCloudStorage(
        task_id="clean_data_to_gcs",
        flow_name=flow_name,
        raw_data_filepath=raw_data_filepath,
        clean_filepath=f"downloads/{filename}",
        google_cloud_storage_conn_id="airflow_gcp_connection",
        gcs_bucket="airflow_poc",
        gcs_filepath=f"{flow_name}.csv",
        dag=dag)

    # 2. copy file from gcs to bigquery
    gcs_to_bq = GoogleCloudStorageToBigQueryOperator(
        task_id="gcs_to_bq",
        bucket="airflow_poc",
        source_objects=[f"{flow_name}.csv"],
        destination_project_dataset_table=destination_project_dataset_table,
        schema_fields=schema_fields,
        write_disposition="WRITE_TRUNCATE",
        google_cloud_storage_conn_id="airflow_gcp_connection",
        bigquery_conn_id="airflow_gcp_connection",
        dag=dag)

    # 3. delete file from GCS
    delete_gcs_file = GoogleCloudStorageDeleteOperator(
        task_id="delete_gcs_file",
        bucket_name="airflow_poc",
        objects=[f"{flow_name}.csv"],
        google_cloud_storage_conn_id="airflow_gcp_connection",
        dag=dag)

    clean_data_to_gcs >> gcs_to_bq >> delete_gcs_file

    if bigquery_table_path and final_bigquery_table:
        with open(bigquery_table_path, "r") as q:
            data_query = q.read()

        # create table in bigquery using SQL query
        create_bigquery_table = BigQueryOperator(
            task_id="create_bigquery_table",
            sql=data_query,
            destination_dataset_table=final_bigquery_table,
            write_disposition="WRITE_TRUNCATE",
            bigquery_conn_id="airflow_gcp_connection",
            use_legacy_sql=False,
            create_disposition='CREATE_IF_NEEDED',
            time_partitioning=None,
            cluster_fields=None,
            location="EU",
            dag=dag)

        clean_data_to_gcs >> gcs_to_bq >> delete_gcs_file
        gcs_to_bq >> create_bigquery_table

    return dag
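# The schema_fields argument consumed by subdag_currency_exchange_to_bigquery is the
# standard schema format accepted by GoogleCloudStorageToBigQueryOperator: a list of
# BigQuery field definitions. A hypothetical schema for the dimension_currency flow,
# purely for illustration and not taken from the snippet above:
dimension_currency_schema = [
    {"name": "currency_code", "type": "STRING", "mode": "REQUIRED"},
    {"name": "currency_name", "type": "STRING", "mode": "NULLABLE"},
    {"name": "exchange_rate_to_eur", "type": "FLOAT", "mode": "NULLABLE"},
    {"name": "load_date", "type": "DATE", "mode": "NULLABLE"},
]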
"bin/export-avro.sh", "moz-fx-data-shared-prod", "moz-fx-data-shared-prod:analysis", "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/mobile", "saved_session_v4", "'nightly', 'beta'", "{{ ds }}", ], docker_image="mozilla/python_mozaggregator:latest", dag=dag, ).set_downstream(mobile_aggregate_view_dataproc) GoogleCloudStorageDeleteOperator( task_id="delete_mobile_metrics_avro", bucket_name="moz-fx-data-derived-datasets-parquet-tmp", prefix="avro/mozaggregator/mobile/moz-fx-data-shared-prod/{{ ds_nodash }}/mobile_metrics_v1", google_cloud_storage_conn_id=gcp_conn.gcp_conn_id, dag=dag ).set_upstream(mobile_aggregate_view_dataproc) GoogleCloudStorageDeleteOperator( task_id="delete_saved_session_avro", bucket_name="moz-fx-data-derived-datasets-parquet-tmp", prefix="avro/mozaggregator/mobile/moz-fx-data-shared-prod/{{ ds_nodash }}/saved_session_v4", google_cloud_storage_conn_id=gcp_conn.gcp_conn_id, dag=dag ).set_upstream(mobile_aggregate_view_dataproc) register_status( mobile_aggregate_view_dataproc, "Mobile Aggregates",
task_id="glam_client_probe_counts_extract", destination_table="glam_client_probe_counts_extract_v1", dataset_id=dataset_id, project_id="moz-fx-data-shared-prod", owner="*****@*****.**", email=[ "*****@*****.**", "*****@*****.**", "*****@*****.**" ], date_partition_parameter=None, arguments=('--replace', ), dag=dag) glam_gcs_delete_old_extracts = GoogleCloudStorageDeleteOperator( task_id="glam_gcs_delete_old_extracts", bucket_name=glam_bucket, prefix="extract-", google_cloud_storage_conn_id=gcp_conn.gcp_conn_id, dag=dag) gcs_destination = "{}/extract-*.csv".format(glam_bucket) glam_extract_to_csv = BigQueryToCloudStorageOperator( task_id="glam_extract_to_csv", source_project_dataset_table="glam_client_probe_counts_extract_v1", destination_cloud_storage_uris=gcs_destination, export_format="CSV", print_header=False, dag=dag) wait_for_main_ping >> latest_versions latest_versions >> clients_daily_scalar_aggregates