dag_name="taar_lite", default_args=default_args, cluster_name=taarlite_cluster_name, job_name="TAAR_Lite_GUID_GUID", python_driver_code= "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_lite_guidguid.py", # python_driver_code="gs://temp-hwoo-removemelater/taar_lite_guidguid.py", num_workers=8, py_args=[ "--date", "{{ ds_nodash }}", "--aws_access_key_id", aws_access_key, "--aws_secret_access_key", aws_secret_key, ], aws_conn_id=aws_conn_id, gcp_conn_id=gcpdataproc_conn_id, ), dag=dag, ) # Set a dependency on amodump from amowhitelist amowhitelist.set_upstream(amodump) # Set a dependency on amodump for the editorial reviewed whitelist of # addons editorial_whitelist.set_upstream(amodump) # Set a dependency on amowhitelist from taar_lite taar_lite.set_upstream(amowhitelist)
# Tail of the previous operator's argument list (call opens before this
# chunk): AWS credentials injected into the pod environment.
],
env_vars={
    "AWS_ACCESS_KEY_ID": aws_access_key,
    "AWS_SECRET_ACCESS_KEY": aws_secret_key
},
dag=dag)

# Run mozilla-schema-generator in a GKE pod: reads probe definitions and
# pushes generated schemas from the master branch of
# mozilla-pipeline-schemas to its generated-schemas branch.
schema_generator = GKEPodOperator(
    email=['*****@*****.**'],
    task_id='mozilla_schema_generator',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    location=gke_location,
    cluster_name=gke_cluster_name,
    name='schema-generator-1',
    namespace='default',
    image='mozilla/mozilla-schema-generator:latest',
    # Delete the pod when done; always pull so the :latest tag stays fresh.
    is_delete_operator_pod=True,
    image_pull_policy='Always',
    env_vars={
        # Base64-encoded deploy key used to push to the schemas repo.
        "MPS_SSH_KEY_BASE64": "{{ var.value.mozilla_pipeline_schemas_secret_git_sshkey_b64 }}",
        "MPS_REPO_URL": "[email protected]:mozilla-services/mozilla-pipeline-schemas.git",
        "MPS_BRANCH_SOURCE": "master",
        "MPS_BRANCH_PUBLISH": "generated-schemas",
    },
    dag=dag)

# Schemas can only be regenerated after the probe scrape has completed.
schema_generator.set_upstream(probe_scraper)
# Closes an operator call that opens before this chunk.
dag=dag)

# Run mozilla-schema-generator in a GKE pod; this variant relies on the
# GKEPodOperator defaults for cluster/connection settings.
schema_generator = GKEPodOperator(
    email=['*****@*****.**'],
    task_id='mozilla_schema_generator',
    name='schema-generator-1',
    image='mozilla/mozilla-schema-generator:latest',
    env_vars={
        # Base64-encoded deploy key used to push to the schemas repo.
        "MPS_SSH_KEY_BASE64": "{{ var.value.mozilla_pipeline_schemas_secret_git_sshkey_b64 }}",
        "MPS_REPO_URL": "[email protected]:mozilla-services/mozilla-pipeline-schemas.git",
        "MPS_BRANCH_SOURCE": "master",
        "MPS_BRANCH_PUBLISH": "generated-schemas",
    },
    dag=dag)

# Schemas can only be regenerated after the probe scrape has completed.
schema_generator.set_upstream(probe_scraper)

# Run probe_scraper's expiry-alert module for the execution date using the
# Bugzilla bot API key from an Airflow Variable. The operator call is not
# closed in this chunk; it continues past this view.
probe_expiry_alerts = GKEPodOperator(
    task_id="probe-expiry-alerts",
    name="probe-expiry-alerts",
    image=probe_scraper_image,
    arguments=[
        "python3", "-m", "probe_scraper.probe_expiry_alert",
        "--date", "{{ ds }}",
        "--bugzilla-api-key", "{{ var.value.bugzilla_probe_expiry_bot_api_key }}"
    ],
    email=["*****@*****.**"],
    env_vars={
        "AWS_ACCESS_KEY_ID": aws_access_key,
        "AWS_SECRET_ACCESS_KEY": aws_secret_key
    },
# Closes an operator call that opens before this chunk.
dag=dag,
)

# Block until copy_deduplicate has produced the main-event and event-ping
# tables. Sensors use reschedule mode plus a dedicated pool so they do not
# occupy worker slots while waiting.
wait_for_bq_events = ExternalTaskSensor(
    task_id="wait_for_bq_events",
    external_dag_id="copy_deduplicate",
    external_task_id="bq_main_events",
    # The upstream DAG's schedule is offset 3 hours from this one.
    execution_delta=timedelta(hours=3),
    mode="reschedule",
    pool="DATA_ENG_EXTERNALTASKSENSOR",
    dag=dag,
)

wait_for_copy_deduplicate_events = ExternalTaskSensor(
    task_id="wait_for_copy_deduplicate_events",
    external_dag_id="copy_deduplicate",
    external_task_id="event_events",
    execution_delta=timedelta(hours=3),
    mode="reschedule",
    pool="DATA_ENG_EXTERNALTASKSENSOR",
    dag=dag,
)

# jetstream needs all of its input datasets present before running.
jetstream.set_upstream([
    wait_for_clients_daily_export,
    wait_for_main_summary_export,
    wait_for_search_clients_daily,
    wait_for_bq_events,
    wait_for_copy_deduplicate_events,
])
"*****@*****.**", "*****@*****.**", ], arguments=["{{ds}}"] + ["--spreadsheet-id=" + Variable.get('anomdtct_spreadsheet_id')] + ["--spreadsheet-key=" + Variable.get('anomdtct_spreadsheet_api_key')], dag=dag, ) wait_for_clients_first_seen = ExternalTaskSensor( task_id="wait_for_clients_first_seen", external_dag_id="main_summary", external_task_id="clients_first_seen", dag=dag, ) anomdtct.set_upstream([ wait_for_clients_first_seen, ]) deviations = bigquery_etl_query( task_id="deviations", project_id="moz-fx-data-shared-prod", destination_table="deviations_v1", dataset_id="telemetry_derived", arguments=("--replace", ), dag=dag, ) deviations.set_upstream(anomdtct)
# Tail of the export task's argument list (call opens before this chunk).
image=docker_image,
dag=dag,
)

# Wait for the ssl_ratios ETL in the bqetl_ssl_ratios DAG before exporting
# its public JSON; that DAG's schedule is offset 7200s (2h) from this one.
# No dag= kwarg — presumably created inside a `with dag:` block; confirm.
wait_for_telemetry_derived__ssl_ratios__v1 = ExternalTaskSensor(
    task_id="wait_for_telemetry_derived__ssl_ratios__v1",
    external_dag_id="bqetl_ssl_ratios",
    external_task_id="telemetry_derived__ssl_ratios__v1",
    execution_delta=datetime.timedelta(seconds=7200),
    check_existence=True,
    mode="reschedule",
    pool="DATA_ENG_EXTERNALTASKSENSOR",
)

export_public_data_json_telemetry_derived__ssl_ratios__v1.set_upstream(
    wait_for_telemetry_derived__ssl_ratios__v1
)

# Publish GCS metadata for the public datasets once the export has run.
public_data_gcs_metadata = gke_command(
    task_id="public_data_gcs_metadata",
    command=["script/publish_public_data_gcs_metadata"],
    docker_image=docker_image,
    dag=dag,
)

public_data_gcs_metadata.set_upstream(
    [
        export_public_data_json_telemetry_derived__ssl_ratios__v1,
    ]
)
["--project_id=moz-fx-data-shared-prod"] + ["--parameter=submission_date:DATE:{{ds}}"], image=docker_image, dag=dag, ) wait_for_mozregression_aggregates__v1 = ExternalTaskSensor( task_id="wait_for_mozregression_aggregates__v1", external_dag_id="bqetl_internal_tooling", external_task_id="mozregression_aggregates__v1", check_existence=True, mode="reschedule", pool="DATA_ENG_EXTERNALTASKSENSOR", ) export_public_data_json_mozregression_aggregates__v1.set_upstream( wait_for_mozregression_aggregates__v1) wait_for_telemetry_derived__ssl_ratios__v1 = ExternalTaskSensor( task_id="wait_for_telemetry_derived__ssl_ratios__v1", external_dag_id="bqetl_ssl_ratios", external_task_id="telemetry_derived__ssl_ratios__v1", execution_delta=datetime.timedelta(seconds=7200), check_existence=True, mode="reschedule", pool="DATA_ENG_EXTERNALTASKSENSOR", ) export_public_data_json_telemetry_derived__ssl_ratios__v1.set_upstream( wait_for_telemetry_derived__ssl_ratios__v1) public_data_gcs_metadata = gke_command(
# Both copy_deduplicate sensors share everything except the task ids, so
# build them from one set of common keyword arguments. Reschedule mode and
# the dedicated pool keep waiting sensors from occupying worker slots; the
# upstream DAG's schedule is offset 3 hours from this one.
_copy_dedup_sensor_kwargs = dict(
    external_dag_id="copy_deduplicate",
    execution_delta=timedelta(hours=3),
    mode="reschedule",
    pool="DATA_ENG_EXTERNALTASKSENSOR",
    email_on_retry=False,
    dag=dag,
)

# Block until copy_deduplicate has written the main-event table.
wait_for_bq_events = ExternalTaskSensor(
    task_id="wait_for_bq_events",
    external_task_id="bq_main_events",
    **_copy_dedup_sensor_kwargs,
)

# Block until copy_deduplicate has written the event-ping table.
wait_for_copy_deduplicate_events = ExternalTaskSensor(
    task_id="wait_for_copy_deduplicate_events",
    external_task_id="event_events",
    **_copy_dedup_sensor_kwargs,
)

# jetstream_run needs every input dataset in place; the config-change
# check only runs after the main analysis has finished.
_jetstream_run_inputs = [
    wait_for_clients_daily_export,
    wait_for_main_summary_export,
    wait_for_search_clients_daily,
    wait_for_bq_events,
    wait_for_copy_deduplicate_events,
]
jetstream_run.set_upstream(_jetstream_run_inputs)
jetstream_config_changed.set_upstream(jetstream_run)