    'retry_delay': timedelta(minutes=30),
}

# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('main_summary',
          default_args=default_args,
          schedule_interval='0 1 * * *',
          max_active_runs=10)

# We copy yesterday's main pings from telemetry_live to telemetry_stable
# at the root of this DAG because telemetry_stable.main_v4 will become
# the source for main_summary, etc. once we are comfortable retiring parquet
# data imports.
copy_deduplicate_main_ping = bigquery_etl_copy_deduplicate(
    task_id="copy_deduplicate_main_ping",
    target_project_id="moz-fx-data-shared-prod",
    only_tables=["telemetry_live.main_v4"],
    parallelism=24,
    slices=100,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    dag=dag)

bq_main_events = bigquery_etl_query(
    task_id="bq_main_events",
    project_id="moz-fx-data-shared-prod",
    destination_table="main_events_v1",
    dataset_id="telemetry_derived",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    dag=dag,
    arguments=('--schema_update_option=ALLOW_FIELD_ADDITION',),
)
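
# Hedged sketch, not part of the original file: main_events_v1 is derived
# from main ping data, so bq_main_events would typically be wired to run
# only after the copy step completes, using Airflow's bitshift dependency
# operator.
copy_deduplicate_main_ping >> bq_main_events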
"retries": 1, "retry_delay": datetime.timedelta(minutes=5), } dag_name = "copy_deduplicate" with models.DAG(dag_name, schedule_interval="0 1 * * *", default_args=default_args) as dag: # This single task is responsible for sequentially running copy queries # over all the tables in _live datasets into _stable datasets except those # that are specifically used in another DAG. copy_deduplicate_all = bigquery_etl_copy_deduplicate( task_id="copy_deduplicate_all", target_project_id="moz-fx-data-shared-prod", # Any table listed here under except_tables _must_ have a corresponding # copy_deduplicate job in another DAG. except_tables=["telemetry_live.main_v4"]) # Events. event_events = bigquery_etl_query( task_id="event_events", project_id="moz-fx-data-shared-prod", destination_table="event_events_v1", dataset_id="telemetry_derived", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], arguments=('--schema_update_option=ALLOW_FIELD_ADDITION', ), )
}

dag_name = "copy_deduplicate"

with models.DAG(
        dag_name, schedule_interval="0 1 * * *", default_args=default_args) as dag:

    # This single task is responsible for sequentially running copy queries
    # over all the tables in _live datasets into _stable datasets except those
    # that are specifically used in another DAG.
    copy_deduplicate_all = bigquery_etl_copy_deduplicate(
        task_id="copy_deduplicate_all",
        target_project_id="moz-fx-data-shared-prod",
        priority_weight=100,
        # Any table listed here under except_tables _must_ have a corresponding
        # copy_deduplicate job in another DAG.
        except_tables=["telemetry_live.main_v4"])

    copy_deduplicate_main_ping = bigquery_etl_copy_deduplicate(
        task_id="copy_deduplicate_main_ping",
        target_project_id="moz-fx-data-shared-prod",
        only_tables=["telemetry_live.main_v4"],
        parallelism=24,
        slices=100,
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
        priority_weight=100,
        dag=dag)
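
# The three fragments above each begin partway through a `default_args` dict
# and rely on elided imports. A minimal, hypothetical reconstruction of the
# preamble for the copy_deduplicate fragments follows; values are placeholder
# assumptions, and the import path assumes the telemetry-airflow layout where
# these ETL helpers live in utils.gcp. (The first fragment instead uses
# `from airflow import DAG` and `from datetime import timedelta`.)
import datetime

from airflow import models
from utils.gcp import bigquery_etl_copy_deduplicate, bigquery_etl_query

default_args = {
    "owner": "owner@example.com",  # placeholder; real owners are masked above
    "email": ["owner@example.com"],  # placeholder
    "depends_on_past": False,
    "start_date": datetime.datetime(2019, 7, 25),  # assumed start date
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 1,
    "retry_delay": datetime.timedelta(minutes=5),
}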