def main_summary_subdag_factory(parent_dag, task_id, day):
    """Build a subdag that backfills one day of main_summary on a shared EMR cluster.

    The subdag waits `day` seconds (jitter), submits one main_summary step to
    the cluster started by the parent dag's 'setup_backfill_cluster' task, and
    then blocks until that step completes.

    :param parent_dag: DAG this subdag attaches to; its dag_id prefixes ours.
    :param task_id: task id of the SubDagOperator hosting this subdag.
    :param day: day offset from the execution date; also reused as a
        per-day jitter delay in seconds to alleviate AWS throttling.
    :return: the configured subdag.
    """
    # Rendered by Jinja at runtime: the execution date shifted by `day` days,
    # reformatted to YYYYMMDD. Doubled braces survive .format() as literal
    # Jinja delimiters.
    ds = "{{{{ macros.ds_format(macros.ds_add(ds, {0}), '%Y-%m-%d', '%Y%m%d') }}}}".format(day)

    subdag = DAG(
        "{}.{}".format(parent_dag.dag_id, task_id),
        schedule_interval=SCHEDULE_INTERVAL,
        start_date=START_DATE,
        default_args=default_args,
    )

    # BUG FIX: dag_id must be quoted inside the Jinja expression; unquoted, the
    # rendered template treats the dag id as an undefined Jinja variable and the
    # xcom_pull silently fails instead of returning the cluster's job flow id.
    parent_job_flow_id = (
        "{{{{ task_instance.xcom_pull('setup_backfill_cluster', "
        "key='return_value', dag_id='{}') }}}}".format(parent_dag.dag_id)
    )

    # Try to alleviate throttling issues by introducing some slight jitter on
    # each of the days.
    timedelta_task = TimeDeltaSensor(
        task_id="day_start_jitter",
        delta=timedelta(seconds=day),
        dag=subdag,
    )

    # Submit one day's main_summary step to the shared backfill cluster.
    add_step_task = EmrAddStepsOperator(
        task_id='submit_main_summary_day',
        job_flow_id=parent_job_flow_id,
        execution_timeout=timedelta(minutes=10),
        aws_conn_id='aws_default',
        steps=EmrAddStepsOperator.get_step_args(
            job_name="main_summary {}".format(ds),
            owner="*****@*****.**",
            action_on_failure='CONTINUE',
            uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
            env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", {
                "from": ds,
                "to": ds,
                "bucket": "telemetry-backfill"
            }, {
                "DO_ASSEMBLY": "False"
            }),
        ),
        dag=subdag,
    )

    # Poll the submitted step every 5 minutes, giving up after 10 hours.
    step_sensor_task = EmrStepSensor(
        task_id="main_summary_step_sensor",
        timeout=timedelta(hours=10).total_seconds(),
        job_flow_id=parent_job_flow_id,
        step_id="{{ task_instance.xcom_pull('submit_main_summary_day', key='return_value') }}",
        poke_interval=timedelta(minutes=5).total_seconds(),
        dag=subdag,
    )

    # Ordering: jitter -> submit step -> wait for step completion.
    step_sensor_task.set_upstream(add_step_task)
    add_step_task.set_upstream(timedelta_task)

    return subdag
} }] dag = DAG('dag_name', default_args=DEFAULT_ARGS, dagrun_timeout=timedelta(hours=2), schedule_interval='0 3 * * *') parse_request = PythonOperator(task_id='parse_request', provide_context=True, python_callable=retrieve_s3_file, dag=dag) # Step instructions for the EMR for data processing step_adder = EmrAddStepsOperator(task_id='add_steps', job_flow_id=CLUSTER_ID, aws_conn_id='aws_default', steps=SPARK_TEST_STEPS, dag=dag) step_checker = EmrStepSensor( task_id='watch_step', job_flow_id=CLUSTER_ID, step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}", aws_conn_id='aws_default', dag=dag) # Workflow order for the Celery workers step_adder.set_upstream(parse_request) step_checker.set_upstream(step_adder)