def main_summary_subdag_factory(parent_dag, task_id, day):
    """Return a subdag that backfills one day of main_summary on the parent's EMR cluster."""
    ds = "{{{{ macros.ds_format(macros.ds_add(ds, {0}), '%Y-%m-%d', '%Y%m%d') }}}}".format(day)

    subdag = DAG("{}.{}".format(parent_dag.dag_id, task_id),
                 schedule_interval=SCHEDULE_INTERVAL,
                 start_date=START_DATE,
                 default_args=default_args)

    parent_job_flow_id = ("{{{{ task_instance.xcom_pull('setup_backfill_cluster', "
                          "key='return_value', dag_id='{}') }}}}".format(parent_dag.dag_id))

    # Try to alleviate throttling issues by introducing some slight jitter on each of the days
    timedelta_task = TimeDeltaSensor(
        task_id="day_start_jitter",
        delta=timedelta(seconds=day),
        dag=subdag
    )

    add_step_task = EmrAddStepsOperator(
        task_id='submit_main_summary_day',
        job_flow_id=parent_job_flow_id,
        execution_timeout=timedelta(minutes=10),
        aws_conn_id='aws_default',
        steps=EmrAddStepsOperator.get_step_args(
            job_name="main_summary {}".format(ds),
            owner="*****@*****.**",
            action_on_failure='CONTINUE',
            uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
            env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", {
                "from": ds,
                "to": ds,
                "bucket": "telemetry-backfill"
            }, {
                "DO_ASSEMBLY": "False"
            }),
        ),
        dag=subdag
    )

    step_sensor_task = EmrStepSensor(
        task_id="main_summary_step_sensor",
        timeout=timedelta(hours=10).total_seconds(),
        job_flow_id=parent_job_flow_id,
        step_id="{{ task_instance.xcom_pull('submit_main_summary_day', key='return_value') }}",
        poke_interval=timedelta(minutes=5).total_seconds(),
        dag=subdag
    )

    step_sensor_task.set_upstream(add_step_task)
    add_step_task.set_upstream(timedelta_task)

    return subdag
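# Illustrative wiring, not part of the original file: one possible way to attach the
# factory above to the parent backfill DAG, with one SubDagOperator per day of backfill.
# The parent `dag` object and the seven-day range are assumptions made for this sketch.
from airflow.operators.subdag_operator import SubDagOperator

for day_offset in range(7):  # hypothetical: backfill one week, one subdag per day
    subdag_task_id = "main_summary_day_{}".format(day_offset)
    SubDagOperator(
        task_id=subdag_task_id,
        subdag=main_summary_subdag_factory(dag, subdag_task_id, day_offset),
        dag=dag,
    )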
task_id="main_summary_all_histograms", job_name="Main Summary View - All Histograms", execution_timeout=timedelta(hours=12), instance_count=5, max_instance_count=50, enable_autoscale=True, instance_type="c4.4xlarge", spot_bid_price_percent=50, ebs_volume_count=1, ebs_volume_size=250, env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", options={ "from": "{{ ds_nodash }}", "to": "{{ ds_nodash }}", "bucket": "telemetry-backfill", "all_histograms": "", "read-mode": "aligned", "input-partition-multiplier": "400", }, dev_options={ "channel": "nightly", }), dag=dag) main_summary = MozDatabricksSubmitRunOperator( task_id="main_summary", job_name="Main Summary View", execution_timeout=timedelta(hours=4), email=[ "*****@*****.**", "*****@*****.**", "*****@*****.**" ],
"date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.public_output_bucket }}" }, uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/hardware_report.sh", output_visibility="public", dag=dag) cross_sectional = EMRSparkOperator( task_id="cross_sectional", job_name="Cross Sectional View", execution_timeout=timedelta(hours=10), instance_count=30, env=tbv_envvar( "com.mozilla.telemetry.views.CrossSectionalView", { "outName": "v" + DS_WEEKLY, "outputBucket": "{{ task.__class__.private_output_bucket }}" }), uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py", dag=dag) distribution_viewer = EMRSparkOperator( task_id="distribution_viewer", job_name="Distribution Viewer", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], execution_timeout=timedelta(hours=10), instance_count=5, env={"date": DS_WEEKLY}, uri=
}

# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('main_summary', default_args=default_args, schedule_interval='0 1 * * *')

main_summary = EMRSparkOperator(
    task_id="main_summary",
    job_name="Main Summary View",
    execution_timeout=timedelta(hours=14),
    instance_count=40,
    env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

experiments_error_aggregates = EMRSparkOperator(
    task_id="experiments_error_aggregates",
    job_name="Experiments Error Aggregates View",
    execution_timeout=timedelta(hours=5),
    instance_count=20,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env={
        "date": "{{ ds_nodash }}",
    execution_timeout=timedelta(hours=12),
    instance_count=5,
    max_instance_count=50,
    enable_autoscale=True,
    instance_type="c4.4xlarge",
    spot_bid_price_percent=50,
    ebs_volume_count=1,
    ebs_volume_size=250,
    env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView",
        options={
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "schema-report-location": "s3://{{ task.__class__.private_output_bucket }}/schema/main_summary/submission_date_s3={{ ds_nodash }}",
            "bucket": "telemetry-backfill",
            "all_histograms": "",
            "read-mode": "aligned",
            "input-partition-multiplier": "400",
        },
        dev_options={
            "channel": "nightly",
        }),
    dag=dag)

main_summary = EMRSparkOperator(
    task_id="main_summary",
    job_name="Main Summary View",
    execution_timeout=timedelta(hours=14),
    instance_count=40,
    env=tbv_envvar(
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator
from utils.constants import DS_WEEKLY
from utils.tbv import tbv_envvar

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2017, 5, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('focus_event_longitudinal', default_args=default_args, schedule_interval='@weekly')

focus_event_longitudinal = EMRSparkOperator(
    task_id="focus_event_longitudinal",
    job_name="Focus Event Longitudinal View",
    execution_timeout=timedelta(hours=12),
    instance_count=10,
    env=tbv_envvar("com.mozilla.telemetry.views.GenericLongitudinalView", {
        "to": DS_WEEKLY,
        "tablename": "telemetry_focus_event_parquet",
        "output-path": "{{ task.__class__.private_output_bucket }}/focus_event_longitudinal",
        "num-parquet-files": "30",
        "ordering-columns": "seq,created"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)
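# The tbv_envvar helper imported above builds the environment handed to
# telemetry_batch_view.py. The function below is a simplified, hypothetical stand-in
# (not the real utils.tbv implementation) that sketches the general idea: the target
# class and each option are packed into environment variables read by the job script.
# The TBV_* variable names are assumptions made for this illustration.
def _sketch_tbv_envvar(klass, options, other=None):
    env = {"TBV_CLASS": klass}  # assumed variable name, for illustration only
    for key, value in options.items():
        env["TBV_" + key.replace("-", "_")] = value
    env.update(other or {})
    return env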
    timedelta(minutes=30),
}

dag = DAG('longitudinal', default_args=default_args, schedule_interval='@weekly')

longitudinal = MozDatabricksSubmitRunOperator(
    task_id="longitudinal",
    job_name="Longitudinal View",
    execution_timeout=timedelta(hours=12),
    instance_count=16,
    instance_type="i3.8xlarge",
    env=tbv_envvar("com.mozilla.telemetry.views.LongitudinalView", {
        "bucket": "{{ task.__class__.private_output_bucket }}",
        "to": DS_WEEKLY
    }, metastore_location="s3://telemetry-parquet/longitudinal"),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

register_status(longitudinal, "Longitudinal", "A 6-month longitudinal view of client history.")

addon_recommender = EMRSparkOperator(
    task_id="addon_recommender",
    job_name="Train the Addon Recommender",
    execution_timeout=timedelta(hours=10),
    instance_count=20,
    owner="*****@*****.**",
devtools_release_events_to_amplitude = EMRSparkOperator(
    task_id="devtools_release_events_to_amplitude",
    job_name="DevTools Release Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=DEVTOOLS_INSTANCES,
    dev_instance_count=DEVTOOLS_INSTANCES,
    email=['*****@*****.**', '*****@*****.**'],
    owner='*****@*****.**',
    env=tbv_envvar("com.mozilla.telemetry.streaming.EventsToAmplitude", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "max_parallel_requests": str(DEVTOOLS_INSTANCES * VCPUS_PER_INSTANCE),
        "config_file_path": "devtools_release_schemas.json",
        "url": "https://api.amplitude.com/httpapi",
        "sample": "0.5",
        "partition_multiplier": "5"
    },
    artifact_url=get_artifact_url(slug),
    other={
        "KEY_BUCKET": "telemetry-airflow",
        "KEY_PATH": key_path("devtools"),
        "DO_EVENTS_TO_AMPLITUDE_SETUP": "True"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    start_date=datetime(2018, 12, 4),
    dag=dag)
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('main_summary', default_args=default_args, schedule_interval='0 1 * * *')

main_summary = EMRSparkOperator(
    task_id="main_summary",
    job_name="Main Summary View",
    execution_timeout=timedelta(hours=14),
    instance_count=40,
    env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

experiments_error_aggregates = EMRSparkOperator(
    task_id="experiments_error_aggregates",
    job_name="Experiments Error Aggregates View",
    execution_timeout=timedelta(hours=5),
    instance_count=20,
    release_label="emr-5.13.0",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/experiments_error_aggregates_view.sh",
    dag=dag)
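# Not shown in the excerpt above: a downstream task like experiments_error_aggregates
# would normally declare a dependency on main_summary so it only runs once the day's
# main_summary partition exists. A hedged sketch of that wiring (assumed, not taken
# from the original file):
experiments_error_aggregates.set_upstream(main_summary)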
    'email': ['*****@*****.**', '*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('sync_view', default_args=default_args, schedule_interval='@daily')

sync_view = EMRSparkOperator(
    task_id="sync_view",
    job_name="Sync Pings View",
    execution_timeout=timedelta(hours=10),
    instance_count=5,
    env=tbv_envvar("com.mozilla.telemetry.views.SyncView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

sync_view_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="sync_view_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="sync_summary",
        dataset_version="v2",
        gke_cluster_name="bq-load-gke-1",
    ),
    job_flow_overrides=EmrCreateJobFlowSelectiveTemplateOperator.get_jobflow_args(
        owner="*****@*****.**",
        instance_count=20,
        keep_alive=True,
        job_name="Main Summary Backfill"),
    templated_job_flow_overrides={
        "Name": "Main Summary Backfill {{ ds }}",
        "Steps": EmrCreateJobFlowSelectiveTemplateOperator.get_step_args(
            job_name="compile_main_summary",
            owner="*****@*****.**",
            uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
            env=tbv_envvar(None, options={}, branch="backfill", other={"DO_SUBMIT": "False"})),
    },
    dag=dag)

cluster_start_sensor_task = MozEmrClusterStartSensor(
    task_id="wait_for_cluster",
    timeout=timedelta(hours=1).total_seconds(),
    job_flow_id=job_flow_id_template,
    dag=dag)

terminate_job_flow_task = EmrTerminateJobFlowOperator(
    task_id="terminate_backfill_cluster",
    aws_conn_id='aws_default',
    execution_timeout=timedelta(minutes=10),
    job_flow_id=job_flow_id_template,
    'retry_delay': timedelta(minutes=30),
}

# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('first_shutdown_summary', default_args=default_args, schedule_interval='0 1 * * *')

first_shutdown_summary = EMRSparkOperator(
    task_id="first_shutdown_summary",
    job_name="First Shutdown Summary View",
    execution_timeout=timedelta(hours=1),
    instance_count=1,
    env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
        "doc-type": "first_shutdown",
        "read-mode": "aligned",
        "input-partition-multiplier": "4"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

first_shutdown_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="first_shutdown_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="first_shutdown_summary",
        dataset_version="v4",
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 6, 27),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('event_ping_events', default_args=default_args, schedule_interval='0 1 * * *')

event_ping_events = EMRSparkOperator(
    task_id="event_ping_events",
    job_name="Event Ping Events Dataset",
    execution_timeout=timedelta(hours=8),
    instance_count=5,
    env=tbv_envvar("com.mozilla.telemetry.streaming.EventPingEvents", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "outputPath": "s3://{{ task.__class__.private_output_bucket }}/"
    }, artifact_url=url),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)
slug = "{{ task.__class__.telemetry_streaming_slug }}" url = get_artifact_url(slug) default_args = { 'owner': '*****@*****.**', 'depends_on_past': False, 'start_date': datetime(2018, 11, 26), 'email': ['*****@*****.**', '*****@*****.**'], 'email_on_failure': True, 'email_on_retry': True, 'retries': 2, 'retry_delay': timedelta(minutes=30), } dag = DAG('event_ping_events', default_args=default_args, schedule_interval='0 1 * * *') event_ping_events = EMRSparkOperator( task_id="event_ping_events", job_name="Event Ping Events Dataset", execution_timeout=timedelta(hours=8), instance_count=5, env=tbv_envvar("com.mozilla.telemetry.streaming.EventPingEvents", { "from": "{{ ds_nodash }}", "to": "{{ ds_nodash }}", "outputPath": "s3://{{ task.__class__.private_output_bucket }}/" }, artifact_url=url), uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py", dag=dag)
# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('main_summary', default_args=default_args, schedule_interval='0 1 * * *')

main_summary = EMRSparkOperator(
    task_id="main_summary",
    job_name="Main Summary View",
    execution_timeout=timedelta(hours=14),
    instance_count=40,
    env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "schema-report-location": "s3://{{ task.__class__.private_output_bucket }}/schema/main_summary/submission_date_s3={{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

main_summary_schema = EmailSchemaChangeOperator(
    task_id="main_summary_schema",
    email=["*****@*****.**", "*****@*****.**"],
    to=["*****@*****.**"],
    key_prefix='schema/main_summary/submission_date_s3=',
    dag=dag)

experiments_error_aggregates = EMRSparkOperator(
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator
from utils.constants import DS_WEEKLY
from utils.tbv import tbv_envvar

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 10, 21),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('quantum_release_criteria_view', default_args=default_args,
          schedule_interval='@weekly')

quantum_release_criteria_view = EMRSparkOperator(
    task_id="quantum_release_criteria_view",
    job_name="Quantum Release Criteria View",
    execution_timeout=timedelta(hours=2),
    instance_count=10,
    env=tbv_envvar("com.mozilla.telemetry.views.QuantumRCView", {
        "to": DS_WEEKLY,
        "bucket": "{{ task.__class__.private_output_bucket }}"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)
task_id="main_summary_all_histograms", job_name="Main Summary View - All Histograms", execution_timeout=timedelta(hours=12), instance_count=5, max_instance_count=50, enable_autoscale=True, instance_type="c4.4xlarge", spot_bid_price_percent=50, ebs_volume_count=1, ebs_volume_size=250, env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", options={ "from": "{{ ds_nodash }}", "to": "{{ ds_nodash }}", "bucket": "telemetry-backfill", "all_histograms": "", "read-mode": "aligned", "input-partition-multiplier": "400", }, dev_options={ "channel": "nightly", }), dag=dag) main_summary = MozDatabricksSubmitRunOperator( task_id="main_summary", job_name="Main Summary View", execution_timeout=timedelta(hours=4), email=["*****@*****.**", "*****@*****.**", "*****@*****.**"], instance_count=5, max_instance_count=40, enable_autoscale=True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('sync_view', default_args=default_args, schedule_interval='@daily')

sync_view = EMRSparkOperator(
    task_id="sync_view",
    job_name="Sync Pings View",
    execution_timeout=timedelta(hours=10),
    instance_count=5,
    env=tbv_envvar("com.mozilla.telemetry.views.SyncView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

sync_view_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="sync_view_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="sync_summary",
        dataset_version="v2",
        gke_cluster_name="bq-load-gke-1",
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('first_shutdown_summary', default_args=default_args, schedule_interval='0 1 * * *')

first_shutdown_summary = EMRSparkOperator(
    task_id="first_shutdown_summary",
    job_name="First Shutdown Summary View",
    execution_timeout=timedelta(hours=1),
    instance_count=1,
    env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
        "doc-type": "first_shutdown",
        "read-mode": "aligned",
        "input-partition-multiplier": "4"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('longitudinal', default_args=default_args, schedule_interval='@weekly')

longitudinal = MozDatabricksSubmitRunOperator(
    task_id="longitudinal",
    job_name="Longitudinal View",
    execution_timeout=timedelta(hours=12),
    instance_count=16,
    instance_type="i3.8xlarge",
    env=tbv_envvar("com.mozilla.telemetry.views.LongitudinalView", {
        "bucket": "{{ task.__class__.private_output_bucket }}",
        "to": DS_WEEKLY
    }, metastore_location="s3://telemetry-parquet/longitudinal"),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

register_status(longitudinal, "Longitudinal", "A 6-month longitudinal view of client history.")

game_hw_survey = EMRSparkOperator(
    task_id="game_hw_survey",
    job_name="Firefox Hardware Report",
    execution_timeout=timedelta(hours=5),
    instance_count=15,
    owner="*****@*****.**",
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 20),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('crash_summary', default_args=default_args, schedule_interval='@daily')

crash_summary_view = EMRSparkOperator(
    task_id="crash_summary_view",
    job_name="Crash Summary View",
    instance_count=20,
    execution_timeout=timedelta(hours=4),
    env=tbv_envvar("com.mozilla.telemetry.views.CrashSummaryView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "outputBucket": "{{ task.__class__.private_output_bucket }}"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2016, 9, 20),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('crash_aggregates_backfill', default_args=default_args, schedule_interval='@daily')

crash_aggregates_view_backfill = EMRSparkOperator(
    task_id="crash_aggregates_view_backfill",
    job_name="Crash Aggregates View Backfill",
    instance_count=20,
    execution_timeout=timedelta(hours=4),
    env=tbv_envvar("com.mozilla.telemetry.views.CrashAggregateView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "telemetry-test-bucket"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)
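# Usage note (not from the original file): a backfill DAG like this is typically driven
# over a historical date range from the Airflow 1.x CLI rather than left to the daily
# scheduler. The end date below is illustrative:
#
#   airflow backfill crash_aggregates_backfill -s 2016-09-20 -e 2016-09-27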