def topline_dag(dag, mode, instance_count):
    """Attach the topline summary and dashboard tasks to *dag*.

    Two EMR Spark tasks are created: the summary view job (sized by
    *instance_count*) and a single-node dashboard job that runs only
    after the summary has completed.
    """
    summary_env = {
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
        "mode": mode,
    }
    topline_summary = EMRSparkOperator(
        dag=dag,
        task_id="topline_summary",
        job_name="Topline Summary View",
        execution_timeout=timedelta(hours=8),
        instance_count=instance_count,
        env=summary_env,
        uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/topline_summary_view.sh")

    topline_dashboard = EMRSparkOperator(
        dag=dag,
        task_id="topline_dashboard",
        job_name="Topline Dashboard",
        execution_timeout=timedelta(hours=2),
        instance_count=1,
        env={"mode": mode},
        uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/topline_dashboard.sh")

    # Dashboard depends on the freshly built summary data.
    topline_dashboard.set_upstream(topline_summary)
def add_search_rollup(dag, mode, instance_count, upstream=None):
    """Create a search rollup task for a particular date.

    This can be called with an optional task passed into `upstream`.
    The rollup job will inherit the default values of the referenced DAG.
    """
    search_rollup = EMRSparkOperator(
        task_id="search_rollup_{}".format(mode),
        job_name="{} search rollup".format(mode).title(),
        owner="*****@*****.**",
        email=[
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
        ],
        execution_timeout=timedelta(hours=4),
        instance_count=instance_count,
        disable_on_dev=True,
        env=mozetl_envvar(
            "search_rollup",
            {
                "start_date": "{{ ds_nodash }}",
                "mode": mode,
                "bucket": "net-mozaws-prod-us-west-2-pipeline-analysis",
                "prefix": "spenrose/search/to_vertica",
            }),
        uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
        dag=dag)

    if upstream:
        search_rollup.set_upstream(upstream)
def add_search_rollup(dag, mode, instance_count, upstream=None):
    """Create a search rollup task for a particular date."""
    search_rollup = EMRSparkOperator(
        task_id="search_rollup_{}".format(mode),
        job_name="{} search rollup".format(mode).title(),
        execution_timeout=timedelta(hours=4),
        instance_count=instance_count,
        env=mozetl_envvar("search_rollup", {
            "start_date": "{{ ds_nodash }}",
            "mode": mode,
            "bucket": "net-mozaws-prod-us-west-2-pipeline-analysis",
            "prefix": "spenrose/search/to_vertica",
        }),
        uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
        dag=dag
    )

    # Only wire a dependency when the caller supplies one.
    if upstream:
        search_rollup.set_upstream(upstream)
def topline_dag(dag, mode, instance_count):
    """Register the topline summary job and its downstream dashboard job on *dag*."""
    topline_summary = EMRSparkOperator(
        task_id="topline_summary",
        job_name="Topline Summary View",
        execution_timeout=timedelta(hours=8),
        instance_count=instance_count,
        env={
            "date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "mode": mode,
        },
        uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/topline_summary_view.sh",
        dag=dag,
    )

    topline_dashboard = EMRSparkOperator(
        task_id="topline_dashboard",
        job_name="Topline Dashboard",
        execution_timeout=timedelta(hours=2),
        instance_count=1,
        env={"mode": mode},
        uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/topline_dashboard.sh",
        dag=dag,
    )

    # The dashboard consumes the summary's output, so it must wait for it.
    topline_dashboard.set_upstream(topline_summary)
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('sync_log', default_args=default_args,
          schedule_interval='0 1 * * *')

t0 = EMRSparkOperator(
    dag=dag,
    task_id="sync_log",
    job_name="Sync Log Import",
    execution_timeout=timedelta(hours=10),
    release_label="emr-5.0.0",
    instance_count=10,
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    },
    uri="https://raw.githubusercontent.com/mozilla/mozilla-reports/master/etl/sync_log.kp/orig_src/ImportSyncLogs.ipynb")
'email': ['*****@*****.**', '*****@*****.**'], 'email_on_failure': True, 'email_on_retry': True, 'retries': 3, 'retry_delay': timedelta(minutes=30), } dag = DAG('taar_amodump', default_args=default_args, schedule_interval='@daily') amodump = EMRSparkOperator( task_id="taar_amodump", job_name="Dump AMO JSON blobs with oldest creation date", execution_timeout=timedelta(hours=1), instance_count=1, owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], env=mozetl_envvar("taar_amodump", {"date": "{{ ds_nodash }}"}, {'MOZETL_SUBMISSION_METHOD': 'python'}), uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", output_visibility="private", dag=dag ) amowhitelist = EMRSparkOperator( task_id="taar_amowhitelist", job_name="Generate an algorithmically defined set of whitelisted addons for TAAR", execution_timeout=timedelta(hours=1), instance_count=1, owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], env=mozetl_envvar("taar_amowhitelist",
"retries": 2, "retry_delay": timedelta(minutes=30), } dag = DAG("socorro_import", default_args=default_args, schedule_interval="@daily") # input: crashstats-telemetry-crashes-prod-us-west-2/v1/crash_report # output: telemetry-parquet/socorro_crash/v2 crash_report_parquet = EMRSparkOperator( task_id="crash_report_parquet", job_name="Socorro Crash Reports Parquet", execution_timeout=timedelta(hours=4), instance_count=10, env={"date": "{{ ds_nodash }}"}, uri= "https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/socorro_import/ImportCrashData.ipynb", output_visibility="public", dag=dag, ) register_status( crash_report_parquet, crash_report_parquet.job_name, "Convert processed crash reports into parquet for analysis", ) crash_report_parquet_bigquery_load = SubDagOperator( subdag=load_to_bigquery( parent_dag_name=dag.dag_id,
'email': ['*****@*****.**', '*****@*****.**'], 'email_on_failure': True, 'email_on_retry': True, 'retries': 2, 'retry_delay': timedelta(minutes=30), } dag = DAG('fx_usage_report', default_args=default_args, schedule_interval='@weekly') wait_for_main_summary = ExternalTaskSensor( task_id='wait_for_main_summary', external_dag_id='main_summary', external_task_id='main_summary', execution_delta=timedelta(days=-7, hours=-1), # main_summary waits one hour, execution date is beginning of the week dag=dag) usage_report = EMRSparkOperator( task_id="fx_usage_report", job_name="Fx Usage Report", execution_timeout=timedelta(hours=4), instance_count=10, owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**", "*****@*****.**"], env={"date": DS_WEEKLY, "bucket": "net-mozaws-prod-us-west-2-data-public", "deploy_environment": "{{ task.__class__.deploy_environment }}"}, uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/fx_usage_report.sh", dag=dag) usage_report.set_upstream(wait_for_main_summary)
aws_conn_id="aws_dev_iam_s3", dataset="main_summary", dataset_version="v4", gke_cluster_name="bq-load-gke-1", ), task_id="main_summary_bigquery_load", dag=dag) engagement_ratio = EMRSparkOperator( task_id="engagement_ratio", job_name="Update Engagement Ratio", execution_timeout=timedelta(hours=6), instance_count=10, env=mozetl_envvar("engagement_ratio", options={ "input_bucket": "{{ task.__class__.private_output_bucket }}", "output_bucket": "net-mozaws-prod-us-west-2-pipeline-analysis" }, dev_options={ "output_bucket": "{{ task.__class__.private_output_bucket }}" }), uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", output_visibility="public", dag=dag) addons = EMRSparkOperator( task_id="addons", job_name="Addons View", execution_timeout=timedelta(hours=4), instance_count=3, env=tbv_envvar("com.mozilla.telemetry.views.AddonsView", { "from": "{{ ds_nodash }}",
'depends_on_past': False, 'start_date': datetime(2099, 5, 31), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 3, 'retry_delay': timedelta(minutes=10), } dag = DAG('example', default_args=default_args, schedule_interval='@daily') spark = EMRSparkOperator( task_id="spark", job_name="Spark Example Job", instance_count=1, execution_timeout=timedelta(hours=4), env={"date": "{{ ds_nodash }}"}, uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/examples/spark/example_date.ipynb", dag=dag) bash = EMRSparkOperator( task_id="bash", job_name="Bash Example Job", instance_count=1, execution_timeout=timedelta(hours=4), env={"date": "{{ ds_nodash }}"}, uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/examples/spark/example_date.sh", dag=dag)
'email': ['*****@*****.**', '*****@*****.**'], 'email_on_failure': True, 'email_on_retry': True, 'retries': 2, 'retry_delay': timedelta(minutes=30), } dag = DAG('churn', default_args=default_args, schedule_interval='0 0 * * 3') churn = EMRSparkOperator( task_id="churn", job_name="churn 7-day v3", execution_timeout=timedelta(hours=4), instance_count=5, env=mozetl_envvar( "churn", { "start_date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.private_output_bucket }}" }), uri= "https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", output_visibility="public", dag=dag) churn_v2 = EMRSparkOperator( task_id="churn_v2", job_name="churn 7-day v2", execution_timeout=timedelta(hours=4), instance_count=5, env=mozetl_envvar( "churn", { "start_date": "{{ ds_nodash }}",
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('sync_log', default_args=default_args,
          schedule_interval='0 1 * * *')

sync_log = EMRSparkOperator(
    dag=dag,
    task_id="sync_log",
    job_name="Sync Log Import",
    execution_timeout=timedelta(hours=10),
    instance_count=10,
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/ImportSyncLogs.ipynb")
dag=dag) churn_v2_bigquery_load = SubDagOperator( subdag=load_to_bigquery( parent_dag_name=dag.dag_id, dag_name="churn_v2_bigquery_load", default_args=default_args, dataset_s3_bucket="telemetry-parquet", aws_conn_id="aws_dev_iam_s3", dataset="churn", dataset_version="v2", date_submission_col="week_start", gke_cluster_name="bq-load-gke-1", ), task_id="churn_v2_bigquery_load", dag=dag) churn_to_csv = EMRSparkOperator( task_id="churn_to_csv", job_name="Convert Churn v2 to csv", execution_timeout=timedelta(hours=4), instance_count=1, env=mozetl_envvar("churn_to_csv", {"start_date": "{{ ds_nodash }}"}), uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", dag=dag) churn_bigquery_load.set_upstream(churn) churn_to_csv.set_upstream(churn_v2) churn_v2_bigquery_load.set_upstream(churn_v2)
from operators.emr_spark_operator import EMRSparkOperator
from utils.constants import DS_WEEKLY

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2017, 5, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('core_client_count', default_args=default_args,
          schedule_interval='@weekly')

core_client_count_view = EMRSparkOperator(
    dag=dag,
    task_id="core_client_count_view",
    job_name="Core Client Count View",
    execution_timeout=timedelta(hours=4),
    instance_count=20,
    # DS_WEEKLY pins the run to the weekly partition rather than the daily ds.
    env={
        "date": DS_WEEKLY,
        "bucket": "{{ task.__class__.private_output_bucket }}",
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/core_client_count_view.sh")
"to": DS_WEEKLY }, metastore_location="s3://telemetry-parquet/longitudinal"), uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py", dag=dag) register_status(longitudinal, "Longitudinal", "A 6-month longitudinal view of client history.") game_hw_survey = EMRSparkOperator( task_id="game_hw_survey", job_name="Firefox Hardware Report", execution_timeout=timedelta(hours=5), instance_count=15, owner="*****@*****.**", depends_on_past=True, email=["*****@*****.**", "*****@*****.**", "*****@*****.**"], env={"date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.public_output_bucket }}"}, uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/hardware_report.sh", output_visibility="public", dag=dag) taar_lite_guidranking = EMRSparkOperator( task_id="taar_lite_guidranking", job_name="TAARlite Addon Ranking", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], execution_timeout=timedelta(hours=2), instance_count=4,
'email_on_failure': True, 'email_on_retry': True, 'retries': 3, 'retry_delay': timedelta(minutes=30), } try: dag = DAG('bugzilla_dataset', default_args=default_args, schedule_interval='@daily') connection_details = BaseHook.get_connection('bugzilla_db') env = { "DATABASE_USER": connection_details.login, "DATABASE_PASSWORD": connection_details.password, "DATABASE_HOST": connection_details.host, "DATABASE_PORT": connection_details.port, "DATABASE_NAME": connection_details.schema, } update_bugs = EMRSparkOperator( task_id="update_bugs", job_name="Bugzilla Dataset Update", execution_timeout=timedelta(hours=5), instance_count=1, env=env, uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/bugzilla_dataset.sh", dag=dag ) except AirflowException: pass
'email_on_failure': True, 'email_on_retry': True, 'retries': 3, 'retry_delay': timedelta(minutes=30), } dag = DAG('mobile_aggregates', default_args=default_args, schedule_interval='@daily') mobile_aggregate_view = EMRSparkOperator( task_id="mobile_aggregate_view", job_name="Mobile Aggregate View", instance_count=5, execution_timeout=timedelta(hours=12), env={ "date": "{{ ds_nodash }}", "channels": "nightly", "bucket": "{{ task.__class__.private_output_bucket }}", }, uri=("https://raw.githubusercontent.com/" "mozilla/telemetry-airflow/master/jobs/run_mobile_aggregator.sh"), dag=dag) register_status(mobile_aggregate_view, 'Mobile Aggregates', 'Aggregates of metrics sent through the mobile-events pings.')
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2016, 9, 20),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('android_events', default_args=default_args,
          schedule_interval='@daily')

t0 = EMRSparkOperator(
    dag=dag,
    task_id="android_events",
    job_name="Update android events",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/android-events/android-events.ipynb",
    output_visibility="public")
'start_date': datetime(2016, 6, 30), 'email': ['*****@*****.**', '*****@*****.**'], 'email_on_failure': True, 'email_on_retry': True, 'retries': 2, 'retry_delay': timedelta(minutes=30), } dag = DAG('longitudinal', default_args=default_args, schedule_interval='@weekly') t0 = EMRSparkOperator(task_id="longitudinal", job_name="Longitudinal View", execution_timeout=timedelta(hours=10), instance_count=30, env={"date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.airflow_bucket }}"}, uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/longitudinal_view.sh", dag=dag) t1 = EMRSparkOperator(task_id="update_orphaning", job_name="Update Orphaning View", execution_timeout=timedelta(hours=10), instance_count=1, owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**", "*****@*****.**"], env={"date": "{{ ds_nodash }}"}, uri="https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/update-orphaning/Update%20orphaning%20analysis%20using%20longitudinal%20dataset.ipynb", dag=dag) t1.set_upstream(t0)
2, 'retry_delay': timedelta(minutes=30), } dag = DAG('longitudinal', default_args=default_args, schedule_interval='@weekly') longitudinal = EMRSparkOperator( task_id="longitudinal", job_name="Longitudinal View", execution_timeout=timedelta(hours=12), instance_count=40, release_label="emr-5.11.0", env={ "date": DS_WEEKLY, "bucket": "{{ task.__class__.private_output_bucket }}" }, uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/longitudinal_view.sh", dag=dag) addon_recommender = EMRSparkOperator( task_id="addon_recommender", job_name="Train the Addon Recommender", execution_timeout=timedelta(hours=10), instance_count=20, owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], env={
'email': ['*****@*****.**', '*****@*****.**'], 'email_on_failure': True, 'email_on_retry': True, 'retries': 2, 'retry_delay': timedelta(minutes=30), } dag = DAG('main_summary', default_args=default_args, schedule_interval='@daily', max_active_runs=10) # Make sure all the data for the given day has arrived before running. t0 = BashOperator(task_id="delayed_start", bash_command="sleep 1800", dag=dag) t1 = EMRSparkOperator( task_id="main_summary", job_name="Main Summary View", execution_timeout=timedelta(hours=10), instance_count=10, env={ "date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.private_output_bucket }}" }, uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/main_summary_view.sh", dag=dag) # Wait a little while after midnight to start for a given day. t1.set_upstream(t0)
email=["*****@*****.**", "*****@*****.**"], env=mozetl_envvar("taar_amodump", {"date": "{{ ds_nodash }}"}, {'MOZETL_SUBMISSION_METHOD': 'python'}), uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", output_visibility="private", dag=dag ) amowhitelist = EMRSparkOperator( task_id="taar_amowhitelist", job_name="Generate a whitelisted set of addons for TAAR", execution_timeout=timedelta(hours=1), instance_count=1, owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], env=mozetl_envvar("taar_amowhitelist", {}, {'MOZETL_SUBMISSION_METHOD': 'spark'}), uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", output_visibility="private", dag=dag ) taar_lite = EMRSparkOperator( task_id="taar_lite", job_name="Generate GUID coinstallation JSON for TAAR", instance_count=5, execution_timeout=timedelta(hours=4), owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], env=mozetl_envvar("taar_lite",
'retry_delay': timedelta(minutes=30), 'bootstrap_args': ['--metrics-provider', 'datadog'], } dag = DAG('events_to_amplitude', default_args=default_args, schedule_interval='0 1 * * *') focus_events_to_amplitude = EMRSparkOperator( task_id="focus_android_events_to_amplitude", job_name="Focus Android Events to Amplitude", execution_timeout=timedelta(hours=8), instance_count=FOCUS_ANDROID_INSTANCES, env={ "date": "{{ ds_nodash }}", "max_requests": FOCUS_ANDROID_INSTANCES * VCPUS_PER_INSTANCE, "key_file": key_file("focus_android"), "artifact": get_artifact_url(slug, branch="master"), "config_filename": "focus_android_events_schemas.json", }, uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/events_to_amplitude.sh", dag=dag) devtools_prerelease_events_to_amplitude = EMRSparkOperator( task_id="devtools_prerelease_events_to_amplitude", job_name="DevTools Prerelease Events to Amplitude", execution_timeout=timedelta(hours=8), instance_count=DEVTOOLS_INSTANCES, email=['*****@*****.**', '*****@*****.**'], owner='*****@*****.**',
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2016, 7, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('addons', default_args=default_args, schedule_interval='@daily')

t0 = EMRSparkOperator(
    dag=dag,
    task_id="addons",
    job_name="Addons View",
    execution_timeout=timedelta(hours=4),
    release_label="emr-5.0.0",
    instance_count=10,
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/addons_view.sh")
'email': ['*****@*****.**', '*****@*****.**'], 'email_on_failure': True, 'email_on_retry': True, 'retries': 2, 'retry_delay': timedelta(minutes=30), } # Make sure all the data for the given day has arrived before running. # Running at 1am should suffice. dag = DAG('first_shutdown_summary', default_args=default_args, schedule_interval='0 1 * * *') first_shutdown_summary = EMRSparkOperator( task_id="first_shutdown_summary", job_name="First Shutdown Summary View", execution_timeout=timedelta(hours=1), instance_count=1, env=tbv_envvar( "com.mozilla.telemetry.views.MainSummaryView", { "from": "{{ ds_nodash }}", "to": "{{ ds_nodash }}", "bucket": "{{ task.__class__.private_output_bucket }}", "doc-type": "first_shutdown", "read-mode": "aligned", "input-partition-multiplier": "4" }), uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py", dag=dag)
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 20),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('client_count', default_args=default_args,
          schedule_interval='@daily')

client_count_view = EMRSparkOperator(
    dag=dag,
    task_id="client_count_view",
    job_name="Client Count View",
    execution_timeout=timedelta(hours=10),
    owner="*****@*****.**",
    instance_count=20,
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/client_count_view.sh")
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('mobile_clients', default_args=default_args,
          schedule_interval='@daily')

mobile_clients = EMRSparkOperator(
    dag=dag,
    task_id="mobile_clients",
    job_name="Update mobile clients",
    execution_timeout=timedelta(hours=8),
    instance_count=10,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/mobile-clients.ipynb",
    output_visibility="public")
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    # Runs must be processed in order for this dataset.
    'depends_on_past': True,
    'start_date': datetime(2016, 6, 29),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('telemetry_aggregates', default_args=default_args,
          schedule_interval='@daily')

telemetry_aggregate_view = EMRSparkOperator(
    dag=dag,
    task_id="telemetry_aggregate_view",
    job_name="Telemetry Aggregate View",
    owner="*****@*****.**",
    instance_count=10,
    execution_timeout=timedelta(hours=12),
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_aggregator.py")
'email_on_failure': True, 'email_on_retry': True, 'retries': 2, 'retry_delay': timedelta(minutes=30), } dag = DAG('fx_usage_report', default_args=default_args, schedule_interval='@weekly') wait_for_main_summary = ExternalTaskSensor( task_id='wait_for_main_summary', external_dag_id='main_summary', external_task_id='main_summary', execution_delta=timedelta(days=-7, hours=-1), # main_summary waits one hour, execution date is beginning of the week dag=dag) usage_report = EMRSparkOperator( task_id="fx_usage_report", job_name="Fx Usage Report", execution_timeout=timedelta(hours=4), instance_count=10, release_label="emr-5.11.0", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**", "*****@*****.**"], env={"date": DS_WEEKLY, "bucket": "{{ task.__class__.public_output_bucket }}", "deploy_environment": "{{ task.__class__.deploy_environment }}"}, uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/fx_usage_report.sh", dag=dag) usage_report.set_upstream(wait_for_main_summary)
rename={"submission_date_s3": "submission_date"}, replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"], ), task_id="main_summary_bigquery_load", dag=dag) engagement_ratio = EMRSparkOperator( task_id="engagement_ratio", job_name="Update Engagement Ratio", execution_timeout=timedelta(hours=6), instance_count=10, env=mozetl_envvar("engagement_ratio", options={ "input_bucket": "{{ task.__class__.private_output_bucket }}", "output_bucket": "net-mozaws-prod-us-west-2-pipeline-analysis" }, dev_options={ "output_bucket": "{{ task.__class__.private_output_bucket }}" }), uri= "https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", output_visibility="public", dag=dag) addons = EMRSparkOperator( task_id="addons", job_name="Addons View", execution_timeout=timedelta(hours=4), instance_count=3,
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 30),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('crash_summary', default_args=default_args,
          schedule_interval='@daily')

crash_summary_view = EMRSparkOperator(
    dag=dag,
    task_id="crash_summary_view",
    job_name="Crash Summary View",
    instance_count=20,
    execution_timeout=timedelta(hours=4),
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/crash_summary_view.sh")
2, 'retry_delay': timedelta(minutes=30), } dag = DAG('longitudinal', default_args=default_args, schedule_interval='@weekly') t0 = EMRSparkOperator( task_id="longitudinal", job_name="Longitudinal View", execution_timeout=timedelta(hours=10), release_label="emr-5.0.0", instance_count=30, env={ "date": DS_WEEKLY, "bucket": "{{ task.__class__.private_output_bucket }}" }, uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/longitudinal_view.sh", dag=dag) t1 = EMRSparkOperator( task_id="update_orphaning", job_name="Update Orphaning View", execution_timeout=timedelta(hours=10), instance_count=1, owner="*****@*****.**", email=[ "*****@*****.**", "*****@*****.**",
uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py", dag=dag) register_status(longitudinal, "Longitudinal", "A 6-month longitudinal view of client history.") addon_recommender = EMRSparkOperator( task_id="addon_recommender", job_name="Train the Addon Recommender", execution_timeout=timedelta(hours=10), instance_count=20, owner="*****@*****.**", email=[ "*****@*****.**", "*****@*****.**", "*****@*****.**" ], env={ "date": DS_WEEKLY, "privateBucket": "{{ task.__class__.private_output_bucket }}", "publicBucket": "{{ task.__class__.public_output_bucket }}" }, uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/addon_recommender.sh", dag=dag) game_hw_survey = EMRSparkOperator( task_id="game_hw_survey", job_name="Firefox Hardware Report", execution_timeout=timedelta(hours=5), instance_count=15, owner="*****@*****.**",
} # Make sure all the data for the given day has arrived before running. # Running at 1am should suffice. dag = DAG('main_summary', default_args=default_args, schedule_interval='0 1 * * *') main_summary = EMRSparkOperator( task_id="main_summary", job_name="Main Summary View", execution_timeout=timedelta(hours=14), instance_count=40, env=tbv_envvar( "com.mozilla.telemetry.views.MainSummaryView", { "from": "{{ ds_nodash }}", "to": "{{ ds_nodash }}", "schema-report-location": "s3://{{ task.__class__.private_output_bucket }}/schema/main_summary/submission_date_s3={{ ds_nodash }}", "bucket": "{{ task.__class__.private_output_bucket }}" }), uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py", dag=dag) main_summary_schema = EmailSchemaChangeOperator( task_id="main_summary_schema", email=["*****@*****.**", "*****@*****.**"], to=["*****@*****.**"], key_prefix='schema/main_summary/submission_date_s3=', dag=dag)
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2017, 3, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag_daily = DAG('probe_scraper', default_args=default_args,
                schedule_interval='@daily')

probe_scraper = EMRSparkOperator(
    dag=dag_daily,
    task_id="probe_scraper",
    job_name="Probe Scraper",
    execution_timeout=timedelta(hours=4),
    instance_count=1,
    owner="*****@*****.**",
    email=[
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
    ],
    # The scraper derives everything from its checkout; no env needed.
    env={},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/probe_scraper.sh",
    output_visibility="public")
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2016, 9, 20),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('android_addons', default_args=default_args,
          schedule_interval='@daily')

android_addons = EMRSparkOperator(
    dag=dag,
    task_id="android_addons",
    job_name="Update android addons",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    owner="*****@*****.**",
    email=[
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
    ],
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla/mozilla-reports/master/etl/android-addons.kp/orig_src/android-addons.ipynb",
    output_visibility="public")
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 20),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('crash_summary', default_args=default_args,
          schedule_interval='@daily')

crash_summary_view = EMRSparkOperator(
    dag=dag,
    task_id="crash_summary_view",
    job_name="Crash Summary View",
    instance_count=20,
    execution_timeout=timedelta(hours=4),
    env=tbv_envvar(
        "com.mozilla.telemetry.views.CrashSummaryView",
        {
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "outputBucket": "{{ task.__class__.private_output_bucket }}",
        }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py")
default_args=default_args, dataset_s3_bucket="telemetry-parquet", aws_conn_id="aws_dev_iam_s3", dataset="sync_flat_summary", dataset_version="v1", gke_cluster_name="bq-load-gke-1", ), task_id="sync_flat_view_bigquery_load", dag=dag) sync_bookmark_validation = EMRSparkOperator( task_id="sync_bookmark_validation", job_name="Sync Bookmark Validation", execution_timeout=timedelta(hours=2), instance_count=1, email=["*****@*****.**", "*****@*****.**"], env=mozetl_envvar("sync_bookmark_validation", { "start_date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.private_output_bucket }}", }), uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", dag=dag) sync_bookmark_validation.set_upstream(sync_view) sync_view_bigquery_load.set_upstream(sync_view) sync_events_view_bigquery_load.set_upstream(sync_events_view) sync_flat_view_bigquery_load.set_upstream(sync_flat_view)