Example no. 1
0
    execution_delta=timedelta(
        days=-7, hours=-1
    ),  # main_summary waits one hour, execution date is beginning of the week
    dag=taar_weekly,
)

# TAAR ensemble model training job. Runs on an autoscaling Databricks
# cluster and installs its own pinned pypi dependencies.
taar_ensemble = MozDatabricksSubmitRunOperator(
    task_id="taar_ensemble",
    job_name="TAAR Ensemble Model",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    start_date=datetime(2019, 7, 14),
    execution_timeout=timedelta(hours=11),
    # Cluster sizing: start at 5 i3.2xlarge workers, autoscale up to 60.
    instance_count=5,
    max_instance_count=60,
    enable_autoscale=True,
    instance_type="i3.2xlarge",
    spot_bid_price_percent=100,
    # Extra python packages installed on the cluster for this job.
    pypi_libs=[
        "mozilla-taar3==0.4.5",
        "mozilla-srgutil==0.1.10",
        "python-decouple==3.1",
    ],
    env=mozetl_envvar("taar_ensemble", {"date": "{{ ds_nodash }}"}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-databricks.py",
    output_visibility="private",
)

# Gate on clients_daily availability before training.
taar_ensemble.set_upstream(wait_for_clients_daily)
Example no. 2
0
          "bucket": "{{ task.__class__.private_output_bucket }}",
          "prefix": "taar/locale/"
    }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag)

# TAAR similarity model training: 11 large workers plus a smaller
# dedicated driver node.
taar_similarity = MozDatabricksSubmitRunOperator(
    task_id="taar_similarity",
    job_name="Taar Similarity model",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    execution_timeout=timedelta(hours=2),
    instance_count=11,
    instance_type="i3.8xlarge",
    driver_instance_type="i3.xlarge",
    env=mozetl_envvar(
        "taar_similarity",
        options={
            "date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "prefix": "taar/similarity/",
        },
    ),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag,
)

# Collaborative add-on recommender training; runs on EMR rather than
# Databricks, with its own @daily schedule.
taar_collaborative_recommender = EMRSparkOperator(
    task_id="addon_recommender",
    job_name="Train the Collaborative Addon Recommender",
    owner="*****@*****.**",
    schedule_interval="@daily",
    execution_timeout=timedelta(hours=10),
    instance_count=20,
)

# Prerelease aggregates job, run out of python_mozaggregator instead of
# python_mozetl (see the MOZETL_GIT_PATH override below).
# NOTE(review): this statement uses `dag` while a new DAG is assigned to
# `dag` just below it — presumably an earlier `dag` definition exists
# above this excerpt; confirm in the full file.
prerelease_telemetry_aggregate_view = MozDatabricksSubmitRunOperator(
    task_id="prerelease_telemetry_aggregate_view",
    job_name="Prerelease Telemetry Aggregate View",
    instance_count=10,
    dev_instance_count=10,
    execution_timeout=timedelta(hours=12),
    # Job is pinned to python 2.
    python_version=2,
    env=mozetl_envvar(
        "aggregator",
        {
            "date": "{{ ds_nodash }}",
            "channels": "nightly,aurora,beta",
            "credentials-bucket": "telemetry-spark-emr-2",
            "credentials-prefix": "aggregator_database_envvars.json",
            "num-partitions": 10 * 32,
        },
        # Dev runs read a separate credentials file.
        dev_options={"credentials-prefix": "aggregator_dev_database_envvars.json"},
        other={
            "MOZETL_GIT_PATH": "https://github.com/mozilla/python_mozaggregator.git",
            "MOZETL_EXTERNAL_MODULE": "mozaggregator",
        },
    ),
    dag=dag,
)
# Daily DAG that drives the telemetry aggregates parquet job below.
dag = DAG(
    "telemetry_aggregates_parquet",
    default_args=default_args,
    schedule_interval="@daily",
)

# Parquet aggregates for the nightly channel, written to the private
# output bucket; the job code comes from python_mozaggregator.
telemetry_aggregate_parquet_view = MozDatabricksSubmitRunOperator(
    task_id="telemetry_aggregate_parquet_view",
    job_name="Telemetry Aggregate Parquet View",
    instance_count=5,
    execution_timeout=timedelta(hours=12),
    # Job is pinned to python 2.
    python_version=2,
    env=mozetl_envvar(
        "parquet",
        {
            "date": "{{ ds_nodash }}",
            "channels": "nightly",
            "output": "s3://{{ task.__class__.private_output_bucket }}/aggregates_poc/v1",
        },
        other={
            "MOZETL_GIT_PATH": "https://github.com/mozilla/python_mozaggregator.git",
            "MOZETL_EXTERNAL_MODULE": "mozaggregator",
        },
    ),
    dag=dag,
)
Example no. 5
0
    aws_conn_id="aws_dev_iam_s3",
    dag=dag,
)

# Main summary including all histograms, written to the backfill bucket.
main_summary_all_histograms = MozDatabricksSubmitRunOperator(
    task_id="main_summary_all_histograms",
    job_name="Main Summary View - All Histograms",
    execution_timeout=timedelta(hours=12),
    # Autoscale from 5 up to 50 spot c4.4xlarge nodes with attached EBS.
    instance_count=5,
    max_instance_count=50,
    enable_autoscale=True,
    instance_type="c4.4xlarge",
    spot_bid_price_percent=50,
    ebs_volume_count=1,
    ebs_volume_size=250,
    env=tbv_envvar(
        "com.mozilla.telemetry.views.MainSummaryView",
        options={
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "bucket": "telemetry-backfill",
            "all_histograms": "",
            "read-mode": "aligned",
            "input-partition-multiplier": "400",
        },
        # Dev runs are restricted to the nightly channel.
        dev_options={
            "channel": "nightly",
        },
    ),
    dag=dag,
)

main_summary = MozDatabricksSubmitRunOperator(
    task_id="main_summary",
    job_name="Main Summary View",
Example no. 6
0
# Daily DAG for the mobile aggregates job.
dag = DAG(
    "mobile_aggregates",
    default_args=default_args,
    schedule_interval="@daily",
)

# Mobile metrics aggregates (v2), built from python_mozaggregator on a
# pinned Databricks runtime.
mobile_aggregate_view = MozDatabricksSubmitRunOperator(
    task_id="mobile_aggregate_view",
    job_name="Mobile Aggregate View",
    release_label="6.1.x-scala2.11",
    instance_count=5,
    execution_timeout=timedelta(hours=12),
    env=mozetl_envvar(
        "mobile",
        {
            "date": "{{ ds_nodash }}",
            "channels": "nightly",
            "output": "s3://{{ task.__class__.private_output_bucket }}/mobile_metrics_aggregates/v2",
            "num-partitions": 5 * 32,
        },
        other={
            "MOZETL_GIT_PATH": "https://github.com/mozilla/python_mozaggregator.git",
            "MOZETL_EXTERNAL_MODULE": "mozaggregator",
        },
    ),
    dag=dag,
)

register_status(
    mobile_aggregate_view,
    "Mobile Aggregates",
Example no. 7
0
    dag=dag,
)

# Addons Daily report; the job code lives in the addons_daily repository
# rather than python_mozetl (see MOZETL_GIT_PATH below).
addons_daily = MozDatabricksSubmitRunOperator(
    task_id="addons_daily",
    job_name="Addons Daily",
    owner="*****@*****.**",
    email=[
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
    ],
    execution_timeout=timedelta(hours=4),
    instance_count=10,
    env=mozetl_envvar(
        "addons_report",
        {
            "date": "{{ ds_nodash }}",
            "deploy_environment": "{{ task.__class__.deploy_environment }}",
        },
        other={
            "MOZETL_GIT_PATH": "https://github.com/mozilla/addons_daily.git",
            "MOZETL_EXTERNAL_MODULE": "addons_daily",
        },
    ),
    dag=dag,
)

# Only run once search_clients_daily data is available.
addons_daily.set_upstream(wait_for_search_clients_daily)
# All of the day's data should have arrived before this runs;
# scheduling at 1am gives it enough slack.
dag = DAG(
    'first_shutdown_summary',
    default_args=default_args,
    schedule_interval='0 1 * * *',
)

# First shutdown summary: reuses MainSummaryView with
# doc-type "first_shutdown".
first_shutdown_summary = MozDatabricksSubmitRunOperator(
    task_id="first_shutdown_summary",
    job_name="First Shutdown Summary View",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    env=tbv_envvar(
        "com.mozilla.telemetry.views.MainSummaryView",
        {
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "doc-type": "first_shutdown",
            "read-mode": "aligned",
            "input-partition-multiplier": "4",
        },
    ),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag,
)

first_shutdown_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="first_shutdown_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
Example no. 9
0
# Shared task defaults for the landfill DAG: two retries, half an hour
# apart, with email on failure/retry.
default_args = dict(
    owner='*****@*****.**',
    depends_on_past=False,
    start_date=datetime(2018, 9, 10),
    email=['*****@*****.**', '*****@*****.**'],
    email_on_failure=True,
    email_on_retry=True,
    retries=2,
    retry_delay=timedelta(minutes=30),
)

# Daily at 01:00.
dag = DAG(
    'landfill',
    default_args=default_args,
    schedule_interval='0 1 * * *',
)

# Samples the landfill dataset into the sanitized-landfill-sample prefix,
# using a dedicated instance profile for S3 access.
landfill_sampler = MozDatabricksSubmitRunOperator(
    task_id="landfill_sampler",
    job_name="Landfill Sampler",
    execution_timeout=timedelta(hours=2),
    instance_count=3,
    iam_role="arn:aws:iam::144996185633:instance-profile/databricks-ec2-landfill",
    env=mozetl_envvar(
        "landfill_sampler",
        {
            "submission-date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "prefix": "sanitized-landfill-sample",
        },
    ),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag,
)
Example no. 10
0
# All of the day's data should have arrived before this runs;
# scheduling at 1am gives it enough slack.
dag = DAG(
    'main_summary',
    default_args=default_args,
    schedule_interval='0 1 * * *',
)

# Main summary including all histograms, written to the backfill bucket.
main_summary_all_histograms = MozDatabricksSubmitRunOperator(
    task_id="main_summary_all_histograms",
    job_name="Main Summary View - All Histograms",
    execution_timeout=timedelta(hours=12),
    # Autoscale from 5 up to 50 spot c4.4xlarge nodes with attached EBS.
    instance_count=5,
    max_instance_count=50,
    enable_autoscale=True,
    instance_type="c4.4xlarge",
    spot_bid_price_percent=50,
    ebs_volume_count=1,
    ebs_volume_size=250,
    env=tbv_envvar(
        "com.mozilla.telemetry.views.MainSummaryView",
        options={
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "bucket": "telemetry-backfill",
            "all_histograms": "",
            "read-mode": "aligned",
            "input-partition-multiplier": "400",
        },
        # Dev runs are restricted to the nightly channel.
        dev_options={
            "channel": "nightly",
        },
    ),
    dag=dag,
)

main_summary = MozDatabricksSubmitRunOperator(
    task_id="main_summary",
    job_name="Main Summary View",
Example no. 11
0
        dataset="churn",
        dataset_version="v3",
        date_submission_col="week_start",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="churn_bigquery_load",
    dag=dag)

# Legacy 7-day churn job, pinned to the churn-v2 branch of mozetl.
churn_v2 = MozDatabricksSubmitRunOperator(
    task_id="churn_v2",
    job_name="churn 7-day v2",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    env=mozetl_envvar(
        "churn",
        {
            "start_date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
        },
        other={
            "MOZETL_GIT_BRANCH": "churn-v2",
        },
    ),
    # The churn-v2 mozetl branch was forked before python 3 support.
    python_version=2,
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag,
)

churn_v2_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="churn_v2_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
Example no. 12
0
          schedule_interval='0 1 * * *')

# Main summary including all histograms, written to the backfill bucket;
# this variant also emits a schema report alongside the output.
main_summary_all_histograms = MozDatabricksSubmitRunOperator(
    task_id="main_summary_all_histograms",
    job_name="Main Summary View - All Histograms",
    execution_timeout=timedelta(hours=12),
    # Autoscale from 5 up to 50 spot c4.4xlarge nodes with attached EBS.
    instance_count=5,
    max_instance_count=50,
    enable_autoscale=True,
    instance_type="c4.4xlarge",
    spot_bid_price_percent=50,
    ebs_volume_count=1,
    ebs_volume_size=250,
    env=tbv_envvar(
        "com.mozilla.telemetry.views.MainSummaryView",
        options={
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "schema-report-location": "s3://{{ task.__class__.private_output_bucket }}/schema/main_summary/submission_date_s3={{ ds_nodash }}",
            "bucket": "telemetry-backfill",
            "all_histograms": "",
            "read-mode": "aligned",
            "input-partition-multiplier": "400",
        },
        # Dev runs are restricted to the nightly channel.
        dev_options={
            "channel": "nightly",
        },
    ),
    dag=dag,
)

main_summary = EMRSparkOperator(
Example no. 13
0
        aws_conn_id=taar_aws_conn_id,
        gcp_conn_id=taar_gcpdataproc_conn_id,
    ),
    dag=dag,
)

# Trains the TAAR similarity model: 11 large workers with a smaller
# dedicated driver node.
taar_similarity = MozDatabricksSubmitRunOperator(
    task_id="taar_similarity",
    job_name="Taar Similarity model",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    execution_timeout=timedelta(hours=2),
    instance_count=11,
    instance_type="i3.8xlarge",
    driver_instance_type="i3.xlarge",
    env=mozetl_envvar(
        "taar_similarity",
        options={
            "date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "prefix": "taar/similarity/",
        },
    ),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag,
)

taar_collaborative_recommender = SubDagOperator(
    task_id="addon_recommender",
    subdag=moz_dataproc_jar_runner(
        parent_dag_name=dag.dag_id,
        dag_name="addon_recommender",
Example no. 14
0
# Daily DAG for the crash summary job.
dag = DAG(
    'crash_summary',
    default_args=default_args,
    schedule_interval='@daily',
)

# Deliberately not autoscaled: the bottleneck for this job is not CPU.
crash_summary_view = MozDatabricksSubmitRunOperator(
    task_id="crash_summary_view",
    job_name="Crash Summary View",
    dev_instance_count=1,
    instance_count=1,
    instance_type="c4.4xlarge",
    ebs_volume_count=1,
    ebs_volume_size=250,
    execution_timeout=timedelta(hours=4),
    env=tbv_envvar(
        "com.mozilla.telemetry.views.CrashSummaryView",
        {
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "outputBucket": "{{ task.__class__.private_output_bucket }}",
        },
    ),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag,
)

crash_summary_view_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="crash_summary_view_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
Example no. 15
0
    job_name="A placeholder for the implicit clients daily dependency",
    dag=dag,
)

# Fits BGBB model parameters; the job code lives in the bgbb_airflow
# repository (see MOZETL_GIT_PATH below).
bgbb_fit = MozDatabricksSubmitRunOperator(
    task_id="bgbb_fit",
    job_name="Fit parameters for a BGBB model to determine active profiles",
    execution_timeout=timedelta(hours=2),
    instance_count=3,
    env=mozetl_envvar(
        "bgbb_fit",
        {
            "submission-date": "{{ next_ds }}",
            "model-win": "120",
            "start-params": "[0.387, 0.912, 0.102, 1.504]",
            "sample-ids": "[42]",
            "sample-fraction": "1.0",
            "penalizer-coef": "0.01",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "prefix": "bgbb/params/v1",
        },
        # Dev runs use a shorter model window.
        dev_options={"model-win": "30"},
        other={
            "MOZETL_GIT_PATH": "https://github.com/wcbeard/bgbb_airflow.git",
            "MOZETL_EXTERNAL_MODULE": "bgbb_airflow",
        },
    ),
    dag=dag,
)

# The implicit clients_daily dependency gates the fit.
clients_daily_v6_dummy >> bgbb_fit
Example no. 16
0
    2,
    'retry_delay':
    timedelta(minutes=30),
}

# Weekly DAG for the longitudinal view.
dag = DAG(
    'longitudinal',
    default_args=default_args,
    schedule_interval='@weekly',
)

# Builds the longitudinal view up to the DS_WEEKLY boundary on a large
# i3 cluster.
longitudinal = MozDatabricksSubmitRunOperator(
    task_id="longitudinal",
    job_name="Longitudinal View",
    execution_timeout=timedelta(hours=12),
    instance_count=16,
    instance_type="i3.8xlarge",
    env=tbv_envvar(
        "com.mozilla.telemetry.views.LongitudinalView",
        {
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "to": DS_WEEKLY,
        },
        metastore_location="s3://telemetry-parquet/longitudinal",
    ),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag,
)

register_status(
    longitudinal,
    "Longitudinal",
    "A 6-month longitudinal view of client history.",
)

addon_recommender = EMRSparkOperator(
    task_id="addon_recommender",
    job_name="Train the Addon Recommender",
    execution_timeout=timedelta(hours=10),
    instance_count=20,
Example no. 17
0
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.moz_databricks import MozDatabricksSubmitRunOperator

from utils.mozetl import mozetl_envvar

# Shared task defaults: three retries, half an hour apart, with email on
# failure/retry.
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

# Daily DAG for the tab spinner severity job.
dag = DAG(
    'tab_spinner_severity',
    default_args=default_args,
    schedule_interval='@daily',
)

update_tab_spinner_severity = MozDatabricksSubmitRunOperator(
    task_id="update_tab_spinner_severity",
    job_name="Tab Spinner Severity Job",
    execution_timeout=timedelta(hours=12),
    instance_count=12,
    env=mozetl_envvar("long_tab_spinners", {}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag,
)