Exemplo n.º 1
0
def tbv_envvar(klass,
               options,
               dev_options={},
               branch=None,
               tag=None,
               other={},
               metastore_location=None,
               artifact_url=None):
    """Set up environment variables for telemetry-batch-view jobs.

    The command line interface can read options from the environment. All
    environment variables must be prefixed by `TBV_`. For example, a class in
    telemetry-batch-view taking a `--date` option can use `TBV_DATE` instead.
    There is a limitation that spaces cannot be in environment variables, so
    ValueError is thrown if spaces are found outside templating brackets.

    :klass string:      name of the class in telemetry-batch-view
    :options dict:      environment variables to prefix
    :dev_options dict:  variables to use when in the development environment
    :branch string:     the branch to run the job from, incompatible with tag
    :tag string:        the tag to run the job from, incompatible with branch
    :other dict:        environment variables to pass through
    :metastore_location string: Location of the data-set metastore
    :artifact_url string:       Location of pre-built binaries

    :returns: a dictionary that contains properly prefixed class and options
    """
    if artifact_url is None:
        slug = "{{ task.__class__.telemetry_batch_view_slug }}"
        url = get_artifact_url(slug, branch=branch, tag=tag)
    else:
        url = artifact_url

    if EMRSparkOperator.deploy_environment == 'dev':
        options.update(dev_options)

    prefixed_options = {
        "TBV_{}".format(key.replace("-", "_")): value
        for key, value in options.items()
    }

    if klass is not None:
        prefixed_options["TBV_CLASS"] = klass
    else:
        assert other.get(
            "DO_SUBMIT",
            "True") == "False", "To submit there must be a class name"

    if metastore_location is not None:
        prefixed_options["METASTORE_LOCATION"] = metastore_location

    prefixed_options["ARTIFACT_URL"] = url
    prefixed_options.update(other)

    # raise ValueError if spaces found in non-templated envvar values
    for item in prefixed_options.values():
        if "{{" not in item and " " in item:
            raise ValueError("env cannot contain spaces: '{}'".format(item))

    return prefixed_options
Exemplo n.º 2
0
def tbv_envvar(klass, options, dev_options={}, branch=None, tag=None, other={},
               metastore_location=None, artifact_url=None):
    """Set up environment variables for telemetry-batch-view jobs.

    The command line interface can read options from the environment. All
    environment variables must be prefixed by `TBV_`. For example, a class in
    telemetry-batch-view taking a `--date` option can use `TBV_DATE` instead.
    There is a limitation that spaces cannot be in environment variables, so
    ValueError is thrown if spaces are found outside templating brackets.

    :klass string:      name of the class in telemetry-batch-view
    :options dict:      environment variables to prefix
    :dev_options dict:  variables to use when in the development environment
    :branch string:     the branch to run the job from, incompatible with tag
    :tag string:        the tag to run the job from, incompatible with branch
    :other dict:        environment variables to pass through
    :metastore_location string: Location of the data-set metastore
    :artifact_url string:       Location of pre-built binaries

    :returns: a dictionary that contains properly prefixed class and options
    """
    if artifact_url is None:
        slug = "{{ task.__class__.telemetry_batch_view_slug }}"
        url = get_artifact_url(slug, branch=branch, tag=tag)
    else:
        url = artifact_url

    if EMRSparkOperator.deploy_environment == 'dev':
        options.update(dev_options)

    prefixed_options = {
        "TBV_{}".format(key.replace("-", "_")): value
        for key, value in options.items()
    }

    if klass is not None:
        prefixed_options["TBV_CLASS"] = klass
    else:
        assert other.get("DO_SUBMIT", "True") == "False", "To submit there must be a class name"

    if metastore_location is not None:
        prefixed_options["METASTORE_LOCATION"] = metastore_location

    prefixed_options["ARTIFACT_URL"] = url
    prefixed_options.update(other)

    # raise ValueError if spaces found in non-templated envvar values
    for item in prefixed_options.values():
        if "{{" not in item and " " in item:
            raise ValueError("env cannot contain spaces: '{}'".format(item))

    return prefixed_options
}

dag = DAG('events_to_amplitude',
          default_args=default_args,
          schedule_interval='0 1 * * *')

focus_events_to_amplitude = EMRSparkOperator(
    task_id="focus_android_events_to_amplitude",
    job_name="Focus Android Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=FOCUS_ANDROID_INSTANCES,
    env={
        "date": "{{ ds_nodash }}",
        "max_requests": FOCUS_ANDROID_INSTANCES * VCPUS_PER_INSTANCE,
        "key_file": key_file("focus_android"),
        "artifact": get_artifact_url(slug, branch="master"),
        "config_filename": "focus_android_events_schemas.json",
    },
    uri=
    "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/events_to_amplitude.sh",
    dag=dag)

devtools_prerelease_events_to_amplitude = EMRSparkOperator(
    task_id="devtools_prerelease_events_to_amplitude",
    job_name="DevTools Prerelease Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=DEVTOOLS_INSTANCES,
    email=['*****@*****.**', '*****@*****.**'],
    owner='*****@*****.**',
    env={
        "date": "{{ ds_nodash }}",
from operators.emr_spark_operator import EMRSparkOperator
from utils.constants import DS_WEEKLY
from utils.mozetl import mozetl_envvar
from utils.deploy import get_artifact_url

FOCUS_ANDROID_INSTANCES = 10
VCPUS_PER_INSTANCE = 16

environment = "{{ task.__class__.deploy_environment }}"
key_file = "s3://telemetry-airflow/config/amplitude/{}/apiKey".format(
    environment)
config_file = "focus_android_events_schemas.json"

slug = "{{ task.__class__.telemetry_streaming_slug }}"
tag = "v1.0.1"
url = get_artifact_url(slug, tag=tag)

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('events_to_amplitude',
          default_args=default_args,
          schedule_interval='0 1 * * *')
Exemplo n.º 5
0
    task_id="experiments_error_aggregates",
    job_name="Experiments Error Aggregates View",
    execution_timeout=timedelta(hours=5),
    instance_count=20,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env=tbv_envvar(
        "com.mozilla.telemetry.streaming.ExperimentsErrorAggregator",
        options={
            "from": "{{ ds_nodash }}",
            "to": "{{ds_nodash }}",
            "outputPath": "s3://{{ task.__class__.private_output_bucket }}",
            "numParquetFiles": "6"
        },
        dev_options={"channel": "nightly"},
        artifact_url=get_artifact_url(
            "{{ task.__class__.telemetry_streaming_slug }}")),
    uri=
    "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
                      options={
                          "input_bucket":
                          "{{ task.__class__.private_output_bucket }}",
                          "output_bucket":
                          "net-mozaws-prod-us-west-2-pipeline-analysis"
Exemplo n.º 6
0
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator
from utils.constants import DS_WEEKLY
from utils.mozetl import mozetl_envvar
from utils.deploy import get_artifact_url

FOCUS_ANDROID_INSTANCES = 10
VCPUS_PER_INSTANCE = 16

environment = "{{ task.__class__.deploy_environment }}"
key_file = "s3://telemetry-airflow/config/amplitude/{}/apiKey".format(environment)
config_file = "focus_android_events_schemas.json"

slug = "{{ task.__class__.telemetry_streaming_slug }}"
url = get_artifact_url(slug)

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('events_to_amplitude', default_args=default_args, schedule_interval='0 1 * * *')

dag = DAG('events_to_amplitude',
          default_args=default_args,
          schedule_interval='0 1 * * *')

focus_events_to_amplitude = EMRSparkOperator(
    task_id="focus_android_events_to_amplitude",
    job_name="Focus Android Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=FOCUS_ANDROID_INSTANCES,
    env={
        "date": "{{ ds_nodash }}",
        "max_requests": FOCUS_ANDROID_INSTANCES * VCPUS_PER_INSTANCE,
        "key_file": key_file("focus_android"),
        # This Focus events job is pinned to a tag for now due to breaking changes in telemetry-streaming.
        "artifact": get_artifact_url(slug, tag="v1.0.1"),
        "config_filename": "focus_android_events_schemas.json",
    },
    uri=
    "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/events_to_amplitude.sh",
    dag=dag)

devtools_events_to_amplitude = EMRSparkOperator(
    task_id="devtools_events_to_amplitude",
    job_name="DevTools Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=DEVTOOLS_INSTANCES,
    env={
        "date": "{{ ds_nodash }}",
        "max_requests": DEVTOOLS_INSTANCES * VCPUS_PER_INSTANCE,
        "key_file": key_file("devtools"),
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator
from utils.deploy import get_artifact_url
from utils.tbv import tbv_envvar


slug = "{{ task.__class__.telemetry_streaming_slug }}"
url = get_artifact_url(slug)

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('event_ping_events', default_args=default_args, schedule_interval='0 1 * * *')

event_ping_events = EMRSparkOperator(
    task_id="event_ping_events",
    job_name="Event Ping Events Dataset",
    execution_timeout=timedelta(hours=8),
    instance_count=5,
    env=tbv_envvar("com.mozilla.telemetry.streaming.EventPingEvents", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",