Example #1
def bigquery_xcom_query(
    destination_table,
    dataset_id,
    xcom_task_id,
    parameters=(),
    arguments=(),
    project_id=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    date_partition_parameter="submission_date",
    **kwargs
):
    """ Generate a GKEPodOperator which runs an xcom result as a bigquery query.

    :param str destination_table:                  [Required] BigQuery destination table
    :param str dataset_id:                         [Required] BigQuery default dataset id
    :param str xcom_task_id:                       [Required] task_id which generated the xcom to pull
    :param Tuple[str] parameters:                  Parameters passed to bq query
    :param Tuple[str] arguments:                   Additional bq query arguments
    :param Optional[str] project_id:               BigQuery default project id
    :param str gcp_conn_id:                        Airflow connection id for GCP access
    :param str gke_location:                       GKE cluster location
    :param str gke_cluster_name:                   GKE cluster name
    :param str gke_namespace:                      GKE cluster namespace
    :param str docker_image:                       docker image to use
    :param Optional[str] date_partition_parameter: Parameter for indicating destination
                                                   partition to generate, if None
                                                   destination should be whole table
                                                   rather than partition
    :param Dict[str, Any] kwargs:                  Additional keyword arguments for
                                                   GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    if destination_table is not None and date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}",)
    query = "{{ " + "task_instance.xcom_pull({!r})".format(xcom_task_id) + " }}"
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["bq"]
        + ["query"]
        + (["--destination_table=" + destination_table] if destination_table else [])
        + ["--dataset_id=" + dataset_id]
        + (["--project_id=" + project_id] if project_id else [])
        + ["--parameter=" + parameter for parameter in parameters]
        + list(arguments)
        + [query],
        **kwargs
    )
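A brief usage sketch, not from the original source: it assumes bigquery_xcom_query is importable (in Mozilla's telemetry-airflow these helpers appear to live in utils/gcp.py) and that an upstream PythonOperator pushes the SQL string via XCom.

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

from utils.gcp import bigquery_xcom_query  # assumed import path

default_args = {"owner": "example@mozilla.com", "start_date": datetime(2020, 1, 1)}

with DAG("example_xcom_query", default_args=default_args, schedule_interval="0 1 * * *") as dag:
    # The callable's return value is pushed as an XCom under task_id "generate_query".
    generate_query = PythonOperator(
        task_id="generate_query",
        python_callable=lambda: "SELECT 1 AS placeholder",  # placeholder SQL
    )

    # The helper templates "{{ task_instance.xcom_pull('generate_query') }}" as the
    # final argument to `bq query`, so the SQL is resolved at run time.
    run_query = bigquery_xcom_query(
        destination_table="example_table_v1",  # placeholder table
        dataset_id="example_derived",          # placeholder dataset
        xcom_task_id="generate_query",
    )

    generate_query >> run_query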
Example #2
def bigquery_etl_query(destination_table,
                       dataset_id,
                       parameters=(),
                       arguments=(),
                       project_id=None,
                       sql_file_path=None,
                       gcp_conn_id="google_cloud_derived_datasets",
                       gke_location="us-central1-a",
                       gke_cluster_name="bq-load-gke-1",
                       gke_namespace="default",
                       docker_image="mozilla/bigquery-etl:latest",
                       date_partition_parameter="submission_date",
                       multipart=False,
                       **kwargs):
    """ Generate.

    :param str destination_table:                  [Required] BigQuery destination table
    :param str dataset_id:                         [Required] BigQuery default dataset id
    :param Tuple[str] parameters:                  Parameters passed to bq query
    :param Tuple[str] arguments:                   Additional bq query arguments
    :param Optional[str] project_id:               BigQuery default project id
    :param Optional[str] sql_file_path:            Optional override for path to the
                                                   SQL query file to run
    :param str gcp_conn_id:                        Airflow connection id for GCP access
    :param str gke_location:                       GKE cluster location
    :param str gke_cluster_name:                   GKE cluster name
    :param str gke_namespace:                      GKE cluster namespace
    :param str docker_image:                       docker image to use
    :param Optional[str] date_partition_parameter: Parameter for indicating destination
                                                   partition to generate, if None
                                                   destination should be whole table
                                                   rather than partition
    :param Dict[str, Any] kwargs:                  Additional keyword arguments for
                                                   GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    sql_file_path = sql_file_path or "sql/{}/{}/query.sql".format(
        dataset_id, destination_table)
    if destination_table is not None and date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}", )
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/run_multipart_query" if multipart else "query"] +
        (["--destination_table=" +
          destination_table] if destination_table else []) +
        ["--dataset_id=" + dataset_id] +
        (["--project_id=" + project_id] if project_id else []) +
        ["--parameter=" + parameter
         for parameter in parameters] + list(arguments) + [sql_file_path],
        **kwargs)
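An illustrative sketch (assumed import path) of the date_partition_parameter behaviour documented above: the default call writes a single date partition, while passing None overwrites the whole destination table.

from datetime import datetime

from airflow import DAG

from utils.gcp import bigquery_etl_query  # assumed import path

default_args = {"owner": "example@mozilla.com", "start_date": datetime(2020, 1, 1)}

with DAG("example_etl", default_args=default_args, schedule_interval="0 2 * * *") as dag:
    # Default: the destination becomes "example_daily_v1${{ds_nodash}}" and
    # "--parameter=submission_date:DATE:{{ds}}" is appended, so only the
    # execution date's partition is (re)written.
    example_daily = bigquery_etl_query(
        destination_table="example_daily_v1",  # placeholder table
        dataset_id="example_derived",          # placeholder dataset
    )

    # date_partition_parameter=None adds neither the partition decorator nor the
    # date parameter, so the query replaces the whole table.
    example_full_refresh = bigquery_etl_query(
        destination_table="example_full_refresh_v1",
        dataset_id="example_derived",
        date_partition_parameter=None,
    )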
Example #3
def bigquery_etl_copy_deduplicate(task_id,
                                  target_project_id,
                                  only_tables=None,
                                  except_tables=None,
                                  parallelism=4,
                                  priority="INTERACTIVE",
                                  hourly=False,
                                  slices=None,
                                  gcp_conn_id="google_cloud_derived_datasets",
                                  gke_location="us-central1-a",
                                  gke_cluster_name="bq-load-gke-1",
                                  gke_namespace="default",
                                  docker_image="mozilla/bigquery-etl:latest",
                                  **kwargs):
    """ Copy a day's data from live ping tables to stable ping tables,
    deduplicating on document_id.

    :param str task_id:              [Required] ID for the task
    :param str target_project_id:    [Required] ID of project where target tables live
    :param Tuple[str] only_tables:   Only process tables matching the given globs of form 'telemetry_live.main_v*'
    :param Tuple[str] except_tables: Process all tables except those matching the given globs
    :param int parallelism:          Maximum number of queries to execute concurrently
    :param str priority:             BigQuery query priority to use, must be BATCH or INTERACTIVE
    :param bool hourly:              Alias for --slices=24
    :param int slices:               Number of time-based slices to deduplicate in, rather than for whole days at once
    :param str gcp_conn_id:          Airflow connection id for GCP access
    :param str gke_location:         GKE cluster location
    :param str gke_cluster_name:     GKE cluster name
    :param str gke_namespace:        GKE cluster namespace
    :param str docker_image:         docker image to use
    :param Dict[str, Any] kwargs:    Additional keyword arguments for
                                     GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    table_qualifiers = []
    if only_tables:
        table_qualifiers.append('--only')
        table_qualifiers += only_tables
    if except_tables:
        table_qualifiers.append('--except')
        table_qualifiers += except_tables
    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/copy_deduplicate"] +
        ["--project-id=" + target_project_id] + ["--date={{ds}}"] +
        ["--parallelism={}".format(parallelism)] +
        ["--priority={}".format(priority)] + (["--hourly"] if hourly else []) +
        (["--slices={}".format(slices)] if slices is not None else []) +
        table_qualifiers,
        **kwargs)
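A hedged usage sketch (assumed import path; the glob follows the form given in the docstring) showing only_tables and except_tables together:

from datetime import datetime

from airflow import DAG

from utils.gcp import bigquery_etl_copy_deduplicate  # assumed import path

default_args = {"owner": "example@mozilla.com", "start_date": datetime(2020, 1, 1)}

with DAG("example_copy_deduplicate", default_args=default_args, schedule_interval="0 1 * * *") as dag:
    # Deduplicate only the main ping, sliced into 20 time ranges at BATCH priority.
    copy_deduplicate_main_ping = bigquery_etl_copy_deduplicate(
        task_id="copy_deduplicate_main_ping",
        target_project_id="moz-fx-data-shared-prod",
        only_tables=["telemetry_live.main_v*"],
        priority="BATCH",
        slices=20,
    )

    # Deduplicate all remaining live tables, skipping the one handled above.
    copy_deduplicate_all = bigquery_etl_copy_deduplicate(
        task_id="copy_deduplicate_all",
        target_project_id="moz-fx-data-shared-prod",
        except_tables=["telemetry_live.main_v*"],
    )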
Example #4
def bigquery_etl_copy_deduplicate(
    task_id,
    target_project_id,
    only_tables=None,
    except_tables=None,
    parallelism=4,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    image_pull_policy="Always",
    **kwargs
):
    """ Copy a day's data from live ping tables to stable ping tables,
    deduplicating on document_id.

    :param str task_id:              [Required] ID for the task
    :param str target_project_id:    [Required] ID of project where target tables live
    :param Tuple[str] only_tables:   Only process tables matching the given globs of form 'telemetry_live.main_v*'
    :param Tuple[str] except_tables: Process all tables except those matching the given globs
    :param int parallelism:          Maximum number of queries to execute concurrently
    :param str gcp_conn_id:          Airflow connection id for GCP access
    :param str gke_location:         GKE cluster location
    :param str gke_cluster_name:     GKE cluster name
    :param str gke_namespace:        GKE cluster namespace
    :param str docker_image:         docker image to use
    :param str image_pull_policy:    Kubernetes policy for when to pull
                                     docker_image
    :param Dict[str, Any] kwargs:    Additional keyword arguments for
                                     GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    table_qualifiers = []
    if only_tables:
        table_qualifiers.append('--only')
        table_qualifiers += only_tables
    if except_tables:
        table_qualifiers.append('--except')
        table_qualifiers += except_tables
    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/copy_deduplicate"]
        + ["--project-id=" + target_project_id]
        + ["--date={{ds}}"]
        + ["--parallelism={}".format(parallelism)]
        + table_qualifiers,
        image_pull_policy=image_pull_policy,
        **kwargs
    )
Example #5
def gke_command(
    task_id,
    command,
    docker_image,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    image_pull_policy="Always",
    xcom_push=False,
    env_vars={},
    **kwargs
):
    """ Run a docker command on GKE

    :param str task_id:            [Required] ID for the task
    :param List[str] command:      [Required] Command to run
    :param str docker_image:       [Required] docker image to use
    :param str aws_conn_id:        Airflow connection id for AWS access
    :param str gcp_conn_id:        Airflow connection id for GCP access
    :param str gke_location:       GKE cluster location
    :param str gke_cluster_name:   GKE cluster name
    :param str gke_namespace:      GKE cluster namespace
    :param str image_pull_policy:  Kubernetes policy for when to pull
                                   docker_image
    :param bool xcom_push:         Return the output of this command as an xcom
    :param Dict[str, str] env_vars: Environment variables to set in the container
    :param Dict[str, Any] kwargs:  Additional keyword arguments for
                                   GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    context_env_vars = {
        key: value
        for key, value in zip(
            ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"),
            AwsHook(aws_conn_id).get_credentials() if aws_conn_id else (),
        )
        if value is not None}
    context_env_vars["XCOM_PUSH"] = json.dumps(xcom_push)
    context_env_vars.update(env_vars)

    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=command,
        image_pull_policy=image_pull_policy,
        xcom_push=xcom_push,
        env_vars=context_env_vars,
        **kwargs
    )
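A hedged usage sketch (assumed import path; the command module is a placeholder) showing how gke_command forwards AWS credentials and extra env_vars into the pod:

from datetime import datetime

from airflow import DAG

from utils.gcp import gke_command  # assumed import path

default_args = {"owner": "example@mozilla.com", "start_date": datetime(2020, 1, 1)}

with DAG("example_gke_command", default_args=default_args, schedule_interval="0 3 * * *") as dag:
    # Credentials behind aws_conn_id are exposed to the container as
    # AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_SESSION_TOKEN,
    # and the explicit env_vars are merged on top of them.
    example_export = gke_command(
        task_id="example_export",
        command=["python3", "-m", "example_package.export", "--date={{ ds }}"],  # placeholder module
        docker_image="mozilla/bigquery-etl:latest",
        env_vars={"PROJECT_ID": "moz-fx-data-shared-prod"},
    )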
Example #6
def bigquery_etl_query(
    destination_table,
    parameters=(),
    arguments=(),
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    image_pull_policy="Always",
    date_partition_parameter="submission_date",
    **kwargs
):
    """ Generate.

    :param str destination_table:                  [Required] BigQuery destination table
    :param Tuple[str] parameters:                  Parameters passed to bq query
    :param Tuple[str] arguments:                   Additional bq query arguments
    :param str gcp_conn_id:                        Airflow connection id for GCP access
    :param str gke_location:                       GKE cluster location
    :param str gke_cluster_name:                   GKE cluster name
    :param str gke_namespace:                      GKE cluster namespace
    :param str docker_image:                       docker image to use
    :param str image_pull_policy:                  Kubernetes policy for when to pull
                                                   docker_image
    :param Optional[str] date_partition_parameter: Parameter for indicating destination
                                                   partition to generate, if None
                                                   destination should be whole table
                                                   rather than partition
    :param Dict[str, Any] kwargs:                  Additional keyword arguments for
                                                   GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    sql_file_path = "sql/{}.sql".format(destination_table)
    if date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}",)
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["query"]
        + ["--destination_table=" + destination_table]
        + ["--parameter=" + parameter for parameter in parameters]
        + list(arguments)
        + [sql_file_path],
        image_pull_policy=image_pull_policy,
        **kwargs
    )
Example #7
def burnham_bigquery_run(
    task_id,
    project_id,
    burnham_test_run,
    burnham_test_scenarios,
    gcp_conn_id=DEFAULT_GCP_CONN_ID,
    gke_location=DEFAULT_GKE_LOCATION,
    gke_cluster_name=DEFAULT_GKE_CLUSTER_NAME,
    gke_namespace=DEFAULT_GKE_NAMESPACE,
    **kwargs,
):
    """Create a new GKEPodOperator that runs the burnham-bigquery Docker image.

    :param str task_id:                 [Required] ID for the task
    :param str project_id:              [Required] Project ID where target table lives
    :param str burnham_test_run:        [Required] UUID for the test run
    :param str burnham_test_scenarios:  [Required] Encoded burnham test scenarios

    :param str gcp_conn_id:             Airflow connection id for GCP access
    :param str gke_location:            GKE cluster location
    :param str gke_cluster_name:        GKE cluster name
    :param str gke_namespace:           GKE cluster namespace
    :param Dict[str, Any] kwargs:       Additional kwargs for GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))

    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image="gcr.io/moz-fx-data-airflow-prod-88e0/burnham-bigquery:latest",
        image_pull_policy="Always",
        arguments=[
            "-vv",
            "--project-id",
            project_id,
            "--run-id",
            burnham_test_run,
            "--scenarios",
            burnham_test_scenarios,
            "--results-table",
            "burnham_derived.test_results_v1",
            "--log-url",
            "{{ task_instance.log_url }}",
            "--start-timestamp",
            "{{ dag_run.start_date.isoformat() }}",
        ],
        **kwargs,
    )
Example #8
def burnham_run(
    task_id,
    burnham_test_run,
    burnham_test_name,
    burnham_missions,
    burnham_spore_drive=None,
    gcp_conn_id=DEFAULT_GCP_CONN_ID,
    gke_location=DEFAULT_GKE_LOCATION,
    gke_cluster_name=DEFAULT_GKE_CLUSTER_NAME,
    gke_namespace=DEFAULT_GKE_NAMESPACE,
    **kwargs,
):
    """Create a new GKEPodOperator that runs the burnham Docker image.

    :param str task_id:                         [Required] ID for the task
    :param str burnham_test_run:                [Required] UUID for the test run
    :param str burnham_test_name:               [Required] Name for the test item
    :param List[str] burnham_missions:          [Required] List of mission identifiers

    :param Optional[str] burnham_spore_drive:   Interface for the spore-drive technology
    :param str gcp_conn_id:                     Airflow connection id for GCP access
    :param str gke_location:                    GKE cluster location
    :param str gke_cluster_name:                GKE cluster name
    :param str gke_namespace:                   GKE cluster namespace
    :param Dict[str, Any] kwargs:               Additional kwargs for GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))

    env_vars = {
        "BURNHAM_PLATFORM_URL": BURNHAM_PLATFORM_URL,
        "BURNHAM_TEST_RUN": burnham_test_run,
        "BURNHAM_TEST_NAME": burnham_test_name,
        "BURNHAM_VERBOSE": "true",
        "GLEAN_LOG_PINGS": "true",
    }

    if burnham_spore_drive is not None:
        env_vars["BURNHAM_SPORE_DRIVE"] = burnham_spore_drive

    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image="gcr.io/moz-fx-data-airflow-prod-88e0/burnham:latest",
        image_pull_policy="Always",
        env_vars=env_vars,
        arguments=burnham_missions,
        **kwargs,
    )
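The two burnham helpers are meant to be chained: burnham_run produces telemetry for a test run, and burnham_bigquery_run verifies it. A hedged sketch follows; the import path, test name, mission identifiers, spore-drive value, and scenario payload are all placeholders.

import uuid
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

from burnham_helpers import burnham_bigquery_run, burnham_run  # hypothetical module

default_args = {"owner": "example@mozilla.com", "start_date": datetime(2020, 6, 1)}

with DAG("example_burnham", default_args=default_args, schedule_interval="@daily") as dag:
    # Generate one UUID per DAG run and share it via XCom so the client run and the
    # verification query refer to the same test run.
    generate_test_run_uuid = PythonOperator(
        task_id="generate_test_run_uuid",
        python_callable=lambda: str(uuid.uuid4()),
    )
    burnham_test_run = '{{ task_instance.xcom_pull("generate_test_run_uuid") }}'

    client_1 = burnham_run(
        task_id="client_1",
        burnham_test_run=burnham_test_run,
        burnham_test_name="test_labeled_counter_metrics",  # placeholder test name
        burnham_missions=["MISSION A: ONE WARP"],          # placeholder mission id
        burnham_spore_drive="tardigrade",                  # placeholder interface
    )

    verify_data = burnham_bigquery_run(
        task_id="verify_data",
        project_id="moz-fx-data-shared-prod",
        burnham_test_run=burnham_test_run,
        burnham_test_scenarios="<encoded scenarios>",      # placeholder payload
    )

    generate_test_run_uuid >> client_1 >> verify_data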
Example #9
def simpleprophet_forecast(
    task_id,
    datasource,
    project_id,
    dataset_id,
    table_id,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="gcr.io/moz-fx-data-forecasting/simpleprophet:latest",
    image_pull_policy="Always",
    **kwargs
):
    """Run all simpleprophet models for the given datasource and model date.

    :param str task_id:              [Required] ID for the task
    :param str datasource:           [Required] One of desktop, mobile, fxa
    :param str project_id:           [Required] ID of project where target table lives
    :param str dataset_id:           [Required] ID of dataset where target table lives
    :param str table_id:             [Required] ID of target table

    :param str gcp_conn_id:          Airflow connection id for GCP access
    :param str gke_location:         GKE cluster location
    :param str gke_cluster_name:     GKE cluster name
    :param str gke_namespace:        GKE cluster namespace
    :param str docker_image:         docker image to use
    :param str image_pull_policy:    Kubernetes policy for when to pull
                                     docker_image
    :param Dict[str, Any] kwargs:    Additional keyword arguments for
                                     GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["{{ds}}"]
        + ["--datasource=" + datasource]
        + ["--project-id=" + project_id]
        + ["--dataset-id=" + dataset_id]
        + ["--table-id=" + table_id],
        image_pull_policy=image_pull_policy,
        **kwargs
    )
Example #10
    objects_prefix,
    "--cluster-by",
    "crash_date",
]

# We remove the current date partition for idempotency.
remove_bq_table_partition = BigQueryTableDeleteOperator(
    task_id="remove_bq_table_partition",
    bigquery_conn_id=bq_gcp_conn_id,
    deletion_dataset_table="{}.{}${{{{ds_nodash}}}}".format(
        bq_dataset, bq_table_name),
    ignore_if_missing=True,
    dag=dag,
)

bq_load = GKEPodOperator(
    task_id="bigquery_load",
    gcp_conn_id=bq_gcp_conn_id,
    project_id=bq_connection.project_id,
    name="load-socorro-crash-parquet-to-bq",
    image=docker_image,
    arguments=gke_args,
    env_vars={
        "GOOGLE_CLOUD_PROJECT": "{{ var.value.gcp_shared_prod_project }}"
    },
    dag=dag,
)

s3_to_gcs >> crash_report_parquet
crash_report_parquet >> remove_bq_table_partition >> bq_load
Example #11
def container_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    server_id,
    env_vars={},
    arguments=[],
    machine_type="n1-standard-1",
    image="mozilla/prio-processor:latest",
    location="us-west1-b",
    owner_label="amiyaguchi",
    team_label="dataeng",
):
    """Run a command on an ephemeral container running the
    `mozilla/prio-processor:latest` image.

    :param str parent_dag_name:         Name of the parent DAG.
    :param str child_dag_name:          Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id:             Name of the connection string.
    :param str service_account:         The address of the service account.
    :param str server_id:               The identifier for the Prio processor
    :param Dict[str, str] env_vars:     Environment variables for configuring
                                        the processor.
    :param List[str] arguments:         The command to run after loading the
                                        image.
    :param str machine_type:            The machine type for running the image.
    :param str image:                   Dockerhub image
    :param str location:                The region of the GKE cluster.
    :param str owner_label:             Label for associating the owner
    :param str team_label:              Label for associating the team
    :return: DAG
    """
    assert server_id in ["a", "b", "admin"]

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    cluster_name = "gke-prio-{}".format(server_id)

    shared_config = {
        "project_id": connection.project_id,
        "gcp_conn_id": gcp_conn_id,
        "location": location,
    }

    with DAG("{}.{}".format(parent_dag_name, child_dag_name),
             default_args=default_args) as dag:
        create_gke_cluster = GKEClusterCreateOperator(
            task_id="create_gke_cluster",
            body=create_gke_config(
                name=cluster_name,
                service_account=service_account,
                owner_label=owner_label,
                team_label=team_label,
                machine_type=machine_type,
                # DataProc clusters require VPC with auto-created subnets
                subnetwork="default" if server_id == "admin" else "gke-subnet",
                is_dev=environ.get("DEPLOY_ENVIRONMENT") == "dev",
            ),
            dag=dag,
            **shared_config)

        # Running the pod without any time in-between will cause the scope-based
        # authentication in Google Cloud Platform to fail. For example:
        #
        # `ServiceException: 401 Anonymous caller does not have
        # storage.objects.get access to moz-fx-prio-dev-a-private/processed/`
        #
        # Sleeping by a small amount solves this problem. This issue was first
        # noticed intermittently on 2019-09-09.
        sleep = BashOperator(task_id="sleep", bash_command="sleep 60", dag=dag)

        run_prio = GKEPodOperator(task_id="processor_{}".format(server_id),
                                  name="run-prio-project-{}".format(server_id),
                                  cluster_name=cluster_name,
                                  namespace="default",
                                  image=image,
                                  arguments=arguments,
                                  env_vars=env_vars,
                                  dag=dag,
                                  **shared_config)

        delete_gke_cluster = GKEClusterDeleteOperator(
            task_id="delete_gke_cluster",
            name=cluster_name,
            trigger_rule="all_done",
            dag=dag,
            **shared_config)

        create_gke_cluster >> sleep >> run_prio >> delete_gke_cluster
        return dag
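Because container_subdag returns a DAG, callers wrap it in a SubDagOperator; a hedged sketch with a hypothetical import path and placeholder connection id, service account, and command:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

from prio_processor_dag import container_subdag  # hypothetical module

default_args = {
    "owner": "example@mozilla.com",
    "start_date": datetime(2020, 1, 1),
    "retries": 1,
    "retry_delay": timedelta(minutes=10),
}

with DAG("example_prio", default_args=default_args, schedule_interval="@weekly") as dag:
    # child_dag_name must match the SubDagOperator task_id so the generated
    # "<parent>.<child>" dag_id lines up.
    processor_a = SubDagOperator(
        subdag=container_subdag(
            parent_dag_name=dag.dag_id,
            child_dag_name="processor_a",
            default_args=default_args,
            gcp_conn_id="google_cloud_prio_a",                         # placeholder connection
            service_account="prio-a@example.iam.gserviceaccount.com",  # placeholder account
            server_id="a",
            arguments=["bin/process"],                                 # placeholder command
            env_vars={"DATA_CONFIG": "/app/config/content.json"},      # placeholder config
        ),
        task_id="processor_a",
    )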
Example #12
    # Cluster autoscaling works on pod resource requests, instead of usage
    resources = {'request_memory':'13312Mi', 'request_cpu': None,
                 'limit_memory':'20480Mi', 'limit_cpu': None, 'limit_gpu': None}

    probe_scraper = GKEPodOperator(
        task_id="probe_scraper",
        name='probe-scraper',
        # Needed to scale the highmem pool from 0 -> 1
        resources=resources,
        # This python job requires 13 GB of memory, thus the highmem node pool
        node_selectors={"nodepool" : "highmem"},
        # Due to the nature of the container run, we set get_logs to False,
        # To avoid urllib3.exceptions.ProtocolError: 'Connection broken: IncompleteRead(0 bytes read)' errors
        # Where the pod continues to run, but airflow loses its connection and sets the status to Failed
        get_logs=False,
        # Give additional time since we will likely always scale up when running this job
        startup_timeout_seconds=360,
        image=probe_scraper_image,
        arguments=probe_scraper_args,
        email=['*****@*****.**', '*****@*****.**', '*****@*****.**', '*****@*****.**'],
        env_vars={
            "AWS_ACCESS_KEY_ID": aws_access_key,
            "AWS_SECRET_ACCESS_KEY": aws_secret_key
        },
        dag=dag)

    schema_generator = GKEPodOperator(
        email=['*****@*****.**'],
        task_id='mozilla_schema_generator',
        name='schema-generator-1',
Example #13
    baseline_etl_kwargs = dict(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location="us-central1-a",
        cluster_name="bq-load-gke-1",
        namespace="default",
        image="mozilla/bigquery-etl:latest",
    )
    baseline_args = [
        "--project-id=moz-fx-data-shared-prod",
        "--date={{ ds }}",
        "--only=*_stable.baseline_v1"
    ]
    baseline_clients_daily = GKEPodOperator(
        task_id='baseline_clients_daily',
        name='baseline-clients-daily',
        arguments=["script/run_glean_baseline_clients_daily"] + baseline_args,
        **baseline_etl_kwargs
    )
    baseline_clients_last_seen = GKEPodOperator(
        task_id='baseline_clients_last_seen',
        name='baseline-clients-last-seen',
        arguments=["script/run_glean_baseline_clients_last_seen"] + baseline_args,
        depends_on_past=True,
        **baseline_etl_kwargs
    )

    (copy_deduplicate_all >>
     baseline_clients_daily >>
     baseline_clients_last_seen)
Example #14
    timedelta(minutes=30),
}

with DAG("public_analysis",
         default_args=default_args,
         schedule_interval="0 1 * * *") as dag:
    # Built from https://github.com/mozilla/forecasting/tree/master/anomdtct
    anomdtct_image = "gcr.io/moz-fx-data-forecasting/anomdtct:latest"

    anomdtct = GKEPodOperator(
        task_id="anomdtct",
        name="anomdtct",
        image=anomdtct_image,
        email=[
            "*****@*****.**",
            "*****@*****.**",
        ],
        arguments=["{{ds}}"] +
        ["--spreadsheet-id=" + Variable.get('anomdtct_spreadsheet_id')] +
        ["--spreadsheet-key=" + Variable.get('anomdtct_spreadsheet_api_key')],
        dag=dag,
    )

    wait_for_clients_first_seen = ExternalTaskSensor(
        task_id="wait_for_clients_first_seen",
        external_dag_id="main_summary",
        external_task_id="clients_first_seen",
        dag=dag,
    )

    anomdtct.set_upstream([
Example #15
with DAG("jetstream", default_args=default_args,
         schedule_interval="0 4 * * *") as dag:

    # Built from repo https://github.com/mozilla/jetstream
    jetstream_image = "gcr.io/moz-fx-data-experiments/jetstream:latest"

    jetstream = GKEPodOperator(
        task_id="jetstream",
        name="jetstream",
        image=jetstream_image,
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        arguments=[
            "run-argo",
            "--date={{ ds }}",
            # the Airflow cluster doesn't have Compute Engine API access so pass in IP
            # and certificate in order for the pod to connect to the Kubernetes cluster
            # running Jetstream
            "--cluster-ip={{ var.value.jetstream_cluster_ip }}",
            "--cluster-cert={{ var.value.jetstream_cluster_cert }}"
        ],
        dag=dag,
    )

    wait_for_clients_daily_export = ExternalTaskSensor(
        task_id="wait_for_clients_daily",
        external_dag_id="bqetl_main_summary",
        external_task_id="telemetry_derived__clients_daily__v6",
        execution_delta=timedelta(hours=2),
Example #16
        "--iso-date={{ ds_nodash }}",
        "--gcp-project=%s" % TAAR_PROFILE_PROJECT_ID,
        "--avro-gcs-bucket=%s" % TAAR_ETL_STORAGE_BUCKET,
        "--bigtable-instance-id=%s" % TAAR_BIGTABLE_INSTANCE_ID,
        "--sample-rate=1.0",
        "--subnetwork=%s" % TAAR_DATAFLOW_SUBNETWORK,
    ]


wipe_gcs_bucket = GKEPodOperator(
    owner="*****@*****.**",
    email=[
        "*****@*****.**", "*****@*****.**", "*****@*****.**"
    ],
    task_id="wipe_taar_gcs_bucket",
    name="wipe_taar_gcs_bucket",
    image="google/cloud-sdk:242.0.0-alpine",
    arguments=wipe_gcs_files(),
    location="us-central1-a",
    cluster_name="bq-load-gke-1",
    dag=taar_weekly,
)

dump_bq_to_tmp_table = GKEPodOperator(
    owner="*****@*****.**",
    email=[
        "*****@*****.**", "*****@*****.**", "*****@*****.**"
    ],
    task_id="dump_bq_to_tmp_table",
    name="dump_bq_to_tmp_table",
    image=TAAR_ETL_CONTAINER_IMAGE,
Example #17
    "email": ["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 0,
    "retry_delay": timedelta(minutes=30),
}

dag = DAG("taar_daily", default_args=default_args, schedule_interval="0 4 * * *")

amodump = GKEPodOperator(
    task_id="taar_amodump",
    name="taar-amodump",
    # This uses a circleci built docker image from github.com/mozilla/taar_gcp_etl
    image="gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.1",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    arguments=["-m", "taar_etl.taar_amodump", "--date", "{{ ds_nodash }}"],
    env_vars={
        "AWS_ACCESS_KEY_ID": taar_aws_access_key,
        "AWS_SECRET_ACCESS_KEY": taar_aws_secret_key,
    },
    dag=dag,
)

amowhitelist = GKEPodOperator(
    task_id="taar_amowhitelist",
    name="taar-amowhitelist",
    # This uses a circleci built docker image from github.com/mozilla/taar_gcp_etl
    image="gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.1",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    # We are extracting addons from the AMO server's APIs which don't
Example #18
s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
    task_id='s3_to_gcs',
    s3_bucket='net-mozaws-data-us-west-2-data-analysis',
    gcs_bucket='moz-fx-data-derived-datasets-blpadi',
    description='blpadi copy from s3 to gcs',
    aws_conn_id='aws_data_iam_blpadi',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions=gcstj_object_conditions,
    transfer_options=gcstj_transfer_options,
    timeout=720,
    dag=blp_dag)

load_blpadi_to_bq = GKEPodOperator(task_id='bigquery_load',
                                   name='load-blpadi-to-bq',
                                   image='google/cloud-sdk:242.0.0-alpine',
                                   arguments=bq_args,
                                   dag=blp_dag)

blp_logs.set_downstream(blp_job_sensor)
blp_job_sensor.set_downstream(s3_to_gcs)
s3_to_gcs.set_downstream(load_blpadi_to_bq)

amo_dag = DAG('mango_log_processing_amo',
              default_args=DEFAULT_ARGS,
              dagrun_timeout=timedelta(hours=6),
              schedule_interval='0 3 * * *')

amo_logs = EmrCreateJobFlowOperator(task_id='amo_create_job_flow',
                                    job_flow_overrides={'Steps': AMO_STEPS},
                                    aws_conn_id='aws_data_iam',
Example #19
         default_args=default_args,
         schedule_interval="0 2 * * *") as dag:

    # Make sure all the data for the given day has arrived before running.
    wait_for_main_ping = ExternalTaskSensor(
        task_id="wait_for_main_ping",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_main_ping",
        execution_delta=timedelta(hours=1),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
        dag=dag,
    )

    # Built from repo https://github.com/mozilla/webrender_intel_win10_nightly
    webrender_ds_283 = GKEPodOperator(
        task_id="webrender_ds_283",
        name="webrender_ds_283",
        image="gcr.io/moz-fx-ds-283/ds_283_prod:latest",
        env_vars=dict(
            BUCKET="gs://moz-fx-ds-283",
            PROJECT_ID="moz-fx-data-shared-prod",
            # source dataset, results are written to the analysis dataset
            DATASET="telemetry",
        ),
        dag=dag,
    )

    wait_for_main_ping >> webrender_ds_283
Example #20
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20191017', # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):

    """ Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job to transfer the AWS S3 parquet data into a GCS Bucket.
    Once that is completed we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name:            parent dag name
    :param dict default_args:              dag configuration
    :param str dataset_s3_bucket:          source S3 Bucket
    :param str dataset_gcs_bucket:         destination GCS Bucket
    :param str aws_conn_id:                airflow connection id for S3 access
    :param str gcp_conn_id:                airflow connection id for GCP access
    :param str dataset:                    dataset name
    :param str dataset_version:            dataset version
    :param str date_submission_col:        dataset date submission column
    :param str ds_type:                    dataset format (ds or ds_nodash)
    :param str gke_location:               GKE cluster zone
    :param str gke_namespace:              GKE cluster namespace
    :param str docker_image:               docker image to use for GKE pod operations # noqa
    :param str bigquery_dataset:           bigquery load destination dataset
    :param str p2b_concurrency:            number of processes for parquet2bigquery load
    :param str p2b_table_alias:            override p2b table name with alias
    :param bool p2b_resume:                allow resume support, defaults to False
    :param bool reprocess:                 enable dataset reprocessing defaults to False
    :param str objects_prefix:             custom objects_prefix to override defaults
    :param str spark_gs_dataset_location:  custom spark dataset load location to override defaults
    :param List[str] cluster_by:           top level fields to cluster by when creating destination table
    :param List[str] drop:                 top level fields to exclude from destination table
    :param Dict[str, str] rename:          top level fields to rename in destination table
    :param List[str] replace:              top level field replacement expressions

    :return: airflow.models.DAG
    """

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)
    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes': _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
        ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]

    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

    bq_table_name = p2b_table_alias or normalize_table_id('_'.join([dataset,
                                                                   dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if dataset_s3_bucket is not None:
            s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
                task_id='s3_to_gcs',
                s3_bucket=dataset_s3_bucket,
                gcs_bucket=gcs_buckets['transfer'],
                description=_objects_prefix,
                aws_conn_id=aws_conn_id,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                object_conditions=gcstj_object_conditions,
                transfer_options=gcstj_transfer_options,
            )
        else:
            s3_to_gcs = DummyOperator(task_id='no_s3_to_gcs')

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bigquery_dataset, bq_table_name), # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
            )

        s3_to_gcs >> reprocess >> remove_bq_table >> bulk_load

        return dag
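As the docstring notes, this factory is used with SubDagOperator; a hedged caller sketch (assumed import path, placeholder bucket and dataset names):

from datetime import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

from utils.gcp import load_to_bigquery  # assumed import path

default_args = {"owner": "example@mozilla.com", "start_date": datetime(2019, 11, 1)}

with DAG("example_parquet_export", default_args=default_args,
         schedule_interval="0 1 * * *") as dag:
    # dag_name must match the SubDagOperator task_id so the generated
    # "<parent>.<dag_name>" dag_id lines up.
    example_bigquery_load = SubDagOperator(
        subdag=load_to_bigquery(
            parent_dag_name=dag.dag_id,
            dag_name="example_bigquery_load",
            default_args=default_args,
            dataset_s3_bucket="example-parquet-bucket",  # placeholder bucket
            aws_conn_id="aws_dev_iam_s3",
            dataset="example_dataset",                   # placeholder dataset
            dataset_version="v1",
            gke_cluster_name="bq-load-gke-1",
            bigquery_dataset="telemetry_derived",
            cluster_by=["sample_id"],                    # placeholder clustering field
        ),
        task_id="example_bigquery_load",
    )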
Example #21
    'crash_date'
]

# We remove the current date partition for idempotency.
remove_bq_table_partition = BigQueryTableDeleteOperator(
    task_id='remove_bq_table_partition',
    bigquery_conn_id=bq_gcp_conn_id,
    deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(
        bq_dataset, bq_table_name),  # noqa
    ignore_if_missing=True,
    dag=dag)

bq_load = GKEPodOperator(
    task_id='bigquery_load',
    gcp_conn_id=bq_gcp_conn_id,
    project_id=bq_connection.project_id,
    name='load-socorro-crash-parquet-to-bq',
    image=docker_image,
    arguments=gke_args,
    dag=dag,
)

register_status(
    bq_load,
    "Socorro Crash Reports Parquet",
    "Convert processed crash reports into parquet for analysis",
)

s3_to_gcs >> crash_report_parquet
crash_report_parquet >> remove_bq_table_partition >> bq_load
Example #22
    "email_on_retry": True,
    "retries": 2,
}

with DAG(
    "bqetl_public_data_json", default_args=default_args, schedule_interval="0 4 * * *"
) as dag:
    docker_image = "mozilla/bigquery-etl:latest"

    export_public_data_json_telemetry_derived__ssl_ratios__v1 = GKEPodOperator(
        task_id="export_public_data_json_telemetry_derived__ssl_ratios__v1",
        name="export_public_data_json_telemetry_derived__ssl_ratios__v1",
        arguments=["script/publish_public_data_json"]
        + [
            "--query_file=sql/moz-fx-data-shared-prod/telemetry_derived/ssl_ratios_v1/query.sql"
        ]
        + ["--destination_table=ssl_ratios${{ds_nodash}}"]
        + ["--dataset_id=telemetry_derived"]
        + ["--project_id=moz-fx-data-shared-prod"]
        + ["--parameter=submission_date:DATE:{{ds}}"],
        image=docker_image,
        dag=dag,
    )

    wait_for_telemetry_derived__ssl_ratios__v1 = ExternalTaskSensor(
        task_id="wait_for_telemetry_derived__ssl_ratios__v1",
        external_dag_id="bqetl_ssl_ratios",
        external_task_id="telemetry_derived__ssl_ratios__v1",
        execution_delta=datetime.timedelta(seconds=7200),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
        dag=dag,
    )

Example #23
    fission_monitoring_crash_v1 = bigquery_etl_query(
        task_id="fission_monitoring_crash_v1",
        project_id="moz-fx-data-shared-prod",
        destination_table="fission_monitoring_crash_v1",
        dataset_id="telemetry_derived",
    )

    # Built from https://github.com/mozilla/fission_monitoring_nightly
    fission_aggregation_for_dashboard = GKEPodOperator(
        task_id="fission_aggregation_for_dashboard",
        name="fission_aggregation_for_dashboard",
        image="gcr.io/moz-fx-data-airflow-prod-88e0/fission-monitoring:latest",
        env_vars=dict(
            BQ_BILLING_PROJECT_ID="moz-fx-data-shared-prod",
            BQ_INPUT_MAIN_TABLE="moz-fx-data-shared-prod.telemetry_derived.fission_monitoring_main_v1",
            BQ_INPUT_CRASH_TABLE="moz-fx-data-shared-prod.telemetry_derived.fission_monitoring_crash_v1",
            BQ_OUTPUT_TABLE="moz-fx-data-shared-prod.analysis.fission_monitoring_analyzed_v1",
            GCS_BUCKET="fission-experiment-monitoring-dashboard",
        ),
        image_pull_policy="Always",
        dag=dag,
    )

    wait_for_copy_deduplicate_main_ping >> fission_monitoring_main_v1
    wait_for_copy_deduplicate_crash_ping >> fission_monitoring_crash_v1
    [fission_monitoring_main_v1, fission_monitoring_crash_v1] >> fission_aggregation_for_dashboard
Example #24
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2019, 12, 26),
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG("clean-gke-pods",
          default_args=default_args,
          schedule_interval="@daily",
          doc_md=docs)

docker_image = 'gcr.io/moz-fx-data-airflow-prod-88e0/gke-pod-clean:1.3'
gke_cluster_name = 'bq-load-gke-1'
gke_location = 'us-central1-a'

docker_args = [
    '--project', 'moz-fx-data-derived-datasets', '--gke-cluster',
    gke_cluster_name, '--region', gke_location, '--retention-days', '2'
]

clean_gke_pods = GKEPodOperator(task_id="clean-gke-pods",
                                name='clean-gke-pods',
                                image=docker_image,
                                arguments=docker_args,
                                dag=dag)
Example #25
              default_args=DEFAULT_ARGS,
              dagrun_timeout=timedelta(hours=6),
              schedule_interval='@weekly')

create_gke_cluster = GKEClusterCreateOperator(task_id='create_gke_cluster',
                                              project_id=connection.project_id,
                                              location='us-west1-b',
                                              gcp_conn_id=gcp_conn_id,
                                              body=cluster_def,
                                              dag=gke_dag)

run_prio = GKEPodOperator(task_id='run_prio_a',
                          gcp_conn_id=gcp_conn_id,
                          project_id=connection.project_id,
                          location='us-west1-b',
                          cluster_name=cluster_name,
                          name='run-prio-project-a',
                          namespace='default',
                          image='mozilla/python-libprio:latest',
                          arguments=['scripts/test-cli-integration'],
                          dag=gke_dag)

delete_gke_cluster = GKEClusterDeleteOperator(task_id='delete_gke_cluster',
                                              project_id=connection.project_id,
                                              location='us-west1-b',
                                              name=cluster_name,
                                              gcp_conn_id=gcp_conn_id,
                                              dag=gke_dag)

create_gke_cluster.set_downstream(run_prio)
run_prio.set_downstream(delete_gke_cluster)
Example #26
insert_args = [
    'bq',
    '--location=US',
    'query',
    '--replace',
    '--destination_table',
    'moz-fx-data-derived-datasets:blpadi.adi_dimensional_by_date${{ ds_nodash }}',
    '--use_legacy_sql=false',
    "select tot_requests_on_date, _year_quarter, bl_date, product, v_prod_major, prod_os, v_prod_os, channel, locale, continent_code, cntry_code, distro_name, distro_version from blpadi.adi_dim_backfill where bl_date = '{{ ds }}'",
]

load_bq_to_tmp_tbl = GKEPodOperator(task_id='bq_load_tmp_tbl',
                                    gcp_conn_id=gcp_conn_id,
                                    project_id=connection.project_id,
                                    location='us-central1-a',
                                    cluster_name='bq-load-gke-1',
                                    name='bq-load-tmp-tbl',
                                    namespace='default',
                                    image='google/cloud-sdk:242.0.0-alpine',
                                    arguments=load_args,
                                    dag=blp_dag)

select_insert_into_final_table = GKEPodOperator(
    task_id='bigquery_insert_final_table',
    gcp_conn_id=gcp_conn_id,
    project_id='moz-fx-data-derived-datasets',
    location='us-central1-a',
    cluster_name='bq-load-gke-1',
    name='bq-query-insert-final-tbl',
    namespace='default',
    image='google/cloud-sdk:242.0.0-alpine',
    arguments=insert_args,
Example #27
}

dag = DAG("taar_amodump",
          default_args=default_args,
          schedule_interval="@daily")

amodump = GKEPodOperator(
    task_id="taar_amodump",
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    location="us-central1-a",
    cluster_name=gke_cluster_name,
    name="taar-amodump",
    namespace="default",
    # This uses a circleci built docker image from github.com/mozilla/taar_gcp_etl
    image="gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.1",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    arguments=["-m", "taar_etl.taar_amodump", "--date", "{{ ds_nodash }}"],
    env_vars={
        "AWS_ACCESS_KEY_ID": aws_access_key,
        "AWS_SECRET_ACCESS_KEY": aws_secret_key,
    },
    dag=dag,
)

amowhitelist = GKEPodOperator(
    task_id="taar_amowhitelist",
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    location="us-central1-a",
Example #28
    "retries": 2,
    "retry_delay": timedelta(minutes=30),
}

with DAG("pensieve", default_args=default_args,
         schedule_interval="0 1 * * *") as dag:

    # Built from repo https://github.com/mozilla/pensieve
    pensieve_image = "gcr.io/moz-fx-data-experiments/pensieve:latest"

    pensieve = GKEPodOperator(
        task_id="pensieve",
        name="pensieve",
        image=pensieve_image,
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        arguments=["--date={{ds}}"],
        dag=dag,
    )

    wait_for_clients_daily_export = ExternalTaskSensor(
        task_id="wait_for_clients_daily_export",
        external_dag_id="main_summary",
        external_task_id="clients_daily_export",
        dag=dag,
    )

    wait_for_main_summary_export = ExternalTaskSensor(
        task_id="wait_for_main_summary_export",
Example #29
    task_id='s3_to_gcs',
    s3_bucket='net-mozaws-data-us-west-2-data-analysis',
    gcs_bucket='moz-fx-data-derived-datasets-blpadi',
    description='blpadi copy from s3 to gcs',
    aws_conn_id='aws_data_iam_blpadi',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions=gcstj_object_conditions,
    transfer_options=gcstj_transfer_options,
    dag=blp_dag)

load_blpadi_to_bq = GKEPodOperator(task_id='bigquery_load',
                                   gcp_conn_id=gcp_conn_id,
                                   project_id=connection.project_id,
                                   location='us-central1-a',
                                   cluster_name='bq-load-gke-1',
                                   name='load-blpadi-to-bq',
                                   namespace='default',
                                   image='google/cloud-sdk:242.0.0-alpine',
                                   arguments=bq_args,
                                   dag=blp_dag)

blp_logs.set_downstream(blp_job_sensor)
blp_job_sensor.set_downstream(s3_to_gcs)
s3_to_gcs.set_downstream(load_blpadi_to_bq)

amo_dag = DAG('mango_log_processing_amo',
              default_args=DEFAULT_ARGS,
              dagrun_timeout=timedelta(hours=6),
              schedule_interval='0 3 * * *')

amo_logs = EmrCreateJobFlowOperator(task_id='amo_create_job_flow',
Example #30
    dataset_id="telemetry_derived",
    dag=dag,
)

user_activity_usage_behavior_export = GKEPodOperator(
    task_id="user_activity_export",
    name="user_activity_export",
    image="gcr.io/moz-fx-data-airflow-prod-88e0/firefox-public-data-report-etl:latest",
    arguments=[
        "-m",
        "public_data_report.cli",
        "user_activity",
        "--bq_table",
        "moz-fx-data-shared-prod.telemetry_derived.public_data_report_user_activity_v1",
        "--s3_bucket",
        "telemetry-public-analysis-2",
        "--s3_path",
        "public-data-report/user_activity",
    ],
    env_vars={
        "AWS_ACCESS_KEY_ID": aws_access_key,
        "AWS_SECRET_ACCESS_KEY": aws_secret_key,
    },
    image_pull_policy="Always",
    dag=dag,
)

annotations_export = GKEPodOperator(
    task_id="annotations_export",
    name="annotations_export",