Example #1
def get_dataproc_parameters(conn_id="google_cloud_airflow_dataproc"):
    """This function can be used to gather parameters that correspond to development
    parameters. The provided connection string should be a Google Cloud connection
    and should either be the production default ("dataproc-runner-prod"), or a
    service key associated with a sandbox account.
    """
    gcp_conn = GoogleCloudBaseHook(conn_id)
    keyfile = json.loads(
        gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"])

    project_id = keyfile["project_id"]
    is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
    client_email = (
        keyfile["client_email"] if is_dev else
        "*****@*****.**")
    artifact_bucket = ("{}-dataproc-artifacts".format(project_id) if is_dev
                       else "moz-fx-data-prod-airflow-dataproc-artifacts")
    storage_bucket = ("{}-dataproc-scratch".format(project_id)
                      if is_dev else "moz-fx-data-prod-dataproc-scratch")
    output_bucket = (artifact_bucket
                     if is_dev else "moz-fx-data-derived-datasets-parquet")
    return DataprocParameters(conn_id, project_id, is_dev, client_email,
                              artifact_bucket, storage_bucket, output_bucket)
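A minimal usage sketch, assuming the "google_cloud_airflow_dataproc" connection exists and that DataprocParameters is a simple container whose fields match the constructor call above:

params = get_dataproc_parameters("google_cloud_airflow_dataproc")

# pick buckets, project, and service account based on the resolved environment
cluster_kwargs = dict(
    project_id=params.project_id,
    artifact_bucket=params.artifact_bucket,
    storage_bucket=params.storage_bucket,
    service_account=params.client_email,
)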
Example #2
def bigquery_xcom_query(
    destination_table,
    dataset_id,
    xcom_task_id,
    parameters=(),
    arguments=(),
    project_id=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    image_pull_policy="Always",
    date_partition_parameter="submission_date",
    **kwargs
):
    """ Generate a GKEPodOperator which runs an xcom result as a bigquery query.

    :param str destination_table:                  [Required] BigQuery destination table
    :param str dataset_id:                         [Required] BigQuery default dataset id
    :param str xcom_task_id:                       [Required] task_id which generated the xcom to pull
    :param Tuple[str] parameters:                  Parameters passed to bq query
    :param Tuple[str] arguments:                   Additional bq query arguments
    :param Optional[str] project_id:               BigQuery default project id
    :param str gcp_conn_id:                        Airflow connection id for GCP access
    :param str gke_location:                       GKE cluster location
    :param str gke_cluster_name:                   GKE cluster name
    :param str gke_namespace:                      GKE cluster namespace
    :param str docker_image:                       docker image to use
    :param str image_pull_policy:                  Kubernetes policy for when to pull
                                                   docker_image
    :param Optional[str] date_partition_parameter: Parameter for indicating destination
                                                   partition to generate, if None
                                                   destination should be whole table
                                                   rather than partition
    :param Dict[str, Any] kwargs:                  Additional keyword arguments for
                                                   GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    if destination_table is not None and date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}",)
    query = "{{ " + "task_instance.xcom_pull({!r})".format(xcom_task_id) + " }}"
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["bq"]
        + ["query"]
        + (["--destination_table=" + destination_table] if destination_table else [])
        + ["--dataset_id=" + dataset_id]
        + (["--project_id=" + project_id] if project_id else [])
        + ["--parameter=" + parameter for parameter in parameters]
        + list(arguments)
        + [query],
        image_pull_policy=image_pull_policy,
        **kwargs
    )
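A hedged usage sketch: inside an existing DAG, run the SQL pushed to XCom by an upstream task. The task id "generate_query" and the table/dataset names are placeholders.

run_experiment_query = bigquery_xcom_query(
    destination_table="experiment_search_aggregates_v1",
    dataset_id="telemetry_derived",
    xcom_task_id="generate_query",
    dag=dag,  # forwarded to GKEPodOperator through **kwargs
)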
Example #3
def bigquery_etl_copy_deduplicate(
    task_id,
    target_project_id,
    only_tables=None,
    except_tables=None,
    parallelism=4,
    priority="INTERACTIVE",
    hourly=False,
    slices=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    image_pull_policy="Always",
    **kwargs
):
    """ Copy a day's data from live ping tables to stable ping tables,
    deduplicating on document_id.

    :param str task_id:              [Required] ID for the task
    :param str target_project_id:    [Required] ID of project where target tables live
    :param Tuple[str] only_tables:   Only process tables matching the given globs of form 'telemetry_live.main_v*'
    :param Tuple[str] except_tables: Process all tables except those matching the given globs
    :param int parallelism:          Maximum number of queries to execute concurrently
    :param str priority:             BigQuery query priority to use, must be BATCH or INTERACTIVE
    :param bool hourly:              Alias for --slices=24
    :param int slices:               Number of time-based slices to deduplicate in, rather than for whole days at once
    :param str gcp_conn_id:          Airflow connection id for GCP access
    :param str gke_location:         GKE cluster location
    :param str gke_cluster_name:     GKE cluster name
    :param str gke_namespace:        GKE cluster namespace
    :param str docker_image:         docker image to use
    :param str image_pull_policy:    Kubernetes policy for when to pull
                                     docker_image
    :param Dict[str, Any] kwargs:    Additional keyword arguments for
                                     GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    table_qualifiers = []
    if only_tables:
        table_qualifiers.append('--only')
        table_qualifiers += only_tables
    if except_tables:
        table_qualifiers.append('--except')
        table_qualifiers += except_tables
    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/copy_deduplicate"]
        + ["--project-id=" + target_project_id]
        + ["--date={{ds}}"]
        + ["--parallelism={}".format(parallelism)]
        + ["--priority={}".format(priority)]
        + (["--hourly"] if hourly else [])
        + (["--slices={}".format(slices)] if slices is not None else [])
        + table_qualifiers,
        image_pull_policy=image_pull_policy,
        **kwargs
    )
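A usage sketch with values borrowed from elsewhere on this page ("moz-fx-data-shared-prod", "telemetry_live.main_v*"); adjust to the target project and tables.

copy_deduplicate_main_ping = bigquery_etl_copy_deduplicate(
    task_id="copy_deduplicate_main_ping",
    target_project_id="moz-fx-data-shared-prod",
    only_tables=["telemetry_live.main_v*"],
    parallelism=24,
    dag=dag,  # forwarded to GKEPodOperator through **kwargs
)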
Example #4
def bigquery_etl_query(
    destination_table,
    dataset_id,
    parameters=(),
    arguments=(),
    project_id=None,
    sql_file_path=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    image_pull_policy="Always",
    date_partition_parameter="submission_date",
    multipart=False,
    **kwargs
):
    """ Generate.

    :param str destination_table:                  [Required] BigQuery destination table
    :param str dataset_id:                         [Required] BigQuery default dataset id
    :param Tuple[str] parameters:                  Parameters passed to bq query
    :param Tuple[str] arguments:                   Additional bq query arguments
    :param Optional[str] project_id:               BigQuery default project id
    :param Optional[str] sql_file_path:            Optional override for path to the
                                                   SQL query file to run
    :param str gcp_conn_id:                        Airflow connection id for GCP access
    :param str gke_location:                       GKE cluster location
    :param str gke_cluster_name:                   GKE cluster name
    :param str gke_namespace:                      GKE cluster namespace
    :param str docker_image:                       docker image to use
    :param str image_pull_policy:                  Kubernetes policy for when to pull
                                                   docker_image
    :param Optional[str] date_partition_parameter: Parameter for indicating destination
                                                   partition to generate, if None
                                                   destination should be whole table
                                                   rather than partition
    :param Dict[str, Any] kwargs:                  Additional keyword arguments for
                                                   GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    sql_file_path = sql_file_path or "sql/{}/{}/query.sql".format(dataset_id, destination_table)
    if destination_table is not None and date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}",)
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/run_multipart_query" if multipart else "query"]
        + (["--destination_table=" + destination_table] if destination_table else [])
        + ["--dataset_id=" + dataset_id]
        + (["--project_id=" + project_id] if project_id else [])
        + ["--parameter=" + parameter for parameter in parameters]
        + list(arguments)
        + [sql_file_path],
        image_pull_policy=image_pull_policy,
        **kwargs
    )
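A usage sketch with placeholder table names; by default the SQL is read from sql/<dataset_id>/<destination_table>/query.sql inside the bigquery-etl image, and the result lands in the {{ds_nodash}} date partition.

clients_daily = bigquery_etl_query(
    destination_table="clients_daily_v6",
    dataset_id="telemetry_derived",
    project_id="moz-fx-data-shared-prod",
    dag=dag,  # forwarded to GKEPodOperator through **kwargs
)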
Example #5
def export_to_parquet(
    table,
    destination_table=None,
    static_partitions=[],
    arguments=[],
    use_storage_api=False,
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_workers=2,
    num_preemptible_workers=0,
    gcs_output_bucket="moz-fx-data-derived-datasets-parquet",
    s3_output_bucket="telemetry-parquet",
):

    """ Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table:                             [Required] BigQuery table name
    :param Optional[str] destination_table:       Output table name, defaults to table,
                                                  will have r'_v[0-9]+$' replaced with
                                                  r'/v[0-9]+'
    :param List[str] static_partitions:           Static partition values ('key=value')
                                                  appended to the output path
    :param List[str] arguments:                   Additional pyspark arguments
    :param bool use_storage_api:                  Whether to read from the BigQuery
                                                  Storage API or an AVRO export
    :param str dag_name:                          Name of DAG
    :param Optional[str] parent_dag_name:         Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str gcp_conn_id:                       Airflow connection id for GCP access
    :param str dataproc_storage_bucket:           Dataproc staging GCS bucket
    :param str dataproc_zone:                     GCP zone to launch dataproc clusters
    :param int num_preemptible_workers:           Number of Dataproc preemptible workers

    :return: airflow.models.DAG
    """

    # remove the dataset prefix and partition suffix from table
    table_id = table.rsplit(".", 1)[-1]
    unqualified_table, _, partition_id = table_id.partition("$")
    # limit cluster name to 35 characters plus suffix of -export-YYYYMMDD (51 total)
    cluster_name = unqualified_table.replace("_", "-")
    if len(cluster_name) > 35:
        # preserve version when truncating cluster name to 35 characters
        prefix, version = re.match(r"(.*?)(-v[0-9]+)?$", cluster_name).groups("")
        cluster_name = prefix[:35 - len(version)] + version
    cluster_name += "-export-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    if destination_table is None:
        destination_table = unqualified_table
    # separate version using "/" instead of "_"
    export_prefix = re.sub(r"_(v[0-9]+)$", r"/\1", destination_table) + "/"
    if static_partitions:
        export_prefix += "/".join(static_partitions) + "/"
    avro_prefix = "avro/" + export_prefix
    if not static_partitions and partition_id:
        avro_prefix += "partition_id=" + partition_id + "/"
    avro_path = "gs://" + gcs_output_bucket + "/" + avro_prefix + "*.avro"

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            num_workers=num_workers,
            image_version="1.4",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                "gs://dataproc-initialization-actions/python/pip-install.sh",
            ],
            metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"},
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
            ],
            dataproc_pyspark_properties={
                "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
            },
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table]
            + [
                "--" + key + "=" + value
                for key, value in {
                    "avro-path": (not use_storage_api) and avro_path,
                    "destination": "gs://" + gcs_output_bucket,
                    "destination-table": destination_table,
                }.items()
                if value
            ]
            + (["--static-partitions"] if static_partitions else [])
            # each static partition value is passed as its own argument
            + static_partitions
            + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        gcs_to_s3 = DataProcHadoopOperatorWithAws(
            task_id="gcs_to_s3",
            main_jar="file:///usr/lib/hadoop-mapreduce/hadoop-distcp.jar",
            arguments=[
                "-update",
                "-delete",
                "gs://{}/{}".format(gcs_output_bucket, export_prefix),
                "s3a://{}/{}".format(s3_output_bucket, export_prefix),
            ],
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            aws_conn_id=aws_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        if not use_storage_api:
            avro_export = BigQueryToCloudStorageOperator(
                task_id="avro_export",
                source_project_dataset_table=table,
                destination_cloud_storage_uris=avro_path,
                compression=None,
                export_format="AVRO",
                bigquery_conn_id=gcp_conn_id,
            )
            avro_delete = GoogleCloudStorageDeleteOperator(
                task_id="avro_delete",
                bucket_name=gcs_output_bucket,
                prefix=avro_prefix,
                gcp_conn_id=gcp_conn_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
            )
            avro_export >> run_dataproc_pyspark >> avro_delete

        create_dataproc_cluster >> run_dataproc_pyspark >> gcs_to_s3
        gcs_to_s3 >> delete_dataproc_cluster

        return dag
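Factories like this are normally attached to a parent DAG with SubDagOperator; a sketch assuming dag and default_args already exist, with an illustrative table and static partition:

export_main_summary = SubDagOperator(
    task_id="export_main_summary",
    dag=dag,
    subdag=export_to_parquet(
        table="telemetry_derived.main_summary_v4${{ds_nodash}}",
        static_partitions=["submission_date_s3={{ds_nodash}}"],
        parent_dag_name=dag.dag_id,
        dag_name="export_main_summary",
        default_args=default_args,
        num_preemptible_workers=10,
    ),
)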
Example #6
def spark_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    main,
    pyfiles,
    arguments,
    dataproc_zone="us-west1-a",
    num_preemptible_workers=10,
):
    """Run the PySpark job for unnesting and range-partitioning Prio pings from
    the ingestion service.

    :param str parent_dag_name:         Name of the parent DAG.
    :param str child_dag_name:          Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id:             Name of the connection string.
    :param str service_account:         The address of the service account.
    :param str dataproc_zone:           The zone of the Dataproc cluster.
    :param str main:                    URI of the PySpark main module to run.
    :param List[str] pyfiles:           URIs of additional Python files needed
                                        by the job.
    :param List[str] arguments:         Arguments passed to the PySpark job.
    :param int num_preemptible_workers: The number of preemptible workers.
    :return: DAG
    """

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    shared_config = {
        "cluster_name": "prio-staging",
        "gcp_conn_id": gcp_conn_id,
        "project_id": connection.project_id,
    }

    with DAG("{}.{}".format(parent_dag_name, child_dag_name),
             default_args=default_args) as dag:
        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            num_workers=2,
            image_version="1.4",
            zone=dataproc_zone,
            service_account=service_account,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
            metadata={"PIP_PACKAGES": "click jsonschema gcsfs==0.2.3"},
            init_actions_uris=[
                "gs://dataproc-initialization-actions/python/pip-install.sh"
            ],
            dag=dag,
            **shared_config)

        run_dataproc_spark = DataProcPySparkOperator(
            task_id="run_dataproc_spark",
            main=main,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
            ],
            pyfiles=pyfiles,
            arguments=arguments,
            dag=dag,
            **shared_config)

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            trigger_rule="all_done",
            dag=dag,
            **shared_config)
        create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster
        return dag
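A sketch of wiring this subdag into a parent DAG; the connection id, service account, and GCS paths are placeholders.

prio_staging = SubDagOperator(
    task_id="staging",
    dag=dag,
    subdag=spark_subdag(
        parent_dag_name=dag.dag_id,
        child_dag_name="staging",
        default_args=default_args,
        gcp_conn_id="google_cloud_prio_admin",
        service_account="prio-runner@example-project.iam.gserviceaccount.com",
        main="gs://example-bucket/processor-origin.py",
        pyfiles=["gs://example-bucket/prio_processor.egg"],
        arguments=["--date", "{{ ds }}"],
    ),
)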
Example #7
                                    emr_conn_id='emr_data_iam_mango',
                                    dag=blp_dag)

blp_job_sensor = EmrJobFlowSensor(
    task_id='blp_check_job_flow',
    job_flow_id=
    "{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}",
    aws_conn_id='aws_data_iam',
    dag=blp_dag,
    on_retry_callback=lambda context: blp_dag.clear(
        start_date=context['execution_date'],
        end_date=context['execution_date']),
)

gcp_conn_id = "google_cloud_derived_datasets"
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

gcstj_object_conditions = {'includePrefixes': 'blpadi/{{ ds }}'}

gcstj_transfer_options = {'deleteObjectsUniqueInSink': True}

bq_args = [
    'bq',
    '--location=US',
    'load',
    '--source_format=CSV',
    '--skip_leading_rows=0',
    '--replace',
    "--field_delimiter=\001",
    'blpadi.adi_dimensional_by_date${{ ds_nodash }}',
    'gs://moz-fx-data-derived-datasets-blpadi/blpadi/{{ ds }}/*',
Example #8
    3,
    "retry_delay":
    timedelta(minutes=30),
}

dag = DAG(
    "prerelease_telemetry_aggregates",
    default_args=default_args,
    schedule_interval="@daily",
)

subdag_args = default_args.copy()
subdag_args["retries"] = 0

task_id = "prerelease_telemetry_aggregate_view_dataproc"
gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc")
keyfile = json.loads(
    gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"])
project_id = keyfile["project_id"]

is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
client_email = (
    keyfile["client_email"] if is_dev else
    "*****@*****.**")
artifact_bucket = ("{}-dataproc-artifacts".format(project_id) if is_dev else
                   "moz-fx-data-prod-airflow-dataproc-artifacts")
storage_bucket = ("{}-dataproc-scratch".format(project_id)
                  if is_dev else "moz-fx-data-prod-dataproc-scratch")

prerelease_telemetry_aggregate_view_dataproc = SubDagOperator(
    task_id=task_id,
Example #9
    def __init__(
        self,
        cluster_name=None,
        num_workers=2,
        image_version='1.4',
        zone='us-west1-b',
        idle_delete_ttl='14400',
        auto_delete_ttl='28800',
        master_machine_type='n1-standard-8',
        worker_machine_type='n1-standard-4',
        num_preemptible_workers=0,
        service_account='*****@*****.**',
        init_actions_uris=None,
        additional_metadata=None,
        additional_properties=None,
        optional_components=['ANACONDA'],
        install_component_gateway=True,
        aws_conn_id=None,
        gcp_conn_id='google_cloud_airflow_dataproc',
        artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts',
        storage_bucket='moz-fx-data-prod-dataproc-scratch',
    ):

        self.cluster_name = cluster_name
        self.num_workers = num_workers
        self.image_version = image_version
        self.zone = zone
        self.idle_delete_ttl = idle_delete_ttl
        self.auto_delete_ttl = auto_delete_ttl
        self.master_machine_type = master_machine_type
        self.worker_machine_type = worker_machine_type
        self.num_preemptible_workers = num_preemptible_workers
        self.service_account = service_account
        # The bucket with a default dataproc init script
        self.artifact_bucket = artifact_bucket
        self.storage_bucket = storage_bucket

        if init_actions_uris is None:
            self.init_actions_uris = [
                'gs://{}/bootstrap/dataproc_init.sh'.format(
                    self.artifact_bucket)
            ]
        else:
            self.init_actions_uris = init_actions_uris

        if additional_metadata is None:
            self.additional_metadata = {}
        else:
            self.additional_metadata = additional_metadata

        if additional_properties is None:
            self.additional_properties = {}
        else:
            self.additional_properties = additional_properties

        self.optional_components = optional_components
        self.install_component_gateway = install_component_gateway
        self.aws_conn_id = aws_conn_id
        self.gcp_conn_id = gcp_conn_id

        self.connection = GoogleCloudBaseHook(gcp_conn_id=self.gcp_conn_id)
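The class this __init__ belongs to is not shown in the snippet; purely for illustration, assume it is called DataprocCluster. Constructing the configuration object would then look like:

# "DataprocCluster" is a hypothetical stand-in for the snippet's class name;
# the keyword arguments are the ones defined in __init__ above.
cluster = DataprocCluster(
    cluster_name="example-cluster-{{ ds_nodash }}",
    num_workers=4,
    worker_machine_type="n1-standard-8",
    gcp_conn_id="google_cloud_airflow_dataproc",
)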
Example #10
def export_to_parquet(
    table,
    arguments=[],
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_preemptible_workers=0,
):

    """ Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table:                             [Required] BigQuery table name
    :param List[str] arguments:                   Additional pyspark arguments
    :param str dag_name:                          Name of DAG
    :param Optional[str] parent_dag_name:         Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str gcp_conn_id:                       Airflow connection id for GCP access
    :param str dataproc_storage_bucket:           Dataproc staging GCS bucket
    :param str dataproc_zone:                     GCP zone to launch dataproc clusters
    :param int num_preemptible_workers:           Number of Dataproc preemptible workers

    :return: airflow.models.DAG
    """

    # limit cluster name to 42 characters then suffix with -YYYYMMDD
    cluster_name = table.replace("_", "-")
    if len(cluster_name) > 42:
        if cluster_name.rsplit("-v", 1)[-1].isdigit():
            prefix, version = cluster_name.rsplit("-v", 1)
            cluster_name = prefix[:40 - len(version)] + "-v" + version
        else:
            cluster_name = cluster_name[:42]
    cluster_name += "-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
    properties = {
        "core:fs.s3a." + key: value
        for key, value in zip(
            ("access.key", "secret.key", "session.token"),
            AwsHook(aws_conn_id).get_credentials(),
        )
        if value is not None
    }

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            properties=properties,
            num_workers=2,
            image_version="1.3",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://mozilla-bigquery-etl/jars/spark-bigquery-0.5.1-beta-SNAPSHOT.jar"
            ],
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table] + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        create_dataproc_cluster >> run_dataproc_pyspark >> delete_dataproc_cluster

        return dag
Example #11
def container_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    server_id,
    env_vars={},
    arguments=[],
    machine_type="n1-standard-1",
    image="mozilla/prio-processor:v3.0.1",
    location="us-west1-a",
    owner_label="amiyaguchi",
    team_label="dataeng",
    **kwargs,
):
    """Run a command on an ephemeral container running the
    `mozilla/prio-processor:latest` image.

    :param str parent_dag_name:         Name of the parent DAG.
    :param str child_dag_name:          Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id:             Name of the connection string.
    :param str service_account:         The address of the service account.
    :param str server_id:               The identifier for the Prio processor
    :param Dict[str, str] env_vars:     Environment variables for configuring
                                        the processor.
    :param List[str] arguments:         The command to run after loading the
                                        image.
    :param str machine_type:            The machine type for running the image.
    :param str image:                   Dockerhub image
    :param str location:                The region of the GKE cluster.
    :param str owner_label:             Label for associating the owner
    :param str team_label:              Label for associating the team
    :return: DAG
    """
    assert server_id in ["a", "b", "admin"]

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    cluster_name = f"gke-prio-{server_id}"

    shared_config = {
        "project_id": connection.project_id,
        "gcp_conn_id": gcp_conn_id,
        "location": location,
    }

    with DAG(f"{parent_dag_name}.{child_dag_name}", default_args=default_args) as dag:
        # https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator#kubernetespodoperator_configuration
        # https://medium.com/google-cloud/scale-your-kubernetes-cluster-to-almost-zero-with-gke-autoscaler-9c78051cbf40
        # https://docs.openshift.com/container-platform/3.6/admin_guide/scheduling/pod_affinity.html
        # https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/
        # https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator
        # https://airflow.apache.org/docs/stable/_api/airflow/contrib/operators/kubernetes_pod_operator/index.html
        create_gke_cluster = GKEClusterCreateOperator(
            task_id="create_gke_cluster",
            body=create_gke_config(
                name=cluster_name,
                service_account=service_account,
                owner_label=owner_label,
                team_label=team_label,
                machine_type=machine_type,
                location=location,
                # DataProc clusters require VPC with auto-created subnets
                subnetwork="default" if server_id == "admin" else "gke-subnet",
                is_dev=environ.get("DEPLOY_ENVIRONMENT") == "dev",
            ),
            dag=dag,
            **shared_config,
        )

        # Running the pod without any time in-between will cause the scope-based
        # authentication in Google Cloud Platform to fail. For example:
        #
        # `ServiceException: 401 Anonymous caller does not have
        # storage.objects.get access to moz-fx-prio-dev-a-private/processed/`
        #
        # Sleeping by a small amount solves this problem. This issue was first
        # noticed intermittently on 2019-09-09.
        sleep = BashOperator(task_id="sleep", bash_command="sleep 60", dag=dag)

        run_prio = GKEPodOperator(
            task_id=f"processor_{server_id}",
            name=f"processor_{server_id}",
            cluster_name=cluster_name,
            namespace="default",
            image=image,
            arguments=arguments,
            env_vars=env_vars,
            dag=dag,
            # choose the autoscaling node-pool for any jobs
            node_selectors={"node-label": "burstable"},
            labels={"pod-label": "burstable-pod"},
            affinity={
                "podAntiAffinity": {
                    "requiredDuringSchedulingIgnoredDuringExecution": [
                        {
                            "labelSelector": {
                                "matchExpressions": [
                                    {
                                        "key": "pod-label",
                                        "operator": "In",
                                        "values": ["burstable-pod"],
                                    }
                                ]
                            },
                            "topologyKey": "kubernetes.io/hostname",
                        }
                    ]
                }
            },
            # tolerate the tainted node
            tolerations=[
                {
                    "key": "reserved-pool",
                    "operator": "Equal",
                    "value": "true",
                    "effect": "NoSchedule",
                }
            ],
            # A new VM instance may take more than 120 seconds to boot
            startup_timeout_seconds=240,
            # delete the pod after running
            is_delete_operator_pod=True,
            **shared_config,
            **kwargs,
        )

        delete_gke_cluster = GKEClusterDeleteOperator(
            task_id="delete_gke_cluster",
            name=cluster_name,
            trigger_rule="all_done",
            dag=dag,
            **shared_config,
        )

        create_gke_cluster >> sleep >> run_prio >> delete_gke_cluster
        return dag
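One server's processing step can be mounted as a SubDagOperator; the connection id, service account, command, and environment below are placeholders.

processor_a = SubDagOperator(
    task_id="processor_a",
    dag=dag,
    subdag=container_subdag(
        parent_dag_name=dag.dag_id,
        child_dag_name="processor_a",
        default_args=default_args,
        gcp_conn_id="google_cloud_prio_a",
        service_account="prio-runner-a@example-project.iam.gserviceaccount.com",
        server_id="a",
        arguments=["bin/process"],
        env_vars={"DATA_CONFIG": "/app/config/content.json"},
    ),
)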
Example #12
def export(
        leanplum_app_id,
        leanplum_client_key,
        bq_dataset_id,
        task_id,
        bq_project,
        gcs_bucket="moz-fx-data-prod-external-data",
        table_prefix=None,
        gcs_prefix=None,
        project_id=None,
        gcp_conn_id="google_cloud_derived_datasets",
        gke_location="us-central1-a",
        gke_cluster_name="bq-load-gke-1",
        gke_namespace="default",
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/leanplum-data-export:latest",
        **kwargs):
    """ Export a day of data from Leanplum for a single application,
        and make it available in BigQuery.

    See bug 1588654 for information on which buckets and datasets
    these tables should live in.

    :param str leanplum_app_id:      [Required] Leanplum application ID
    :param str leanplum_client_key:  [Required] Leanplum client key
    :param str bq_dataset_id:        [Required] BigQuery default dataset id
    :param str task_id:              [Required] The task ID for this task
    :param str bq_project:           [Required] The project to create tables in
    :param str gcs_bucket:           GCS Bucket to export data to
    :param str table_prefix:         Prefix for exported BigQuery table names
    :param str gcs_prefix:           Prefix for data exported to GCS
    :param str project_id:           Project the GKE cluster is in
    :param str gcp_conn_id:          Airflow connection id for GCP access
    :param str gke_location:         GKE cluster location
    :param str gke_cluster_name:     GKE cluster name
    :param str gke_namespace:        GKE cluster namespace
    :param str docker_image:         docker image to use
    :param Dict[str, Any] kwargs:    Additional keyword arguments for
                                     GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))

    if project_id is None:
        project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id

    args = [
        "leanplum-data-export", "export-leanplum", "--app-id", leanplum_app_id,
        "--client-key", leanplum_client_key, "--date", "{{ ds_nodash }}",
        "--bucket", gcs_bucket, "--bq-dataset", bq_dataset_id, "--project",
        bq_project
    ]

    if gcs_prefix is not None:
        args += ["--prefix", gcs_prefix]

    if table_prefix is not None:
        args += ["--table-prefix", table_prefix]

    return GKEPodOperator(task_id=task_id,
                          gcp_conn_id=gcp_conn_id,
                          project_id=project_id,
                          location=gke_location,
                          cluster_name=gke_cluster_name,
                          namespace=gke_namespace,
                          image=docker_image,
                          arguments=args,
                          **kwargs)
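A usage sketch; the Leanplum credentials would normally come from an Airflow Variable or connection rather than being hard-coded, and the dataset and prefix names are placeholders.

fenix_leanplum_export = export(
    task_id="fenix_leanplum_export",
    leanplum_app_id="{{ var.value.fenix_leanplum_app_id }}",
    leanplum_client_key="{{ var.value.fenix_leanplum_client_key }}",
    bq_dataset_id="firefox_fenix_external",
    bq_project="moz-fx-data-shared-prod",
    table_prefix="leanplum",
    dag=dag,  # forwarded to GKEPodOperator through **kwargs
)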
Example #13
def spark_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    main,
    pyfiles,
    arguments,
    bootstrap_bucket,
    dataproc_region="us-west1",
    num_preemptible_workers=10,
):
    """Run the PySpark job for unnesting and range-partitioning Prio pings from
    the ingestion service.

    :param str parent_dag_name:         Name of the parent DAG.
    :param str child_dag_name:          Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id:             Name of the connection string.
    :param str service_account:         The address of the service account.
    :param str dataproc_region:         The region of the Dataproc cluster.
    :param str main:                    URI of the PySpark main module to run.
    :param List[str] pyfiles:           URIs of additional Python files needed
                                        by the job.
    :param List[str] arguments:         Arguments passed to the PySpark job.
    :param str bootstrap_bucket:        GCS prefix containing the cluster
                                        bootstrap (init action) scripts.
    :param int num_preemptible_workers: The number of preemptible workers.
    :return: DAG
    """

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    shared_config = {
        "cluster_name": "prio-staging-{{ds_nodash}}",
        "gcp_conn_id": gcp_conn_id,
        "project_id": connection.project_id,
        # From an error when not specifying the region:
        # - Dataproc images 2.0 and higher do not support the to-be
        #   deprecated global region. Please use any non-global Dataproc
        #   region instead
        #  - Must specify a zone in GCE configuration when using
        #    'regions/global'. To use auto zone placement, specify
        #    regions/<non-global-region> in request path, e.g.
        #    regions/us-central1
        "region": dataproc_region,
    }

    with DAG(f"{parent_dag_name}.{child_dag_name}",
             default_args=default_args) as dag:
        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            image_version="preview-ubuntu18",
            service_account=service_account,
            master_machine_type="n1-standard-4",
            worker_machine_type="n1-standard-4",
            num_workers=2,
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                f"{bootstrap_bucket}/install-python-requirements.sh"
            ],
            idle_delete_ttl=600,
            dag=dag,
            **shared_config,
        )

        run_dataproc_spark = DataProcPySparkOperator(
            task_id="run_dataproc_spark",
            main=main,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
            ],
            pyfiles=pyfiles,
            arguments=arguments,
            dag=dag,
            **shared_config,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            trigger_rule="all_done",
            dag=dag,
            **shared_config,
        )
        create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster
        return dag
Example #14
AUTOML_DATASET = models.Variable.get('automl_dataset')
AUTOML_MODEL = models.Variable.get('automl_model')
AUTOML_TRAINING_BUDGET = int(models.Variable.get('automl_training_budget'))

#[START dag_build_train_deploy]
default_dag_args = {
    'start_date': datetime.datetime(2050, 1, 1),
    'schedule_interval': None,
    'provide_context': True
}

dag = models.DAG('build_train_deploy', default_args=default_dag_args)
#[END dag_build_train_deploy]

# instantiate Google Cloud base hook to get credentials and create automl clients
gcp_hook = GoogleCloudBaseHook(conn_id='google_cloud_default')
automl_client = AutoMlClient(credentials=gcp_hook._get_credentials())

# Loads the database dump from Cloud Storage to BigQuery
t1 = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id="db_dump_to_bigquery",
    bucket=COMPOSER_BUCKET_NAME,
    source_objects=[DB_DUMP_FILENAME],
    schema_object="schema_source.json",
    source_format="CSV",
    skip_leading_rows=1,
    destination_project_dataset_table="{}.{}.{}".format(
        PROJECT, DATASET, 'data_source'),
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    dag=dag)
Example #15
def bigquery_etl_query(destination_table,
                       dataset_id,
                       parameters=(),
                       arguments=(),
                       project_id=None,
                       sql_file_path=None,
                       gcp_conn_id="google_cloud_derived_datasets",
                       gke_location="us-central1-a",
                       gke_cluster_name="bq-load-gke-1",
                       gke_namespace="default",
                       docker_image="mozilla/bigquery-etl:latest",
                       date_partition_parameter="submission_date",
                       multipart=False,
                       allow_field_addition_on_date=None,
                       **kwargs):
    """ Generate.

    :param str destination_table:                  [Required] BigQuery destination table
    :param str dataset_id:                         [Required] BigQuery default dataset id
    :param Tuple[str] parameters:                  Parameters passed to bq query
    :param Tuple[str] arguments:                   Additional bq query arguments
    :param Optional[str] project_id:               BigQuery default project id
    :param Optional[str] sql_file_path:            Optional override for path to the
                                                   SQL query file to run
    :param str gcp_conn_id:                        Airflow connection id for GCP access
    :param str gke_location:                       GKE cluster location
    :param str gke_cluster_name:                   GKE cluster name
    :param str gke_namespace:                      GKE cluster namespace
    :param str docker_image:                       docker image to use
    :param Optional[str] date_partition_parameter: Parameter for indicating destination
                                                   partition to generate, if None
                                                   destination should be whole table
                                                   rather than partition
    :param Dict[str, Any] kwargs:                  Additional keyword arguments for
                                                   GKEPodOperator
    :param Optional[str] allow_field_addition_on_date: Optional {{ds}} value that
                                                   should be run with ALLOW_FIELD_ADDITION

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    if not project_id:
        project_id = "moz-fx-data-shared-prod"
    sql_file_path = sql_file_path or "sql/{}/{}/{}/query.sql".format(
        project_id, dataset_id, destination_table)
    if destination_table is not None and date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}", )
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/run_multipart_query" if multipart else "query"] +
        (["--destination_table=" +
          destination_table] if destination_table else []) +
        ["--dataset_id=" + dataset_id] +
        (["--project_id=" + project_id] if project_id else []) +
        ["--parameter=" + parameter for parameter in parameters] + (
            # Date comparisons for field additions need to happen within the parameter.
            # Template substitution occurs only within the operator with `arguments` being
            # one of the options of GKEPodOperator that allows templated arguments.
            # See also: https://github.com/mozilla/telemetry-airflow/pull/1174#discussion_r517505678
            [
                "--schema_update_option=" +
                "{{ 'ALLOW_FIELD_ADDITION' if ds == %r else '' }}" %
                allow_field_addition_on_date
            ] if allow_field_addition_on_date else []) + list(arguments) +
        [sql_file_path],
        **kwargs)
        dataset_id="telemetry_derived",
        xcom_task_id=experiment_search_query_task_id,
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"])

    (copy_deduplicate_main_ping >>
     experiment_search_aggregates >>
     experiment_search_aggregates_live_generate_view >>
     experiment_search_aggregates_live_deploy_view)

    # Daily and last seen views on top of every Glean application.

    gcp_conn_id = "google_cloud_derived_datasets"
    baseline_etl_kwargs = dict(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location="us-central1-a",
        cluster_name="bq-load-gke-1",
        namespace="default",
        image="mozilla/bigquery-etl:latest",
    )
    baseline_args = [
        "--project-id=moz-fx-data-shared-prod",
        "--date={{ ds }}",
        "--only=*_stable.baseline_v1"
    ]
    baseline_clients_daily = GKEPodOperator(
        task_id='baseline_clients_daily',
        name='baseline-clients-daily',
        arguments=["script/run_glean_baseline_clients_daily"] + baseline_args,
        **baseline_etl_kwargs
Example #17
def container_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    server_id,
    env_vars={},
    arguments=[],
    machine_type="n1-standard-1",
    image="mozilla/prio-processor:latest",
    location="us-west1-b",
    owner_label="amiyaguchi",
    team_label="dataeng",
):
    """Run a command on an ephemeral container running the
    `mozilla/prio-processor:latest` image.

    :param str parent_dag_name:         Name of the parent DAG.
    :param str child_dag_name:          Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id:             Name of the connection string.
    :param str service_account:         The address of the service account.
    :param str server_id:               The identifier for the Prio processor
    :param Dict[str, str] env_vars:     Environment variables for configuring
                                        the processor.
    :param List[str] arguments:         The command to run after loading the
                                        image.
    :param str machine_type:            The machine type for running the image.
    :param str image:                   Dockerhub image
    :param str location:                The region of the GKE cluster.
    :param str owner_label:             Label for associating the owner
    :param str team_label:              Label for associating the team
    :return: DAG
    """
    assert server_id in ["a", "b", "admin"]

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    cluster_name = "gke-prio-{}".format(server_id)

    shared_config = {
        "project_id": connection.project_id,
        "gcp_conn_id": gcp_conn_id,
        "location": location,
    }

    with DAG("{}.{}".format(parent_dag_name, child_dag_name),
             default_args=default_args) as dag:
        create_gke_cluster = GKEClusterCreateOperator(
            task_id="create_gke_cluster",
            body=create_gke_config(
                name=cluster_name,
                service_account=service_account,
                owner_label=owner_label,
                team_label=team_label,
                machine_type=machine_type,
                # DataProc clusters require VPC with auto-created subnets
                subnetwork="default" if server_id == "admin" else "gke-subnet",
                is_dev=environ.get("DEPLOY_ENVIRONMENT") == "dev",
            ),
            dag=dag,
            **shared_config)

        # Running the pod without any time in-between will cause the scope-based
        # authentication in Google Cloud Platform to fail. For example:
        #
        # `ServiceException: 401 Anonymous caller does not have
        # storage.objects.get access to moz-fx-prio-dev-a-private/processed/`
        #
        # Sleeping by a small amount solves this problem. This issue was first
        # noticed intermittently on 2019-09-09.
        sleep = BashOperator(task_id="sleep", bash_command="sleep 60", dag=dag)

        run_prio = GKEPodOperator(task_id="processor_{}".format(server_id),
                                  name="run-prio-project-{}".format(server_id),
                                  cluster_name=cluster_name,
                                  namespace="default",
                                  image=image,
                                  arguments=arguments,
                                  env_vars=env_vars,
                                  dag=dag,
                                  **shared_config)

        delete_gke_cluster = GKEClusterDeleteOperator(
            task_id="delete_gke_cluster",
            name=cluster_name,
            trigger_rule="all_done",
            dag=dag,
            **shared_config)

        create_gke_cluster >> sleep >> run_prio >> delete_gke_cluster
        return dag
Example #18
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20191017', # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):

    """ Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job to transfer the AWS S3 parquet data into a GCS Bucket.
    Once that is completed we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name:            parent dag name
    :param dict default_args:              dag configuration
    :param str dataset_s3_bucket:          source S3 Bucket
    :param str dataset_gcs_bucket:         destination GCS Bucket
    :param str aws_conn_id:                airflow connection id for S3 access
    :param str gcp_conn_id:                airflow connection id for GCP access
    :param str dataset:                    dataset name
    :param str dataset_version:            dataset version
    :param str date_submission_col:        dataset date submission column
    :param str ds_type:                    dataset format (ds or ds_nodash)
    :param str gke_location:               GKE cluster zone
    :param str gke_namespace:              GKE cluster namespace
    :param str docker_image:               docker image to use for GKE pod operations # noqa
    :param str bigquery_dataset:           bigquery load destination dataset
    :param str p2b_concurrency:            number of processes for parquet2bigquery load
    :param str p2b_table_alias:            override p2b table name with alias
    :param bool p2b_resume:                allow resume support. defaults to False
    :param bool reprocess:                 enable dataset reprocessing defaults to False
    :param str objects_prefix:             custom objects_prefix to override defaults
    :param str spark_gs_dataset_location:  custom spark dataset load location to override defaults
    :param List[str] cluster_by:           top level fields to cluster by when creating destination table
    :param List[str] drop:                 top level fields to exclude from destination table
    :param Dict[str, str] rename:          top level fields to rename in destination table
    :param List[str] replace:              top level field replacement expressions

    :return airflow.models.DAG
    """

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)
    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes': _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
        ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]

    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

    bq_table_name = p2b_table_alias or normalize_table_id('_'.join([dataset,
                                                                   dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if dataset_s3_bucket is not None:
            s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
                task_id='s3_to_gcs',
                s3_bucket=dataset_s3_bucket,
                gcs_bucket=gcs_buckets['transfer'],
                description=_objects_prefix,
                aws_conn_id=aws_conn_id,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                object_conditions=gcstj_object_conditions,
                transfer_options=gcstj_transfer_options,
            )
        else:
            s3_to_gcs = DummyOperator(task_id='no_s3_to_gcs')

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bigquery_dataset, bq_table_name), # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
            )

        s3_to_gcs >> reprocess_parquet_task >> remove_bq_table >> bulk_load

        return dag
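
# A minimal usage sketch (assumption): this factory appears to be `load_to_bigquery`
# (see the reprocess_parquet docstring below); a parent DAG would wrap the returned
# sub-DAG in a SubDagOperator. The parent DAG object, task id and argument values
# here are hypothetical placeholders, and the remaining parameters are assumed to
# have sensible defaults.
load_task = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=main_dag.dag_id,
        dag_name='main_summary_bigquery_load',
        default_args=default_args,
        dataset_s3_bucket='example-s3-parquet-bucket',
        aws_conn_id='aws_example_conn',
        dataset='main_summary',
        dataset_version='v4',
        bigquery_dataset='telemetry',
    ),
    task_id='main_summary_bigquery_load',
    dag=main_dag,
)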
Example #19
0
def export_to_amplitude(
        parent_dag_name,
        dag_name,
        default_args,
        project,
        dataset,
        table_or_view,
        s3_prefix,
        gcs_bucket='moz-fx-data-derived-datasets-amplitude-export',
        gcp_conn_id='google_cloud_derived_datasets',
        amplitude_s3_conn='amplitude_s3_conn',
        amplitude_s3_bucket='com-amplitude-vacuum-mozilla-vacuum-wup'):
    """Export a BigQuery table or view to Amplitude.

    This uses the BigQueryToCloudStorageOperator to export the
    partition to GCS, then pushes that data to S3. It operates
    on a temporary table that is dropped after the job finishes.

    :param str parent_dag_name: Parent dag name
    :param str dag_name: This dag's name (appended to parent_dag_name)
    :param dict default_args: DAG configuration
    :param str project: BigQuery project containing the table to be exported
    :param str dataset: BigQuery dataset
    :param str table_or_view: Table or view name
    :param str gcs_bucket: The bucket the data will be exported to
    :param str gcp_conn_id: GCP connection ID
    :param str amplitude_s3_conn: S3 connection ID
    :param str amplitude_s3_bucket: The bucket to export data to
    :param str s3_prefix: The prefix for the s3 objects

    :return: airflow.models.DAG
    """

    environment = environ['DEPLOY_ENVIRONMENT']
    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        # For now, we assume the view is already updated
        # See https://github.com/mozilla/bigquery-etl/issues/218

        exec_date = '{{ ds }}'

        # Check that we have data for this date
        check_sql = (
            'SELECT COUNT(*) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)
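        # For example (illustrative names), assuming `sql` is a templated field of
        # the sensor, the rendered query would look like:
        #   SELECT COUNT(*) FROM `moz-fx-example-project.telemetry.events_amplitude`
        #   WHERE DATE(submission_timestamp) = "2020-04-01"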

        wait_for_data = BigQuerySQLSensorOperator(
                task_id='wait_for_data',
                sql=check_sql,
                bigquery_conn_id=gcp_conn_id,
                use_legacy_sql=False
        )

        # Create the table with yesterday's data
        project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id
        temp_table_name = table_or_view + '_{{ ds_nodash }}'
        fully_qualified_table_name = '{}.{}.{}'.format(project_id, dataset, temp_table_name)

        sql = (
            'SELECT * EXCEPT (submission_timestamp) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)

        create_table = BigQueryOperator(
            task_id='create_temporary_table',
            sql=sql,
            destination_dataset_table=fully_qualified_table_name,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        directory = '/'.join((environment, s3_prefix, '{{ ds_nodash }}'))
        extension = '.tsv.gz'

        # Export from bq to gcs
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28 # noqa: E501
        gcs_uri = 'gs://{}/{}/*{}'.format(gcs_bucket, directory, extension)
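        # e.g. (illustrative, using the default bucket):
        #   gs://moz-fx-data-derived-datasets-amplitude-export/prod/<s3_prefix>/20200401/*.tsv.gz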
        table_extract = BigQueryToCloudStorageOperator(
            task_id='bq_to_gcs',
            source_project_dataset_table=fully_qualified_table_name,
            destination_cloud_storage_uris=[gcs_uri],
            bigquery_conn_id=gcp_conn_id,
            compression='GZIP',
            export_format='CSV',
            field_delimiter='\t',
            print_header=True
        )

        # Push the data to S3
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/gcs_to_s3.py#L29 # noqa: E501
        s3_push = GoogleCloudStorageToS3Operator(
            task_id='gcs_to_s3',
            bucket=gcs_bucket,
            prefix=directory,
            delimiter=extension,
            google_cloud_storage_conn_id=gcp_conn_id,
            dest_aws_conn_id=amplitude_s3_conn,
            dest_s3_key='s3://{}/'.format(amplitude_s3_bucket),
            replace=True
        )

        # Drop the temporary table
        table_drop = BigQueryOperator(
            task_id='drop_temp_table',
            sql='DROP TABLE `{}`'.format(fully_qualified_table_name),
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        # Delete the GCS data
        data_delete = GoogleCloudStorageDeleteOperator(
            task_id='delete_gcs_data',
            bucket_name=gcs_bucket,
            prefix=directory,
            gcp_conn_id=gcp_conn_id
        )

        wait_for_data >> create_table >> table_extract >> s3_push
        s3_push >> table_drop
        s3_push >> data_delete

        return dag
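
# A minimal usage sketch (assumption): a parent DAG would wrap the returned sub-DAG
# in a SubDagOperator. The parent DAG object and the project/dataset/table/prefix
# values below are hypothetical placeholders.
from airflow.operators.subdag_operator import SubDagOperator

amplitude_export = SubDagOperator(
    subdag=export_to_amplitude(
        parent_dag_name=main_dag.dag_id,
        dag_name='example_amplitude_export',
        default_args=default_args,
        project='moz-fx-example-project',
        dataset='telemetry',
        table_or_view='example_events_amplitude_v1',
        s3_prefix='example',
    ),
    task_id='example_amplitude_export',
    dag=main_dag,
)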
Example #20
0
def reprocess_parquet(parent_dag_name,
                      default_args,
                      reprocess,
                      gcp_conn_id,
                      gcs_buckets,
                      objects_prefix,
                      date_submission_col,
                      dataset,
                      dataset_version,
                      gs_dataset_location=None,
                      dataproc_zone='us-central1-a',
                      dag_name='reprocess_parquet',
                      num_preemptible_workers=10):
    """ Reprocess Parquet datasets to conform with the BigQuery Parquet loader.

    This function should be invoked as part of `load_to_bigquery`.

    https://github.com/mozilla-services/spark-parquet-to-bigquery/blob/master/src/main/scala/com/mozilla/dataops/spark/TransformParquet.scala # noqa

    :param str parent_dag_name:            parent dag name
    :param dict default_args:              dag configuration
    :param str gcp_conn_id:                airflow connection id for GCP access
    :param dict gcs_buckets:               source and dest GCS buckets for reprocess
    :param str dataset:                    dataset name
    :param str dataset_version:            dataset version
    :param str objects_prefix:             objects location
    :param str date_submission_col:        dataset date submission column
    :param str dataproc_zone:              GCP zone to launch dataproc clusters
    :param str dag_name:                   name of dag
    :param int num_preemptible_workers:    number of dataproc cluster workers to provision
    :param bool reprocess:                 enable dataset reprocessing; defaults to False
    :param str gs_dataset_location:        override source location, defaults to None

    :return: airflow.models.DAG
    """

    JAR = [
        'gs://moz-fx-data-derived-datasets-parquet-tmp/jars/spark-parquet-to-bigquery-assembly-1.0.jar' # noqa
    ]

    if gs_dataset_location:
        _gs_dataset_location = gs_dataset_location
    else:
        _gs_dataset_location = 'gs://{}/{}'.format(gcs_buckets['transfer'],
                                                   objects_prefix)

    cluster_name = '{}-{}'.format(dataset.replace('_', '-'),
                                  dataset_version) + '-{{ ds_nodash }}'
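    # e.g. (illustrative) 'main-summary-v4-20200401' once Airflow renders '{{ ds_nodash }}'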

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    spark_args = [
        '--files', _gs_dataset_location,
        '--submission-date-col', date_submission_col,
        '--gcp-project-id', connection.project_id,
        '--gcs-bucket', 'gs://{}'.format(gcs_buckets['load']),
    ]

    _dag_name = '%s.%s' % (parent_dag_name, dag_name)

    with models.DAG(
            _dag_name,
            default_args=default_args) as dag:

        if reprocess:
            create_dataproc_cluster = DataprocClusterCreateOperator(
                task_id='create_dataproc_cluster',
                cluster_name=cluster_name,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                num_workers=2,
                image_version='1.3',
                storage_bucket=gcs_buckets['transfer'],
                zone=dataproc_zone,
                master_machine_type='n1-standard-8',
                worker_machine_type='n1-standard-8',
                num_preemptible_workers=num_preemptible_workers,
                metadata={
                    'gcs-connector-version': '1.9.6',
                    'bigquery-connector-version': '0.13.6'
                    })

            run_dataproc_spark = DataProcSparkOperator(
                task_id='run_dataproc_spark',
                cluster_name=cluster_name,
                dataproc_spark_jars=JAR,
                main_class='com.mozilla.dataops.spark.TransformParquet',
                arguments=spark_args,
                gcp_conn_id=gcp_conn_id)

            delete_dataproc_cluster = DataprocClusterDeleteOperator(
                task_id='delete_dataproc_cluster',
                cluster_name=cluster_name,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

            create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster # noqa

        else:
            DummyOperator(task_id='no_reprocess')

        return dag
Example #21
0
    # This adds the error log URL at the end of the message
    slack_msg = task_msg + """ (<{log_url}|log>)""".format(
        log_url=context.get('task_instance').log_url, )
    failed_alert = SlackWebhookOperator(
        task_id='slack_test',
        http_conn_id='slack',
        webhook_token=slack_webhook_token,
        message=slack_msg,
        username='******',
    )
    return failed_alert.execute(context=context)


# To get credentials to access Google Sheets
wys_api_hook = GoogleCloudBaseHook('vz_api_google')
cred = wys_api_hook._get_credentials()
service = build('sheets', 'v4', credentials=cred, cache_discovery=False)
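# The Sheets service can then be used to read ranges, e.g. (illustrative):
#   service.spreadsheets().values().get(spreadsheetId=..., range=...).execute()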

# To connect to the pgadmin bot
wys_postgres = PostgresHook("wys_bot")
connection = BaseHook.get_connection('wys_api_key')
api_key = connection.password

default_args = {
    'owner': 'rdumas',
    'depends_on_past': False,
    'start_date': datetime(2020, 4, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_success': False,
Example #22
0
from airflow.hooks.base_hook import BaseHook
from airflow.contrib.operators import mlengine_operator
from airflow.contrib.operators import mlengine_operator_utils
from airflow.contrib.operators import dataflow_operator
from airflow.contrib.operators import gcs_to_bq
# TODO: add when Composer is on v2.0 and more hooks are available
# from airflow.contrib.operators import gcs_list_operator
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.utils import trigger_rule

from google.cloud.automl_v1beta1 import AutoMlClient, PredictionServiceClient
from clv_automl import clv_automl

# Instantiate the Google Cloud base hook to get credentials and create AutoML clients
gcp_credentials = GoogleCloudBaseHook(
    conn_id='google_cloud_default')._get_credentials()
automl_client = AutoMlClient(credentials=gcp_credentials)
automl_predict_client = PredictionServiceClient(credentials=gcp_credentials)


def _get_project_id():
    """Get project ID from default GCP connection."""

    extras = BaseHook.get_connection('google_cloud_default').extra_dejson
    key = 'extra__google_cloud_platform__project'
    if key in extras:
        project_id = extras[key]
    else:
        raise ValueError('Must configure project_id in google_cloud_default '
                         'connection from Airflow Console')
    return project_id
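

# A minimal usage sketch (assumption): resolve the project id once at DAG parse time
# so the AutoML clients above can build resource paths; the location and model id
# below are hypothetical placeholders.
PROJECT_ID = _get_project_id()
model_path = automl_client.model_path(PROJECT_ID, 'us-central1', 'TBL0000000000000000')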