Example #1
    def test_hook_correct_region():
        with patch(HOOK) as mock_hook:
            dataproc_task = DataProcSparkOperator(task_id=TASK_ID,
                                                  region=GCP_REGION)

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(
                mock.ANY, mock.ANY, GCP_REGION, mock.ANY)
Example #2
    def test_hook_correct_region(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
            dataproc_task = DataProcSparkOperator(
                task_id=TASK_ID,
                region=REGION
            )

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(mock.ANY, mock.ANY, REGION)
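The test snippets above reference names defined elsewhere in their test modules (patch, mock, HOOK, TASK_ID, REGION/GCP_REGION). A minimal sketch of that surrounding setup, with illustrative values that are not taken from the original sources, might look like:

    # Illustrative module-level setup assumed by the tests above; values are made up.
    from unittest import mock
    from unittest.mock import patch

    from airflow.contrib.operators.dataproc_operator import DataProcSparkOperator

    HOOK = 'airflow.contrib.operators.dataproc_operator.DataProcHook'  # dotted path to patch
    TASK_ID = 'test_task'
    REGION = GCP_REGION = 'us-central1'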
Example #3
    def test_hook_correct_region(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook'
                   ) as mock_hook:
            dataproc_task = DataProcSparkOperator(task_id=TASK_ID,
                                                  region=REGION)

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(
                mock.ANY, mock.ANY, REGION)
Example #4
    def test_hook_correct_region():
        with patch(HOOK) as mock_hook:
            dataproc_task = DataProcSparkOperator(
                task_id=TASK_ID,
                region=GCP_REGION
            )

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(mock.ANY, mock.ANY,
                                                                  GCP_REGION, mock.ANY)
Example #5
    def test_dataproc_job_id_is_set():
        with patch(HOOK) as mock_hook:
            dataproc_task = DataProcSparkOperator(
                task_id=TASK_ID
            )

            _assert_dataproc_job_id(mock_hook, dataproc_task)
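The helper _assert_dataproc_job_id is not shown in the source. A hypothetical version, consistent with the dataproc_job_id attribute the test name implies the operator sets, could be:

    # Hypothetical helper, not from the original source: execute the task under the
    # patched hook and check that a Dataproc job id was recorded on the operator.
    def _assert_dataproc_job_id(mock_hook, dataproc_task):
        dataproc_task.execute(None)
        assert dataproc_task.dataproc_job_id is not None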
Example #6
def getSparkJobHandleWithSparkProperties(taskID, mainClass):
    return DataProcSparkOperator(task_id=taskID,
                                 region=REGION,
                                 main_class=mainClass,
                                 cluster_name=CLUSTER_NAME,
                                 dataproc_spark_jars=[MAIN_JAR, BIG_QUERY_JAR],
                                 dataproc_spark_properties=SPARK_PROPERTIES)
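A possible way to use this factory inside a DAG; the task id and main class below are illustrative, and REGION, CLUSTER_NAME, MAIN_JAR, BIG_QUERY_JAR and SPARK_PROPERTIES are assumed to be defined elsewhere in the module:

    # Illustrative usage of the factory above (names are made up).
    loan_analysis = getSparkJobHandleWithSparkProperties('loan_analysis',
                                                         'com.example.spark.LoanAnalysis')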
Example #7
    def test_correct_job_definition(self, mock_hook, mock_uuid):
        # Expected job
        job_definition = deepcopy(DATAPROC_JOB_TO_SUBMIT)
        job_definition['job']['sparkJob'] = {'mainClass': 'main_class'}
        job_definition['job']['reference']['projectId'] = None
        job_definition['job']['reference']['jobId'] = DATAPROC_JOB_ID + "_test"

        # Prepare job using operator
        task = DataProcSparkOperator(task_id=TASK_ID,
                                     region=GCP_REGION,
                                     cluster_name=CLUSTER_NAME,
                                     job_name=DATAPROC_JOB_ID,
                                     labels=LABELS,
                                     main_class="main_class")

        task.execute(context=None)
        self.assertDictEqual(job_definition, task.job_template.job)
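The DATAPROC_JOB_TO_SUBMIT fixture is defined elsewhere in that test module. A plausible shape, limited to the keys the test mutates and assuming a GCP_PROJECT_ID constant, is sketched below; the concrete fixture is an assumption, not the original.

    # Hypothetical fixture shape; field names follow the Dataproc jobs API and the
    # keys touched by the test, but the values are illustrative.
    DATAPROC_JOB_TO_SUBMIT = {
        'job': {
            'reference': {'projectId': GCP_PROJECT_ID, 'jobId': DATAPROC_JOB_ID},
            'placement': {'clusterName': CLUSTER_NAME},
            'labels': LABELS,
        }
    }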
Example #8
 create_dataproc_cluster = DataprocClusterCreateOperator(
   task_id = 'create_dataproc_cluster',
   project_id = project_id,
   region = 'us-west1',
   master_machine_type = 'n1-standard-2',
   worker_machine_type = 'n1-standard-2',
   num_workers = 2,
   cluster_name = '{{ ti.xcom_pull(key = "cluster_name", task_ids = "push_cluster_name") }}'  # get the cluster name from XCom in the template
 )
 run_collection_analysis_job = DataProcSparkOperator(
   task_id = 'start_collection_analysis_spark_job',
   main_class = 'com.makoto.spark.LoanAnalyze',
   dataproc_spark_jars = "gs://creditclub/CreditClub-assembly-0.1.jar",
   arguments = [     
     "input_load_stats_csv_path",
     "input_rejection_stats_csv_path",
     "output_path"    
   ],
   job_name = 'creditanalysis',
   region = 'us-west1',
   cluster_name = '{{ ti.xcom_pull(key = "cluster_name", task_ids = "push_cluster_name") }}'
 )
 delete_dataproc_cluster = DataprocClusterDeleteOperator(
   task_id = 'delete_dataproc_cluster',
   project_id = project_id,
   cluster_name = '{{ ti.xcom_pull(key = "cluster_name", task_ids = "push_cluster_name") }}',
   region = 'us-west1',
   trigger_rule = trigger_rule.TriggerRule.ALL_DONE
 )
 # dependency
 push_cluster_name_op >> create_dataproc_cluster >> run_collection_analysis_job >> delete_dataproc_cluster
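Every task here pulls cluster_name from XCom, but the upstream push_cluster_name_op is not shown. A minimal sketch of such a task, assuming a PythonOperator and Airflow 1.10-era APIs (the naming scheme is illustrative):

    # Sketch of the upstream task the xcom_pull templates assume; not from the
    # original source, and the cluster naming scheme is made up.
    from airflow.operators.python_operator import PythonOperator

    def _push_cluster_name(**context):
        # Push a per-run cluster name so downstream tasks can template it via XCom.
        cluster_name = 'analysis-cluster-{}'.format(context['ds_nodash'])
        context['ti'].xcom_push(key='cluster_name', value=cluster_name)

    push_cluster_name_op = PythonOperator(
        task_id='push_cluster_name',
        python_callable=_push_cluster_name,
        provide_context=True)  # required on Airflow 1.10.x to receive **context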
Example #9
        region='us-central1',
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
        num_workers=2)

    # The task of running the Spark job.
    dataproc_spark_process = DataProcSparkOperator(
        task_id='dataproc-test',
        dataproc_spark_jars=[
            'gs://lendingclub12/LendingClub-assembly-0.1.jar'
        ],
        main_class='p2p_data_analysis.spark.LoanDataAnalyzer',
        job_name='loan',
        region='us-central1',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
        arguments=[
            "gs://lendingclub12/LoanStats_2019Q1.csv",
            "gs://lendingclub12/RejectStats_2019Q1.csv",
            "gs://lendingclub12/output"
        ])

    # The task of deleting the cluster.
    dataproc_destroy_cluster = DataprocClusterDeleteOperator(
        task_id='dataproc-destroy-cluster',
        project_id='silicon-parity-282607',
        region='us-central1',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
Example #10
        project_id='makoto0908spark',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}'
        + '4',
        region='us-west1',
        execution_timeout=timedelta(minutes=30),
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    args = ["--process.date", "{{ (execution_date).strftime('%Y-%m-%d') }}"]

    unique_user = DataProcSparkOperator(
        task_id='unique_user',
        dataproc_spark_jars=['gs://path/jar/CohortAnalysis.jar'],
        main_class='com.makoto.spark.process.UserGenerateProcess',
        region='us-west1',
        job_name=dag_name + 'unique_user',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}'
        + '1',
        execution_timeout=timedelta(minutes=180),
        arguments=args)

    args = ["--process.date", "{{ (execution_date).strftime('%Y-%m-%d') }}"]

    bike_share_aggregator = DataProcSparkOperator(
        task_id='bike_share_aggregator',
        dataproc_spark_jars=['gs://path/jar/CohortAnalysis.jar'],
        main_class='com.makoto.spark.process.BikeTripProcess',
        region='us-west1',
        job_name=dag_name + 'bike_share_aggregator',
        cluster_name=
Example #11
def moz_dataproc_jar_runner(parent_dag_name=None,
                            dag_name='run_script_on_dataproc',
                            default_args=None,
                            cluster_name=None,
                            num_workers=2,
                            image_version='1.4',
                            zone='us-west1-b',
                            idle_delete_ttl='14400',
                            auto_delete_ttl='28800',
                            master_machine_type='n1-standard-8',
                            worker_machine_type='n1-standard-4',
                            num_preemptible_workers=0,
                            service_account=None,
                            init_actions_uris=None,
                            optional_components=['ANACONDA'],
                            install_component_gateway=True,
                            jar_urls=None,
                            main_class=None,
                            jar_args=None,
                            job_name=None,
                            aws_conn_id=None,
                            gcp_conn_id='google_cloud_airflow_dataproc'):
    """
    This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.
    Then we call DataProcSparkOperator to execute the jar defined by the arguments
    jar_urls and main_class. Once that succeeds, we tear down the cluster.

    **Example**: ::

        # Unsalted cluster name so subsequent runs fail if the cluster name exists
        cluster_name = 'test-dataproc-cluster-hwoo'

        # Defined in Airflow's UI -> Admin -> Connections
        gcp_conn_id = 'google_cloud_airflow_dataproc'

        run_dataproc_jar = SubDagOperator(
            task_id='run_dataproc_jar',
            dag=dag,
            subdag = moz_dataproc_jar_runner(
                parent_dag_name=dag.dag_id,
                dag_name='run_dataproc_jar',
                job_name='Run_some_spark_jar_on_dataproc',
                default_args=default_args,
                cluster_name=cluster_name,
                jar_urls=['gs://some_bucket/some_jar.jar'],
                main_class='com.mozilla.path.to.ClassName',
                jar_args=["-d", "{{ ds_nodash }}"],
                gcp_conn_id=gcp_conn_id)
        )

    Airflow related args:
    ---
    See moz_dataproc_pyspark_runner

    Dataproc Cluster related args:
    ---
    See moz_dataproc_pyspark_runner

    Jar runner related args:
    ---
    :param list jar_urls:               URIs of jars provisioned in Cloud Storage (for example,
                                        UDFs and libraries); good candidates for default arguments.
    :param str main_class:              Name of the job class entrypoint to execute.
    :param list jar_args:               Arguments for the job.

    """

    if cluster_name is None or jar_urls is None or main_class is None:
        raise AirflowException(
            'Please specify cluster_name, jar_urls, and/or main_class.')

    dataproc_helper = DataProcHelper(
        cluster_name=cluster_name,
        num_workers=num_workers,
        image_version=image_version,
        zone=zone,
        idle_delete_ttl=idle_delete_ttl,
        auto_delete_ttl=auto_delete_ttl,
        master_machine_type=master_machine_type,
        worker_machine_type=worker_machine_type,
        num_preemptible_workers=num_preemptible_workers,
        service_account=service_account,
        init_actions_uris=init_actions_uris,
        optional_components=optional_components,
        install_component_gateway=install_component_gateway,
        aws_conn_id=aws_conn_id,
        gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        create_dataproc_cluster = dataproc_helper.create_cluster()

        # Note - When we upgrade to a later version of Airflow that pulls in latest
        # DataProcSparkOperator code, use the argument main_jar=jar_url instead, and
        # remove arguments main_class and dataproc_spark_jars.
        run_jar_on_dataproc = DataProcSparkOperator(
            cluster_name=cluster_name,
            task_id='run_jar_on_dataproc',
            job_name=job_name,
            dataproc_spark_jars=jar_urls,
            main_class=main_class,
            arguments=jar_args,
            gcp_conn_id=gcp_conn_id)

        delete_dataproc_cluster = dataproc_helper.delete_cluster()

        create_dataproc_cluster >> run_jar_on_dataproc >> delete_dataproc_cluster
        return dag
Example #12
        project_id='sinuous-set-242504',
        region='us-west1',
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
        num_workers=2)

    dataproc_spark_process = DataProcSparkOperator(
        task_id='dataproc-test',
        dataproc_spark_jars=[
            'gs://jiuzhangsuanfa/SparkProject-assembly-0.1.jar'
        ],
        main_class='com.jiuzhang.spark.LoanAnalyze',
        job_name='loan',
        region='us-west1',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
        arguments=[
            "gs://jiuzhangsuanfa/LendingClub/LoanStats_2019Q1.csv",
            "gs://jiuzhangsuanfa/LendingClub/RejectStats_2019Q1.csv",
            "gs://jiuzhangsuanfa/output"
        ])

    dataproc_destroy_cluster = DataprocClusterDeleteOperator(
        task_id='dataproc-destroy-cluster',
        project_id='sinuous-set-242504',
        region='us-west1',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)
Example #13
def getSparkJobHandle(taskID, mainClass):
    return DataProcSparkOperator(task_id=taskID,
                                 region=REGION,
                                 main_class=mainClass,
                                 cluster_name=CLUSTER_NAME,
                                 dataproc_spark_jars=[MAIN_JAR, BIG_QUERY_JAR])
Example #14
      task_id='dataproc_destroy_cluster_4',
      project_id='sinuous-set-242504',
      cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}'
      + '4',
      region='us-west1',
      execution_timeout=timedelta(minutes=30),
      trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

  args = ["--process.date", "{{ (execution_date).strftime('%Y-%m-%d') }}"]

  unique_user = DataProcSparkOperator(
      task_id='unique_user',
      dataproc_spark_jars=[
          'gs://jiuzhangsuanfa/jar/CohortProject-assembly-0.1.jar'
      ],
      main_class='com.cohort.process.UserProcess',
      region='us-west1',
      job_name=dag_name + 'unique_user',
      cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}'
      + '1',
      execution_timeout=timedelta(minutes=180),
      arguments=args)

  args = ["--process.date", "{{ (execution_date).strftime('%Y-%m-%d') }}"]

  bike_share_aggregator = DataProcSparkOperator(
      task_id='bike_share_aggregator',
      dataproc_spark_jars=[
          'gs://jiuzhangsuanfa/jar/CohortProject-assembly-0.1.jar'
      ],
      main_class='com.cohort.process.BikeShareProcess',
      region='us-west1',
Example #15
        query="define sin HiveUDF('sin');",
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    spark_sql_task = DataProcSparkSqlOperator(
        task_id="spark_sql_task",
        query="SHOW DATABASES;",
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    spark_task = DataProcSparkOperator(
        task_id="spark_task",
        main_class="org.apache.spark.examples.SparkPi",
        dataproc_jars="file:///usr/lib/spark/examples/jars/spark-examples.jar",
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    pyspark_task = DataProcPySparkOperator(
        task_id="pyspark_task",
        main=PYSPARK_URI,
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    hive_task = DataProcHiveOperator(
        task_id="hive_task",
        query="SHOW DATABASES;",
        region=REGION,
        })

    # dpso_catalogos_generales = DataProcSparkOperator(
    #     task_id = 'dpso_catalogos_generales',
    #     cluster_name = CLUSTER_NAME,
    #     dataproc_spark_properties = SPARK_SUBMIT_PROPERTIES,
    #     dataproc_spark_jars = [BUCKET_DATAPROC_SPARK_JARS + COMPONENT_CATALOGOS_GENERALES + '.jar'],
    #     main_class = NAMESPACE_MAIN_CLASS + '.' + COMPONENT_CATALOGOS_GENERALES,
    #     arguments = [PLATFORM, ENVIRONMENT_GCP, SIS_ORI_INFO]
    # )

    dpso_vdg_polizas_certificado = DataProcSparkOperator(
        task_id='dpso_vdg_polizas_certificado',
        cluster_name=CLUSTER_NAME,
        dataproc_spark_properties=SPARK_SUBMIT_PROPERTIES,
        dataproc_spark_jars=[
            BUCKET_DATAPROC_SPARK_JARS + COMPONENT_POLIZAS_CERTIFICADO + '.jar'
        ],
        main_class=NAMESPACE_MAIN_CLASS + '.' + COMPONENT_POLIZAS_CERTIFICADO,
        arguments=[PLATFORM, ENVIRONMENT_GCP, SIS_ORI_INFO])

    dpso_vdg_siniestros = DataProcSparkOperator(
        task_id='dpso_vdg_siniestros',
        cluster_name=CLUSTER_NAME,
        dataproc_spark_properties=SPARK_SUBMIT_PROPERTIES,
        dataproc_spark_jars=[
            BUCKET_DATAPROC_SPARK_JARS + COMPONENT_SINIESTROS + '.jar'
        ],
        main_class=NAMESPACE_MAIN_CLASS + '.' + COMPONENT_SINIESTROS,
        arguments=[PLATFORM, ENVIRONMENT_GCP, SIS_ORI_INFO])
Example #17
    dag=dag)

# =================
# == Spark Jobs ===
# =================

# define arguments
args = ["--args.for.jar", "ThisIsArgs"]

calc_unique_users = DataProcSparkOperator(
    task_id='calc_unique_users',
    dataproc_spark_jars=[
        'gs://exampleBucket/jar/yourProject-assembly-0.1.jar'
    ],
    main_class='yourProject.com.ActionProcess',
    region='us-west1',
    job_name=cleaned_dag_id + 'calc_unique_users',
    cluster_name=
    '{{ ti.xcom_pull(key=unique_cluster_name, task_ids="generate_unique_cluster_name") }}'
    + '1',
    execution_timeout=timedelta(hours=2),
    arguments=args,
    dag=dag)

# define arguments
args = ["--args.for.jar", "ThisIsArgs"]

calc_agg = DataProcSparkOperator(
    task_id='calc_agg',
    dataproc_spark_jars=[
        'gs://exampleBucket/jar/yourProject-assembly-0.1.jar'
    ],
Example #18
destroy_spark_cluster = DataprocClusterDeleteOperator(
    task_id='destroy_spark_cluster',
    trigger_rule='all_done',
    execution_timeout=timedelta(minutes=10),
    cluster_name=gcpClusterName,
    project_id=gcpProjectName,
    region=gcpRegion,
    dag=dag)


# Ia ETL Facebook
etl_facebook = DataProcSparkOperator(
    task_id='etl_facebook',
    execution_timeout=timedelta(minutes=30),
    cluster_name=gcpClusterName,
    dataproc_spark_jars=[gcpJar],
    main_class='etl.Ia_Facebook',
    arguments=[gcpDataStorage + "/data/facebook-nl.json",
               gcpDataStorage + "/data/facebook-nl.parquet"],
    dag=dag)

# Ib ETL Factual
etl_factual = DataProcSparkOperator(
    task_id='etl_factual',
    execution_timeout=timedelta(minutes=30),
    cluster_name=gcpClusterName,
    dataproc_spark_jars=[gcpJar],
    main_class='etl.Ib_Factual',
    arguments=[gcpDataStorage + "/data/factual-nl.json",
               gcpDataStorage + "/data/factual-nl.parquet"],
    dag=dag)
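The snippet defines the delete operator and two ETL jobs but omits the task dependencies. One plausible wiring, assuming a create_spark_cluster task (a DataprocClusterCreateOperator) defined earlier in the same DAG:

    # Hypothetical dependency wiring, not shown in the original snippet.
    create_spark_cluster >> [etl_facebook, etl_factual]
    [etl_facebook, etl_factual] >> destroy_spark_cluster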
Example #19
def moz_dataproc_scriptrunner(parent_dag_name=None,
                              dag_name='run_script_on_dataproc',
                              default_args=None,
                              cluster_name=None,
                              num_workers=2,
                              image_version='1.4',
                              zone='us-west1-b',
                              idle_delete_ttl='14400',
                              auto_delete_ttl='28800',
                              master_machine_type='n1-standard-8',
                              worker_machine_type='n1-standard-4',
                              num_preemptible_workers=0,
                              service_account=None,
                              init_actions_uris=None,
                              optional_components=['ANACONDA'],
                              install_component_gateway=True,
                              uri=None,
                              env=None,
                              arguments=None,
                              job_name=None,
                              aws_conn_id=None,
                              gcp_conn_id='google_cloud_airflow_dataproc'):
    """
    This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.
    Then we execute a script uri (either https or gcs) similar to how we use our custom AWS
    EmrSparkOperator. This will call DataProcSparkOperator using EMR's script-runner.jar, which
    then executes the airflow_gcp.sh entrypoint script. The entrypoint script expects another
    script URI, along with its arguments, as parameters. Once that succeeds, we tear down the
    cluster.

    **Example**: ::

        # Unsalted cluster name so subsequent runs fail if the cluster name exists
        cluster_name = 'test-dataproc-cluster-hwoo'

        # Defined in Airflow's UI -> Admin -> Connections
        gcp_conn_id = 'google_cloud_airflow_dataproc'

        run_dataproc_script = SubDagOperator(
            task_id='run_dataproc_script',
            dag=dag,
            subdag = moz_dataproc_scriptrunner(
                parent_dag_name=dag.dag_id,
                dag_name='run_dataproc_script',
                default_args=default_args,
                cluster_name=cluster_name,
                job_name='Run_a_script_on_dataproc',
                uri='https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/some_bash_or_py_script.py',
                env={"date": "{{ ds_nodash }}"},
                arguments="-d {{ ds_nodash }}",
                gcp_conn_id=gcp_conn_id)
        )

    Airflow related args:
    ---
    See moz_dataproc_pyspark_runner

    Dataproc Cluster related args:
    ---
    See moz_dataproc_pyspark_runner

    Scriptrunner specific args:
    ---
    :param str uri:                     The HTTP or GCS URI of the script to run. Can be
                                        .py, .jar, or another type of script (e.g. bash). It is run
                                        via the airflow_gcp.sh entrypoint. Ipynb is no longer
                                        supported.
    :param dict env:                    If env is not None, it must be a mapping that defines
                                        the environment variables for the new process 
                                        (templated).
    :param str arguments:               Passed to `airflow_gcp.sh` as one long string
                                        of space-separated args.

    """

    if job_name is None or uri is None or cluster_name is None:
        raise AirflowException(
            'Please specify job_name, uri, and cluster_name.')

    dataproc_helper = DataProcHelper(
        cluster_name=cluster_name,
        num_workers=num_workers,
        image_version=image_version,
        zone=zone,
        idle_delete_ttl=idle_delete_ttl,
        auto_delete_ttl=auto_delete_ttl,
        master_machine_type=master_machine_type,
        worker_machine_type=worker_machine_type,
        num_preemptible_workers=num_preemptible_workers,
        service_account=service_account,
        init_actions_uris=init_actions_uris,
        optional_components=optional_components,
        install_component_gateway=install_component_gateway,
        aws_conn_id=aws_conn_id,
        gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)
    environment = _format_envvar(env)

    script_bucket = 'moz-fx-data-prod-airflow-dataproc-artifacts'
    jar_url = 'gs://{}/bin/script-runner.jar'.format(script_bucket)

    args = [
        'gs://{}/bootstrap/airflow_gcp.sh'.format(script_bucket), '--job-name',
        job_name, '--uri', uri, '--environment', environment
    ]

    if arguments:
        args += ['--arguments', arguments]

    with models.DAG(_dag_name, default_args=default_args) as dag:
        create_dataproc_cluster = dataproc_helper.create_cluster()

        # Run DataProcSparkOperator with script-runner.jar pointing to airflow_gcp.sh.
        # Note - When we upgrade to a later version of Airflow that pulls in latest
        # DataProcSparkOperator code, use the argument main_jar=jar_url instead, and
        # remove arguments main_class and dataproc_spark_jars.
        run_script_on_dataproc = DataProcSparkOperator(
            cluster_name=cluster_name,
            task_id='run_script_on_dataproc',
            job_name=job_name,
            dataproc_spark_jars=[jar_url],
            main_class='com.amazon.elasticmapreduce.scriptrunner.ScriptRunner',
            arguments=args,
            gcp_conn_id=gcp_conn_id)

        delete_dataproc_cluster = dataproc_helper.delete_cluster()

        create_dataproc_cluster >> run_script_on_dataproc >> delete_dataproc_cluster
        return dag
Example #20
}  # Dict of Spark job properties

DATAPROC_SPARK_JARS = ['gs://example-bucket/runnableJars/example-jar.jar']

date_tuple = dynamic_date(3)  # processing data from 3 days ago mimics a lag in arrival and processing of data

run_spark_job = DataProcSparkOperator(
    dag=dag,
    arguments=[
        "gs://example-source-bucket/year=" + date_tuple['year'] + "/month=" +
        date_tuple['month'] + "/day=" + date_tuple['day'] + "/*",
        "gs://example-sink-bucket/dir1/year=" + date_tuple['year'] +
        "/month=" + date_tuple['month'] + "/day=" + date_tuple['date'] + "/"
    ],
    region="us-central1",
    task_id='example-spark-job',
    dataproc_spark_jars=DATAPROC_SPARK_JARS,
    dataproc_spark_properties=DATAPROC_SPARK_PROP,
    cluster_name='example-{{ ds }}',
    main_class='[Path-to-Main-Class]',
)

load_to_bq = GoogleCloudStorageToBigQueryOperator(
    bucket="example-bucket",
    source_objects=[
        "gs://example-sink-bucket/dir1/year=" + date_tuple['year'] +
        "/month=" + date_tuple['month'] + "/day=" + date_tuple['date'] +
        "/*.parquet"
    ],
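dynamic_date is a helper defined elsewhere in that project. A hypothetical implementation matching the keys used above ('year', 'month', 'day', 'date'):

    # Hypothetical sketch of dynamic_date(): zero-padded date parts for a date N days
    # in the past. Not from the original source; key names mirror the usage above.
    from datetime import datetime, timedelta

    def dynamic_date(days_lag):
        d = datetime.utcnow() - timedelta(days=days_lag)
        return {
            'year': d.strftime('%Y'),
            'month': d.strftime('%m'),
            'day': d.strftime('%d'),
            'date': d.strftime('%d'),  # the snippet references both 'day' and 'date'
        }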
Example #21
def reprocess_parquet(parent_dag_name,
                      default_args,
                      reprocess,
                      gcp_conn_id,
                      gcs_buckets,
                      objects_prefix,
                      date_submission_col,
                      dataset,
                      dataset_version,
                      gs_dataset_location=None,
                      dataproc_zone='us-central1-a',
                      dag_name='reprocess_parquet',
                      num_preemptible_workers=10):

    """ Reprocess Parquet datasets to conform with BigQuery Parquet loader.

    This function should be invoked as part of `load_to_bigquery`.

    https://github.com/mozilla-services/spark-parquet-to-bigquery/blob/master/src/main/scala/com/mozilla/dataops/spark/TransformParquet.scala ## noqa

    :param str parent_dag_name:            parent dag name
    :param dict default_args:              dag configuration
    :param str gcp_conn_id:                airflow connection id for GCP access
    :param dict gcs_buckets:               source and dest GCS buckets for reprocess
    :param str dataset:                    dataset name
    :param str dataset_version:            dataset version
    :param str objects_prefix:             objects location
    :param str date_submission_col:        dataset date submission column
    :param str dataproc_zone:              GCP zone to launch dataproc clusters
    :param str dag_name:                   name of dag
    :param int num_preemptible_workers:    number of dataproc cluster workers to provision
    :param bool reprocess:                 enable dataset reprocessing. defaults to False
    :param str gs_dataset_location:        override source location, defaults to None

    :return airflow.models.DAG
    """

    JAR = [
        'gs://moz-fx-data-derived-datasets-parquet-tmp/jars/spark-parquet-to-bigquery-assembly-1.0.jar' # noqa
    ]

    if gs_dataset_location:
        _gs_dataset_location = gs_dataset_location
    else:
        _gs_dataset_location = 'gs://{}/{}'.format(gcs_buckets['transfer'],
                                                   objects_prefix)

    cluster_name = '{}-{}'.format(dataset.replace('_', '-'),
                                  dataset_version) + '-{{ ds_nodash }}'

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    spark_args = [
        '--files', _gs_dataset_location,
        '--submission-date-col', date_submission_col,
        '--gcp-project-id', connection.project_id,
        '--gcs-bucket', 'gs://{}'.format(gcs_buckets['load']),
    ]

    _dag_name = '%s.%s' % (parent_dag_name, dag_name)

    with models.DAG(
            _dag_name,
            default_args=default_args) as dag:

        if reprocess:
            create_dataproc_cluster = DataprocClusterCreateOperator(
                task_id='create_dataproc_cluster',
                cluster_name=cluster_name,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                num_workers=2,
                image_version='1.3',
                storage_bucket=gcs_buckets['transfer'],
                zone=dataproc_zone,
                master_machine_type='n1-standard-8',
                worker_machine_type='n1-standard-8',
                num_preemptible_workers=num_preemptible_workers,
                metadata={
                    'gcs-connector-version': '1.9.6',
                    'bigquery-connector-version': '0.13.6'
                    })

            run_dataproc_spark = DataProcSparkOperator(
                task_id='run_dataproc_spark',
                cluster_name=cluster_name,
                dataproc_spark_jars=JAR,
                main_class='com.mozilla.dataops.spark.TransformParquet',
                arguments=spark_args,
                gcp_conn_id=gcp_conn_id)

            delete_dataproc_cluster = DataprocClusterDeleteOperator(
                task_id='delete_dataproc_cluster',
                cluster_name=cluster_name,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

            create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster # noqa

        else:
            DummyOperator(task_id='no_reprocess')

        return dag
Example #22
    # Note: this operator will be flagged as a success if a cluster by this name already exists.
    create_cluster = DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # ds_nodash is an airflow macro for "[Execution] Date string no dashes"
        # in YYYYMMDD format. See docs https://airflow.apache.org/code.html?highlight=macros#macros
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        image_version='1.5-debian10',
        num_workers=2,
        storage_bucket=CLUSTER_STORAGE_BUCKET,
        region=REGION,
        zone=ZONE)

    # Submit our Spark Job
    submit_scalaspark = DataProcSparkOperator(
        task_id=TASK_ID,
        region=REGION,
        main_class=MAIN_CLASS,
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        dataproc_spark_jars=MAIN_JAR)

    # Delete the Cloud Dataproc cluster.
    delete_cluster = DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        region=REGION,
        # Obviously needs to match the name of the cluster created in the prior two Operators.
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        # This will tear down the cluster even if there are failures in upstream tasks.
        trigger_rule=TriggerRule.ALL_DONE)

    create_cluster.dag = dag

    create_cluster.set_downstream(submit_scalaspark)