def test_hook_correct_region():
    with patch(HOOK) as mock_hook:
        dataproc_task = DataProcSparkOperator(task_id=TASK_ID, region=GCP_REGION)

        dataproc_task.execute(None)
        mock_hook.return_value.submit.assert_called_once_with(
            mock.ANY, mock.ANY, GCP_REGION, mock.ANY)

def test_hook_correct_region(self):
    with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
        dataproc_task = DataProcSparkOperator(
            task_id=TASK_ID,
            region=REGION
        )

        dataproc_task.execute(None)
        mock_hook.return_value.submit.assert_called_once_with(mock.ANY, mock.ANY, REGION)

def test_hook_correct_region(self):
    with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
        dataproc_task = DataProcSparkOperator(task_id=TASK_ID, region=REGION)

        dataproc_task.execute(None)
        mock_hook.return_value.submit.assert_called_once_with(
            mock.ANY, mock.ANY, REGION)

def test_hook_correct_region():
    with patch(HOOK) as mock_hook:
        dataproc_task = DataProcSparkOperator(
            task_id=TASK_ID,
            region=GCP_REGION
        )

        dataproc_task.execute(None)
        mock_hook.return_value.submit.assert_called_once_with(
            mock.ANY, mock.ANY, GCP_REGION, mock.ANY)

def test_dataproc_job_id_is_set():
    with patch(HOOK) as mock_hook:
        dataproc_task = DataProcSparkOperator(
            task_id=TASK_ID
        )

        _assert_dataproc_job_id(mock_hook, dataproc_task)

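These test snippets reference module-level names (HOOK, TASK_ID, GCP_REGION/REGION) and an _assert_dataproc_job_id helper that are not shown. A minimal sketch of that scaffolding, with the constant values and the helper's behavior treated as assumptions rather than the original test module's code:

from unittest import mock
from unittest.mock import patch

from airflow.contrib.operators.dataproc_operator import DataProcSparkOperator

# Assumed patch target and constants; the tests above use them but do not define them here.
HOOK = 'airflow.contrib.operators.dataproc_operator.DataProcHook'
TASK_ID = 'test_task'
GCP_REGION = 'us-central1'


def _assert_dataproc_job_id(mock_hook, dataproc_task):
    # Assumed helper: run the operator against the mocked hook and check that the
    # submitted job payload carries a job id.
    dataproc_task.execute(None)
    mock_hook.return_value.submit.assert_called_once()
    submitted_job = mock_hook.return_value.submit.call_args[0][1]
    assert submitted_job['job']['reference']['jobId']
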
def getSparkJobHandleWithSparkProperties(taskID, mainClass):
    return DataProcSparkOperator(task_id=taskID,
                                 region=REGION,
                                 main_class=mainClass,
                                 cluster_name=CLUSTER_NAME,
                                 dataproc_spark_jars=[MAIN_JAR, BIG_QUERY_JAR],
                                 dataproc_spark_properties=SPARK_PROPERTIES)

def test_correct_job_definition(self, mock_hook, mock_uuid):
    # Expected job
    job_definition = deepcopy(DATAPROC_JOB_TO_SUBMIT)
    job_definition['job']['sparkJob'] = {'mainClass': 'main_class'}
    job_definition['job']['reference']['projectId'] = None
    job_definition['job']['reference']['jobId'] = DATAPROC_JOB_ID + "_test"

    # Prepare job using operator
    task = DataProcSparkOperator(
        task_id=TASK_ID,
        region=GCP_REGION,
        cluster_name=CLUSTER_NAME,
        job_name=DATAPROC_JOB_ID,
        labels=LABELS,
        main_class="main_class"
    )

    task.execute(context=None)
    self.assertDictEqual(job_definition, task.job_template.job)

create_dataproc_cluster = DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    project_id=project_id,
    region='us-west1',
    master_machine_type='n1-standard-2',
    worker_machine_type='n1-standard-2',
    num_workers=2,
    # Get the cluster name from XCom in the template.
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push_cluster_name") }}')

run_collection_analysis_job = DataProcSparkOperator(
    task_id='start_collection_analysis_spark_job',
    main_class='com.makoto.spark.LoanAnalyze',
    # dataproc_spark_jars expects a list of jar URIs.
    dataproc_spark_jars=['gs://creditclub/CreditClub-assembly-0.1.jar'],
    arguments=[
        "input_load_stats_csv_path",
        "input_rejection_stats_csv_path",
        "output_path"
    ],
    job_name='creditanalysis',
    region='us-west1',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push_cluster_name") }}')

delete_dataproc_cluster = DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    project_id=project_id,
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push_cluster_name") }}',
    region='us-west1',
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# dependency
push_cluster_name_op >> create_dataproc_cluster >> run_collection_analysis_job >> delete_dataproc_cluster

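The three tasks above pull the cluster name from XCom, and the dependency chain starts from a push_cluster_name_op task that is not shown. A minimal sketch of such a task; the naming scheme and PythonOperator wiring are illustrative assumptions:

from airflow.operators.python_operator import PythonOperator


def push_cluster_name(**context):
    # Illustrative naming scheme: push a date-stamped cluster name to XCom so the
    # create/submit/delete tasks can all template it via ti.xcom_pull.
    ds_nodash = context['execution_date'].strftime('%Y%m%d')
    context['ti'].xcom_push(key='cluster_name', value='ephemeral-spark-cluster-' + ds_nodash)


push_cluster_name_op = PythonOperator(
    task_id='push_cluster_name',
    python_callable=push_cluster_name,
    provide_context=True,
    dag=dag)
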
    region='us-central1',
    master_machine_type='n1-standard-2',
    worker_machine_type='n1-standard-2',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
    num_workers=2)

# The task of running the Spark job.
dataproc_spark_process = DataProcSparkOperator(
    task_id='dataproc-test',
    dataproc_spark_jars=['gs://lendingclub12/LendingClub-assembly-0.1.jar'],
    main_class='p2p_data_analysis.spark.LoanDataAnalyzer',
    job_name='loan',
    region='us-central1',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
    arguments=[
        "gs://lendingclub12/LoanStats_2019Q1.csv",
        "gs://lendingclub12/RejectStats_2019Q1.csv",
        "gs://lendingclub12/output"
    ])

# The task of deleting the cluster.
dataproc_destroy_cluster = DataprocClusterDeleteOperator(
    task_id='dataproc-destroy-cluster',
    project_id='silicon-parity-282607',
    region='us-central1',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',

    project_id='makoto0908spark',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}' + '4',
    region='us-west1',
    execution_timeout=timedelta(minutes=30),
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

args = ["--process.date", "{{ (execution_date).strftime('%Y-%m-%d') }}"]
unique_user = DataProcSparkOperator(
    task_id='unique_user',
    dataproc_spark_jars=['gs://path/jar/CohortAnalysis.jar'],
    main_class='com.makoto.spark.process.UserGenerateProcess',
    region='us-west1',
    job_name=dag_name + 'unique_user',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}' + '1',
    execution_timeout=timedelta(minutes=180),
    arguments=args)

args = ["--process.date", "{{ (execution_date).strftime('%Y-%m-%d') }}"]
bike_share_aggregator = DataProcSparkOperator(
    task_id='bike_share_aggregator',
    dataproc_spark_jars=['gs://path/jar/CohortAnalysis.jar'],
    main_class='com.makoto.spark.process.BikeTripProcess',
    region='us-west1',
    job_name=dag_name + 'bike_share_aggregator',
    cluster_name=

def moz_dataproc_jar_runner(parent_dag_name=None,
                            dag_name='run_script_on_dataproc',
                            default_args=None,
                            cluster_name=None,
                            num_workers=2,
                            image_version='1.4',
                            zone='us-west1-b',
                            idle_delete_ttl='14400',
                            auto_delete_ttl='28800',
                            master_machine_type='n1-standard-8',
                            worker_machine_type='n1-standard-4',
                            num_preemptible_workers=0,
                            service_account=None,
                            init_actions_uris=None,
                            optional_components=['ANACONDA'],
                            install_component_gateway=True,
                            jar_urls=None,
                            main_class=None,
                            jar_args=None,
                            job_name=None,
                            aws_conn_id=None,
                            gcp_conn_id='google_cloud_airflow_dataproc'):
    """
    This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.
    Then we call DataProcSparkOperator to execute the jar defined by the arguments jar_urls
    and main_class. Once that succeeds, we tear down the cluster.

    **Example**: ::

        # Unsalted cluster name so subsequent runs fail if the cluster name exists
        cluster_name = 'test-dataproc-cluster-hwoo'

        # Defined in Airflow's UI -> Admin -> Connections
        gcp_conn_id = 'google_cloud_airflow_dataproc'

        run_dataproc_jar = SubDagOperator(
            task_id='run_dataproc_jar',
            dag=dag,
            subdag=moz_dataproc_jar_runner(
                parent_dag_name=dag.dag_id,
                dag_name='run_dataproc_jar',
                job_name='Run_some_spark_jar_on_dataproc',
                default_args=default_args,
                cluster_name=cluster_name,
                jar_urls=['gs://some_bucket/some_jar.jar'],
                main_class='com.mozilla.path.to.ClassName',
                jar_args=["-d", "{{ ds_nodash }}"],
                gcp_conn_id=gcp_conn_id)
        )

    Airflow related args:
    ---
    See moz_dataproc_pyspark_runner

    Dataproc Cluster related args:
    ---
    See moz_dataproc_pyspark_runner

    Jar runner related args:
    ---
    :param list jar_urls: URIs to jars provisioned in Cloud Storage (e.g. for UDFs and libs);
        these are ideal to put in default arguments.
    :param str main_class: Name of the job class entrypoint to execute.
    :param list jar_args: Arguments for the job.

    """
    if cluster_name is None or jar_urls is None or main_class is None:
        raise AirflowException(
            'Please specify cluster_name, jar_urls, and/or main_class.')

    dataproc_helper = DataProcHelper(cluster_name=cluster_name,
                                     num_workers=num_workers,
                                     image_version=image_version,
                                     zone=zone,
                                     idle_delete_ttl=idle_delete_ttl,
                                     auto_delete_ttl=auto_delete_ttl,
                                     master_machine_type=master_machine_type,
                                     worker_machine_type=worker_machine_type,
                                     num_preemptible_workers=num_preemptible_workers,
                                     service_account=service_account,
                                     init_actions_uris=init_actions_uris,
                                     optional_components=optional_components,
                                     install_component_gateway=install_component_gateway,
                                     aws_conn_id=aws_conn_id,
                                     gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        create_dataproc_cluster = dataproc_helper.create_cluster()

        # Note - When we upgrade to a later version of Airflow that pulls in latest
        # DataProcSparkOperator code, use the argument main_jar=jar_url instead, and
        # remove arguments main_class and dataproc_spark_jars.
        run_jar_on_dataproc = DataProcSparkOperator(
            cluster_name=cluster_name,
            task_id='run_jar_on_dataproc',
            job_name=job_name,
            dataproc_spark_jars=jar_urls,
            main_class=main_class,
            arguments=jar_args,
            gcp_conn_id=gcp_conn_id)

        delete_dataproc_cluster = dataproc_helper.delete_cluster()

        create_dataproc_cluster >> run_jar_on_dataproc >> delete_dataproc_cluster
        return dag

    project_id='sinuous-set-242504',
    region='us-west1',
    master_machine_type='n1-standard-2',
    worker_machine_type='n1-standard-2',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
    num_workers=2)

dataproc_spark_process = DataProcSparkOperator(
    task_id='dataproc-test',
    dataproc_spark_jars=['gs://jiuzhangsuanfa/SparkProject-assembly-0.1.jar'],
    main_class='com.jiuzhang.spark.LoanAnalyze',
    job_name='loan',
    region='us-west1',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
    arguments=[
        "gs://jiuzhangsuanfa/LendingClub/LoanStats_2019Q1.csv",
        "gs://jiuzhangsuanfa/LendingClub/RejectStats_2019Q1.csv",
        "gs://jiuzhangsuanfa/output"
    ])

dataproc_destroy_cluster = DataprocClusterDeleteOperator(
    task_id='dataproc-destroy-cluster',
    project_id='sinuous-set-242504',
    region='us-west1',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

def getSparkJobHandle(taskID, mainClass):
    return DataProcSparkOperator(task_id=taskID,
                                 region=REGION,
                                 main_class=mainClass,
                                 cluster_name=CLUSTER_NAME,
                                 dataproc_spark_jars=[MAIN_JAR, BIG_QUERY_JAR])

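A hypothetical way these two factory helpers might be wired into a DAG; the DAG name, task ids, main classes, and default_args below are illustrative, not taken from the original project:

with models.DAG('spark_jobs_example', default_args=default_args) as dag:
    ingest = getSparkJobHandle('ingest_raw_events', 'com.example.IngestJob')
    enrich = getSparkJobHandleWithSparkProperties('enrich_events', 'com.example.EnrichJob')

    # Both helpers return ordinary operators, so the usual dependency syntax applies.
    ingest >> enrich
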
    task_id='dataproc_destroy_cluster_4',
    project_id='sinuous-set-242504',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}' + '4',
    region='us-west1',
    execution_timeout=timedelta(minutes=30),
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

args = ["--process.date", "{{ (execution_date).strftime('%Y-%m-%d') }}"]
unique_user = DataProcSparkOperator(
    task_id='unique_user',
    dataproc_spark_jars=['gs://jiuzhangsuanfa/jar/CohortProject-assembly-0.1.jar'],
    main_class='com.cohort.process.UserProcess',
    region='us-west1',
    job_name=dag_name + 'unique_user',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}' + '1',
    execution_timeout=timedelta(minutes=180),
    arguments=args)

args = ["--process.date", "{{ (execution_date).strftime('%Y-%m-%d') }}"]
bike_share_aggregator = DataProcSparkOperator(
    task_id='bike_share_aggregator',
    dataproc_spark_jars=['gs://jiuzhangsuanfa/jar/CohortProject-assembly-0.1.jar'],
    main_class='com.cohort.process.BikeShareProcess',
    region='us-west1',

    query="define sin HiveUDF('sin');",
    region=REGION,
    cluster_name=CLUSTER_NAME,
)

spark_sql_task = DataProcSparkSqlOperator(
    task_id="spark_sql_task",
    query="SHOW DATABASES;",
    region=REGION,
    cluster_name=CLUSTER_NAME,
)

spark_task = DataProcSparkOperator(
    task_id="spark_task",
    main_class="org.apache.spark.examples.SparkPi",
    dataproc_jars="file:///usr/lib/spark/examples/jars/spark-examples.jar",
    region=REGION,
    cluster_name=CLUSTER_NAME,
)

pyspark_task = DataProcPySparkOperator(
    task_id="pyspark_task",
    main=PYSPARK_URI,
    region=REGION,
    cluster_name=CLUSTER_NAME,
)

hive_task = DataProcHiveOperator(
    task_id="hive_task",
    query="SHOW DATABASES;",
    region=REGION,

    })

# dpso_catalogos_generales = DataProcSparkOperator(
#     task_id='dpso_catalogos_generales',
#     cluster_name=CLUSTER_NAME,
#     dataproc_spark_properties=SPARK_SUBMIT_PROPERTIES,
#     dataproc_spark_jars=[BUCKET_DATAPROC_SPARK_JARS + COMPONENT_CATALOGOS_GENERALES + '.jar'],
#     main_class=NAMESPACE_MAIN_CLASS + '.' + COMPONENT_CATALOGOS_GENERALES,
#     arguments=[PLATFORM, ENVIRONMENT_GCP, SIS_ORI_INFO]
# )

dpso_vdg_polizas_certificado = DataProcSparkOperator(
    task_id='dpso_vdg_polizas_certificado',
    cluster_name=CLUSTER_NAME,
    dataproc_spark_properties=SPARK_SUBMIT_PROPERTIES,
    dataproc_spark_jars=[
        BUCKET_DATAPROC_SPARK_JARS + COMPONENT_POLIZAS_CERTIFICADO + '.jar'
    ],
    main_class=NAMESPACE_MAIN_CLASS + '.' + COMPONENT_POLIZAS_CERTIFICADO,
    arguments=[PLATFORM, ENVIRONMENT_GCP, SIS_ORI_INFO])

dpso_vdg_siniestros = DataProcSparkOperator(
    task_id='dpso_vdg_siniestros',
    cluster_name=CLUSTER_NAME,
    dataproc_spark_properties=SPARK_SUBMIT_PROPERTIES,
    dataproc_spark_jars=[
        BUCKET_DATAPROC_SPARK_JARS + COMPONENT_SINIESTROS + '.jar'
    ],
    main_class=NAMESPACE_MAIN_CLASS + '.' + COMPONENT_SINIESTROS,
    arguments=[PLATFORM, ENVIRONMENT_GCP, SIS_ORI_INFO])

    dag=dag)

# =================
# == Spark Jobs ===
# =================

# define arguments
args = ["--args.for.jar", "ThisIsArgs"]
calc_unique_users = DataProcSparkOperator(
    task_id='calc_unique_users',
    dataproc_spark_jars=['gs://exampleBucket/jar/yourProject-assembly-0.1.jar'],
    main_class='yourProject.com.ActionProcess',
    region='us-west1',
    job_name=cleaned_dag_id + 'calc_unique_users',
    cluster_name='{{ ti.xcom_pull(key=unique_cluster_name, task_ids="generate_unique_cluster_name") }}' + '1',
    execution_timeout=timedelta(hours=2),
    arguments=args,
    dag=dag)

# define arguments
args = ["--args.for.jar", "ThisIsArgs"]
calc_agg = DataProcSparkOperator(
    task_id='calc_agg',
    dataproc_spark_jars=['gs://exampleBucket/jar/yourProject-assembly-0.1.jar'],

destroy_spark_cluster = DataprocClusterDeleteOperator(
    task_id='destroy_spark_cluster',
    trigger_rule='all_done',
    execution_timeout=timedelta(minutes=10),
    cluster_name=gcpClusterName,
    project_id=gcpProjectName,
    region=gcpRegion,
    dag=dag)

# Ia ETL Facebook
etl_facebook = DataProcSparkOperator(
    task_id='etl_facebook',
    execution_timeout=timedelta(minutes=30),
    cluster_name=gcpClusterName,
    dataproc_spark_jars=[gcpJar],
    main_class='etl.Ia_Facebook',
    arguments=[gcpDataStorage + "/data/facebook-nl.json",
               gcpDataStorage + "/data/facebook-nl.parquet"],
    dag=dag)

# Ib ETL Factual
etl_factual = DataProcSparkOperator(
    task_id='etl_factual',
    execution_timeout=timedelta(minutes=30),
    cluster_name=gcpClusterName,
    dataproc_spark_jars=[gcpJar],
    main_class='etl.Ib_Factual',
    arguments=[gcpDataStorage + "/data/factual-nl.json",
               gcpDataStorage + "/data/factual-nl.parquet"],
    dag=dag)

def moz_dataproc_scriptrunner(parent_dag_name=None,
                              dag_name='run_script_on_dataproc',
                              default_args=None,
                              cluster_name=None,
                              num_workers=2,
                              image_version='1.4',
                              zone='us-west1-b',
                              idle_delete_ttl='14400',
                              auto_delete_ttl='28800',
                              master_machine_type='n1-standard-8',
                              worker_machine_type='n1-standard-4',
                              num_preemptible_workers=0,
                              service_account=None,
                              init_actions_uris=None,
                              optional_components=['ANACONDA'],
                              install_component_gateway=True,
                              uri=None,
                              env=None,
                              arguments=None,
                              job_name=None,
                              aws_conn_id=None,
                              gcp_conn_id='google_cloud_airflow_dataproc'):
    """
    This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.
    Then we execute a script uri (either https or gcs) similar to how we use our custom AWS
    EmrSparkOperator. This will call DataProcSparkOperator using EMR's script-runner.jar, which
    then executes the airflow_gcp.sh entrypoint script. The entrypoint script expects another
    script uri, along with its arguments, as parameters. Once that succeeds, we tear down the
    cluster.

    **Example**: ::

        # Unsalted cluster name so subsequent runs fail if the cluster name exists
        cluster_name = 'test-dataproc-cluster-hwoo'

        # Defined in Airflow's UI -> Admin -> Connections
        gcp_conn_id = 'google_cloud_airflow_dataproc'

        run_dataproc_script = SubDagOperator(
            task_id='run_dataproc_script',
            dag=dag,
            subdag=moz_dataproc_scriptrunner(
                parent_dag_name=dag.dag_id,
                dag_name='run_dataproc_script',
                default_args=default_args,
                cluster_name=cluster_name,
                job_name='Run_a_script_on_dataproc',
                uri='https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/some_bash_or_py_script.py',
                env={"date": "{{ ds_nodash }}"},
                arguments="-d {{ ds_nodash }}",
                gcp_conn_id=gcp_conn_id)
        )

    Airflow related args:
    ---
    See moz_dataproc_pyspark_runner

    Dataproc Cluster related args:
    ---
    See moz_dataproc_pyspark_runner

    Scriptrunner specific args:
    ---
    :param str uri: The HTTP or GCS URI of the script to run. Can be .py, .jar, or another
        type of script (e.g. bash). Is run via the airflow_gcp.sh entrypoint. Ipynb is no
        longer supported.
    :param dict env: If env is not None, it must be a mapping that defines the environment
        variables for the new process (templated).
    :param str arguments: Passed to airflow_gcp.sh as one long string of space-separated args.

    """
    if job_name is None or uri is None or cluster_name is None:
        raise AirflowException(
            'Please specify job_name, uri, and cluster_name.')

    dataproc_helper = DataProcHelper(cluster_name=cluster_name,
                                     num_workers=num_workers,
                                     image_version=image_version,
                                     zone=zone,
                                     idle_delete_ttl=idle_delete_ttl,
                                     auto_delete_ttl=auto_delete_ttl,
                                     master_machine_type=master_machine_type,
                                     worker_machine_type=worker_machine_type,
                                     num_preemptible_workers=num_preemptible_workers,
                                     service_account=service_account,
                                     init_actions_uris=init_actions_uris,
                                     optional_components=optional_components,
                                     install_component_gateway=install_component_gateway,
                                     aws_conn_id=aws_conn_id,
                                     gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)
    environment = _format_envvar(env)

    script_bucket = 'moz-fx-data-prod-airflow-dataproc-artifacts'
    jar_url = 'gs://{}/bin/script-runner.jar'.format(script_bucket)

    args = ['gs://{}/bootstrap/airflow_gcp.sh'.format(script_bucket),
            '--job-name', job_name,
            '--uri', uri,
            '--environment', environment]

    if arguments:
        args += ['--arguments', arguments]

    with models.DAG(_dag_name, default_args=default_args) as dag:
        create_dataproc_cluster = dataproc_helper.create_cluster()

        # Run DataprocSparkOperator with script-runner.jar pointing to airflow_gcp.sh.
        # Note - When we upgrade to a later version of Airflow that pulls in latest
        # DataProcSparkOperator code, use the argument main_jar=jar_url instead, and
        # remove arguments main_class and dataproc_spark_jars.
        run_script_on_dataproc = DataProcSparkOperator(
            cluster_name=cluster_name,
            task_id='run_script_on_dataproc',
            job_name=job_name,
            dataproc_spark_jars=[jar_url],
            main_class='com.amazon.elasticmapreduce.scriptrunner.ScriptRunner',
            arguments=args,
            gcp_conn_id=gcp_conn_id)

        delete_dataproc_cluster = dataproc_helper.delete_cluster()

        create_dataproc_cluster >> run_script_on_dataproc >> delete_dataproc_cluster
        return dag

}

# Dict mentioning Spark job's properties
DATAPROC_SPARK_JARS = ['gs://example-bucket/runnableJars/example-jar.jar']

# Suppose we are processing data from 3 days ago - mimics a lag in arrival and processing of data
date_tuple = dynamic_date(3)

run_spark_job = DataProcSparkOperator(
    dag=dag,
    arguments=[
        "gs://example-source-bucket/year=" + date_tuple['year'] + "/month=" +
        date_tuple['month'] + "/day=" + date_tuple['day'] + "/*",
        "gs://example-sink-bucket/dir1/year=" + date_tuple['year'] + "/month=" +
        date_tuple['month'] + "/day=" + date_tuple['date'] + "/"
    ],
    region="us-central1",
    task_id='example-spark-job',
    dataproc_spark_jars=DATAPROC_SPARK_JARS,
    dataproc_spark_properties=DATAPROC_SPARK_PROP,
    cluster_name='example-{{ ds }}',
    main_class='[Path-to-Main-Class]',
)

load_to_bq = GoogleCloudStorageToBigQueryOperator(
    bucket="example-bucket",
    source_objects=[
        "gs://example-sink-bucket/dir1/year=" + date_tuple['year'] + "/month=" +
        date_tuple['month'] + "/day=" + date_tuple['date'] + "/*.parquet"
    ],

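The snippet relies on a dynamic_date helper that is not shown. A minimal sketch, assuming it returns zero-padded 'year'/'month'/'day' strings for a date N days in the past (and treating the 'date' key used in the sink path as an alias for 'day'):

from datetime import datetime, timedelta


def dynamic_date(days_ago):
    # Assumed helper: zero-padded date parts for `days_ago` days in the past,
    # used to compose the year=/month=/day= partition paths above.
    target = datetime.utcnow() - timedelta(days=days_ago)
    parts = {
        'year': target.strftime('%Y'),
        'month': target.strftime('%m'),
        'day': target.strftime('%d'),
    }
    parts['date'] = parts['day']  # the snippet indexes both 'day' and 'date'
    return parts
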
def reprocess_parquet(parent_dag_name,
                      default_args,
                      reprocess,
                      gcp_conn_id,
                      gcs_buckets,
                      objects_prefix,
                      date_submission_col,
                      dataset,
                      dataset_version,
                      gs_dataset_location=None,
                      dataproc_zone='us-central1-a',
                      dag_name='reprocess_parquet',
                      num_preemptible_workers=10):
    """
    Reprocess Parquet datasets to conform with the BigQuery Parquet loader.

    This function should be invoked as part of `load_to_bigquery`.

    https://github.com/mozilla-services/spark-parquet-to-bigquery/blob/master/src/main/scala/com/mozilla/dataops/spark/TransformParquet.scala # noqa

    :param str parent_dag_name: parent dag name
    :param dict default_args: dag configuration
    :param str gcp_conn_id: airflow connection id for GCP access
    :param dict gcs_buckets: source and dest gcs buckets for reprocess
    :param str dataset: dataset name
    :param str dataset_version: dataset version
    :param str objects_prefix: objects location
    :param str date_submission_col: dataset date submission column
    :param str dataproc_zone: GCP zone to launch dataproc clusters
    :param str dag_name: name of dag
    :param int num_preemptible_workers: number of dataproc cluster workers to provision
    :param bool reprocess: enable dataset reprocessing, defaults to False
    :param str gs_dataset_location: override source location, defaults to None

    :return: airflow.models.DAG
    """
    JAR = [
        'gs://moz-fx-data-derived-datasets-parquet-tmp/jars/spark-parquet-to-bigquery-assembly-1.0.jar'  # noqa
    ]

    if gs_dataset_location:
        _gs_dataset_location = gs_dataset_location
    else:
        _gs_dataset_location = 'gs://{}/{}'.format(gcs_buckets['transfer'],
                                                   objects_prefix)

    cluster_name = '{}-{}'.format(dataset.replace('_', '-'),
                                  dataset_version) + '-{{ ds_nodash }}'

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    spark_args = [
        '--files', _gs_dataset_location,
        '--submission-date-col', date_submission_col,
        '--gcp-project-id', connection.project_id,
        '--gcs-bucket', 'gs://{}'.format(gcs_buckets['load']),
    ]

    _dag_name = '%s.%s' % (parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if reprocess:
            create_dataproc_cluster = DataprocClusterCreateOperator(
                task_id='create_dataproc_cluster',
                cluster_name=cluster_name,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                num_workers=2,
                image_version='1.3',
                storage_bucket=gcs_buckets['transfer'],
                zone=dataproc_zone,
                master_machine_type='n1-standard-8',
                worker_machine_type='n1-standard-8',
                num_preemptible_workers=num_preemptible_workers,
                metadata={
                    'gcs-connector-version': '1.9.6',
                    'bigquery-connector-version': '0.13.6'
                })

            run_dataproc_spark = DataProcSparkOperator(
                task_id='run_dataproc_spark',
                cluster_name=cluster_name,
                dataproc_spark_jars=JAR,
                main_class='com.mozilla.dataops.spark.TransformParquet',
                arguments=spark_args,
                gcp_conn_id=gcp_conn_id)

            delete_dataproc_cluster = DataprocClusterDeleteOperator(
                task_id='delete_dataproc_cluster',
                cluster_name=cluster_name,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

            create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster  # noqa

        else:
            DummyOperator(task_id='no_reprocess')

        return dag

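The docstring says this is invoked as part of load_to_bigquery. A hypothetical SubDagOperator invocation in the same style as the runner examples above; the connection id, bucket names, dataset identifiers, and the parent dag/default_args objects are all placeholders, not taken from the source:

reprocess_task = SubDagOperator(
    task_id='reprocess_parquet',
    dag=dag,
    subdag=reprocess_parquet(
        parent_dag_name=dag.dag_id,
        default_args=default_args,
        reprocess=True,
        gcp_conn_id='google_cloud_derived_datasets',
        gcs_buckets={'transfer': 'example-transfer-bucket', 'load': 'example-load-bucket'},
        objects_prefix='example_dataset/v1',
        date_submission_col='submission_date_s3',
        dataset='example_dataset',
        dataset_version='v1'))
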
# Note: this operator will be flagged a success if the cluster by this name already exists.
create_cluster = DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    # ds_nodash is an airflow macro for "[Execution] Date string no dashes"
    # in YYYYMMDD format. See docs https://airflow.apache.org/code.html?highlight=macros#macros
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    image_version='1.5-debian10',
    num_workers=2,
    storage_bucket=CLUSTER_STORAGE_BUCKET,
    region=REGION,
    zone=ZONE)

# Submit our Spark job.
submit_scalaspark = DataProcSparkOperator(
    task_id=TASK_ID,
    region=REGION,
    main_class=MAIN_CLASS,
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    dataproc_spark_jars=MAIN_JAR)

# Delete the Cloud Dataproc cluster.
delete_cluster = DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    region=REGION,
    # Obviously needs to match the name of cluster created in the prior two Operators.
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    # This will tear down the cluster even if there are failures in upstream tasks.
    trigger_rule=TriggerRule.ALL_DONE)

create_cluster.dag = dag
create_cluster.set_downstream(submit_scalaspark)

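The snippet stops after wiring create_cluster to submit_scalaspark; presumably the Spark task is also set upstream of delete_cluster so the ephemeral cluster is always torn down. A sketch of that remaining wiring, under that assumption:

# Assumed continuation: attach the remaining tasks to the DAG and delete the
# cluster after the Spark job finishes (or fails, thanks to ALL_DONE above).
submit_scalaspark.dag = dag
delete_cluster.dag = dag
submit_scalaspark.set_downstream(delete_cluster)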