def terminate_engine(cls):
    from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
    from airflow.contrib.operators import dataproc_operator

    dataproc_config = DataprocConfig()
    gcp_conn_id = get_settings().get_env_config(CloudType.gcp).conn_id
    cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)
    delete_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id="delete_dataproc_cluster",
        cluster_name=dataproc_config.cluster,
        project_id=cluster_hook.project_id,
        gcp_conn_id=gcp_conn_id,
        region=dataproc_config.region,
    )
    return delete_cluster
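# A hypothetical usage sketch, not part of the snippet above: the classmethod returns an
# operator that is not yet attached to a DAG, so the caller assigns it and wires
# dependencies. The DataprocEngine class name and the create_cluster/submit_job tasks
# are assumptions for illustration only.
delete_cluster = DataprocEngine.terminate_engine()
delete_cluster.dag = dag
create_cluster >> submit_job >> delete_cluster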
        'LSTM_DATAGEN', '--project', 'hd-personalization-dev',
        '--category', 'AreaRugs',
        '--dupletsData', 'gs://hd-personalization-dev-data/vdc2136/training/duplets/2020-06-01/',
        '--featuresData', 'gs://hd-personalization-dev-data/vdc2136/training/data/AllFeatures.csv',
        '--finalOutputPath', 'gs://hd-personalization-dev-data/vdc2136/training/lstm/2020-06-02/',
        '--appName', 'LSTM_DATA_GEN',
        '--mode=cluster'
    ],
    job_name='airflow_pyspark_job',
    cluster_name='airflow-dataproc-cluster',
    project_id='hd-personalization-dev',
    dag=dag)

delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='airflow-dataproc-cluster',
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    project_id='hd-personalization-dev',
    dag=dag)

start_dag = DummyOperator(
    task_id='start',
    default_args=default_args,
    dag=dag,
)

start_dag >> create_dataproc_cluster >> dataproc_pyspark_submit >> delete_dataproc_cluster
"%s/events-assembly-%s.jar" % (DEPLOY_BUCKET_PREFIX, LATEST_JAR_HASH) ], arguments=[ "--gcs-input-bucket", INPUT_BUCKET, "--gcs-output-bucket", OUTPUT_BUCKET, "--date", "{{ ds }}", ], ) delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator( project_id=PROJECT_ID, task_id="delete_dataproc_cluster", cluster_name="gcp-data-platform", trigger_rule=trigger_rule.TriggerRule.ALL_DONE, ) gcs_to_bigquery = GoogleCloudStorageToBigQueryOperator( task_id="gcs_to_bigquery", bucket=OUTPUT_BUCKET, source_objects=['{{ ds_format(ds, "%Y/%m/%d") }}/*.parquet'], destination_project_dataset_table= "{project_id}.events.events{{ ds_nodash }}".format( project_id=PROJECT_ID), source_format="PARQUET", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE", )
    init_actions_uris=[
        'gs://goog-dataproc-initialization-actions-europe-west1/python/pip-install.sh'
    ])

run_batch_kpi_scheduled = dataproc_operator.DataProcPySparkOperator(
    task_id="submit_batch-kpi-scheduled",
    cluster_name='vf-polimi-demo',
    region='europe-west1',
    main='gs://vf-polimi-batch-data/dev/compute-kpi-batch.py',
    dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
    xcom_push=True)

remove_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    project_id=PROJECT,
    task_id="delete_cluster",
    cluster_name='vf-polimi-demo',
    region='europe-west1')

def check_batch_kpi_scheduled_cluster_running(**kwargs):
    ti = kwargs['ti']
    xcom_value = ti.xcom_pull(task_ids='batch_kpi_scheduled_cluster')
    if xcom_value == "vf-polimi-demo":
        return 'delete_cluster'
    else:
        return 'end'

branch_batch_kpi_scheduled_active_cluster = BranchPythonOperator(
    task_id='check_batch_kpi_scheduled_cluster',
    provide_context=True,
    python_callable=check_batch_kpi_scheduled_cluster_running)
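# A wiring sketch for the branch above (assumptions: an `end` no-op task exists and all
# operators are registered on the same DAG). The callable returns the task_id to follow,
# so the branch task must sit upstream of both 'delete_cluster' and 'end'.
from airflow.operators.dummy_operator import DummyOperator

end = DummyOperator(task_id='end')

branch_batch_kpi_scheduled_active_cluster >> [remove_cluster, end]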
# BashOperator
# A simple print date
print_date = BashOperator(task_id='print_date', bash_command='date')

# dataproc_operator
# Create small dataproc cluster
create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc',
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    num_workers=2,
    zone=models.Variable.get('dataproc_zone'),
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

# Run the PySpark job
run_spark = dataproc_operator.DataProcPySparkOperator(
    task_id='run_spark',
    main=SPARK_CODE,
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    job_name=dataproc_job_name)

# dataproc_operator
# Delete Cloud Dataproc cluster.
delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc',
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# STEP 6: Set DAG dependencies
# Each task runs only after the task before it has finished.
print_date >> create_dataproc >> run_spark >> delete_dataproc
    default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
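# WORDCOUNT_JAR and wordcount_args are referenced above but defined earlier in the file.
# A plausible sketch based on the Cloud Composer quickstart this snippet resembles
# (treat the exact jar path, input file, and bucket Variable as assumptions):
import datetime
import os

from airflow import models

# Hadoop's bundled example jar, available on Dataproc cluster nodes.
WORDCOUNT_JAR = 'file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar'

input_file = 'gs://pub/shakespeare/rose.txt'
# Write results to a timestamped folder in a bucket configured via an Airflow Variable.
output_file = os.path.join(
    'gs://', models.Variable.get('gcs_bucket'), 'wordcount',
    datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) + os.sep

wordcount_args = ['wordcount', input_file, output_file]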
# dataproc_operator
# Create small dataproc cluster
create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc',
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    num_workers=2,
    zone=None,
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1',
    region=models.Variable.get('dataproc_zone'))

# Run the PySpark job
run_spark = dataproc_operator.DataProcPySparkOperator(
    task_id='run_spark',
    main=SPARK_CODE,
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    job_name=dataproc_job_name,
    region=models.Variable.get('dataproc_zone'))

# dataproc_operator
# Delete Cloud Dataproc cluster.
delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc',
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    region=models.Variable.get('dataproc_zone'))

# STEP 6: Set DAG dependencies
# Each task runs only after the task before it has finished.
print_date >> create_dataproc >> run_spark >> delete_dataproc
    cluster_name='composer-dataproc-{{ ds_nodash }}',
    num_workers=2,
    region='asia-south1',
    zone='asia-south1-a',
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

dataprod_pyspark = dataproc_operator.DataProcPySparkOperator(
    task_id='pyspark',
    main='gs://code_deploy/dataproc_read_bucket_to_bigquery.py',
    cluster_name='composer-dataproc-{{ ds_nodash }}',
    region='asia-south1',
    dataproc_pyspark_jars=[])

delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='composer-dataproc-{{ ds_nodash }}',
    region='asia-south1',
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# An instance of an operator is called a task. In this case, the
# hello_python task calls the "greeting" Python function.
hello_python = python_operator.PythonOperator(task_id='hello',
                                              python_callable=greeting)

# Likewise, the goodbye_bash task calls a Bash script.
goodbye_bash = bash_operator.BashOperator(task_id='bye',
                                          bash_command='echo Goodbye.')

# Define the order in which the tasks complete by using the >> and <<
# operators. In this example, hello_python executes before goodbye_bash.
# Create a Cloud Dataproc cluster.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    # Give the cluster a unique name by appending the date scheduled.
    # See https://airflow.apache.org/code.html#default-variables
    cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
    num_workers=2,
    region='us-central1',
    zone=models.Variable.get('gce_zone'),
    image_version='2.0',
    master_machine_type='n1-standard-2',
    worker_machine_type='n1-standard-2')

# Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
# master node.
run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
    task_id='run_dataproc_hadoop',
    region='us-central1',
    main_jar=WORDCOUNT_JAR,
    cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
    arguments=wordcount_args)

# Delete Cloud Dataproc cluster.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    region='us-central1',
    cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# Define DAG dependencies.
create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
    arguments=[
        "--dataproc=1.4",
        "--job_date={{ ds }}",
        "--bucket=dataproc_dataops_tmp"
    ])

run_pyspark_job_frequency = dataproc_operator.DataProcPySparkOperator(
    task_id='run_pyspark_job_frequency',
    dag=dag,
    main='gs://' + Variable.get('v_composer_bucket') +
         '/dags/dataproc/twitterPySparkFrequency.py',
    cluster_name='twitter-dataproc-mlanciau-{{ ds_nodash }}',
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'
    ],
    arguments=[
        "--dataproc=1.4",
        "--job_date={{ ds }}",
        "--bucket=dataproc_dataops_tmp"
    ])

# Delete Cloud Dataproc cluster.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    dag=dag,
    project_id=os.environ.get('GCP_PROJECT'),
    cluster_name='twitter-dataproc-mlanciau-{{ ds_nodash }}'
    # ,
    # trigger_rule=trigger_rule.TriggerRule.ALL_DONE
)

create_dataproc_cluster >> delete_ml_partition >> run_pyspark_job_splitting >> delete_dataproc_cluster
create_dataproc_cluster >> run_pyspark_job_frequency >> delete_dataproc_cluster
    cluster_name='composer-311-complaints-{{ ds_nodash }}',
    num_workers=2,
    region=models.Variable.get('region'),
    zone=models.Variable.get('gce_zone'),
    project_id=models.Variable.get('project_id'),
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

run_dataproc_job = dataproc_operator.DataProcPySparkOperator(
    task_id="run_dataproc_job",
    main="gs://311-complaints-spark_jobs/spark_job.py",
    cluster_name='composer-311-complaints-{{ ds_nodash }}',
    region=models.Variable.get('region'),
    dataproc_pyspark_jars=['gs://spark-lib/bigquery/spark-bigquery-latest.jar'],
    arguments=['gs://{{ dag_run.conf.get("bucket") }}/{{ dag_run.conf.get("name") }}'])

delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='composer-311-complaints-{{ ds_nodash }}',
    project_id=models.Variable.get('project_id'),
    region=models.Variable.get('region'),
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

bigquery_transformations = BigQueryOperator(
    sql='/sql/job.sql',
    task_id='bigquery_transformations',
    use_legacy_sql=False,
)

create_dataproc_cluster >> run_dataproc_job >> delete_dataproc_cluster >> bigquery_transformations
    'retries': 1,
    'retry_delay': dt.timedelta(seconds=30),
    'project_id': models.Variable.get('gcp_project')
}

with DAG('dataproc_spark_submit',
         schedule_interval='0 17 * * *',
         default_args=default_dag_args) as dag:

    create_dataproc_cluster = dpo.DataprocClusterCreateOperator(
        project_id=default_dag_args['project_id'],
        task_id='create_dataproc_cluster',
        cluster_name=CLUSTER_NAME,
        num_workers=2,
        zone=models.Variable.get('gce_zone')
    )

    run_spark_job = dpo.DataProcSparkOperator(
        task_id='run_spark_job',
        # main_jar=MAIN_JAR,
        main_class=MAIN_CLASS,
        cluster_name=CLUSTER_NAME
    )

    delete_dataproc_cluster = dpo.DataprocClusterDeleteOperator(
        project_id=default_dag_args['project_id'],
        task_id='delete_dataproc_cluster',
        cluster_name=CLUSTER_NAME,
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE
    )

    create_dataproc_cluster >> run_spark_job >> delete_dataproc_cluster
    # Give the cluster a unique name by appending the date scheduled.
    # See https://airflow.apache.org/code.html#default-variables
    cluster_name=pipeline_cluster_name,
    num_workers=2,
    region='us-central1',
    autoscaling_policy='projects/{}/regions/us-central1/autoscalingPolicies/ephimeral-scaling-policy'.format(
        os.environ['PROJECT_ID']),
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

run_py_spark = dataproc_operator.DataProcPySparkOperator(
    task_id='run_py_spark',
    region='us-central1',
    main='gs://{}/data/compute-pi-pipeline/calculate-pi.py'.format(
        os.environ['COMPOSER_BUCKET']),
    arguments=[models.Variable.get("NUM_SAMPLES")],
    cluster_name=pipeline_cluster_name)

# Delete Cloud Dataproc cluster.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    region='us-central1',
    cluster_name=pipeline_cluster_name,
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# Define DAG dependencies.
create_dataproc_cluster >> run_py_spark >> delete_dataproc_cluster
# [END composer_hadoop_steps]
    job=PRODUCTS_STAGING_SPARK_JOB,
    location=REGION,
    project_id=PROJECT_ID)

transactions_staging_spark_job = DataprocSubmitJobOperator(
    task_id="transactions_staging_spark_job",
    job=TRANSACTIONS_STAGING_SPARK_JOB,
    location=REGION,
    project_id=PROJECT_ID)

enrich_staging_spark_job = DataprocSubmitJobOperator(
    task_id="enrich_staging_spark_job",
    job=ENRICH_STAGING_SPARK_JOB,
    location=REGION,
    project_id=PROJECT_ID)

delete_dataproc_acme_sales_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id="delete_dataproc_acme_sales_cluster",
    cluster_name=DATAPROC_CLUSTER_NAME,
    region=REGION,
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    project_id=PROJECT_ID)

create_dataproc_acme_sales_cluster >> [
    locations_staging_spark_job,
    products_staging_spark_job,
    transactions_staging_spark_job
] >> enrich_staging_spark_job >> delete_dataproc_acme_sales_cluster

if __name__ == '__main__':
    dag.clear(dag_run_state=State.NONE)
    dag.run()
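# A migration sketch, not part of the source DAG: on Airflow 2 with the Google provider
# (which this file already uses for DataprocSubmitJobOperator), the contrib
# DataprocClusterDeleteOperator maps to DataprocDeleteClusterOperator, reusing the same
# PROJECT_ID / REGION / DATAPROC_CLUSTER_NAME constants as above.
from airflow.providers.google.cloud.operators.dataproc import DataprocDeleteClusterOperator
from airflow.utils.trigger_rule import TriggerRule

delete_dataproc_acme_sales_cluster = DataprocDeleteClusterOperator(
    task_id="delete_dataproc_acme_sales_cluster",
    project_id=PROJECT_ID,
    region=REGION,
    cluster_name=DATAPROC_CLUSTER_NAME,
    # Delete the cluster even if an upstream Spark job fails.
    trigger_rule=TriggerRule.ALL_DONE)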
    worker_machine_type=machine_type)

hadoop_job = dataproc_operator.DataProcHadoopOperator(
    task_id='hadoop_job',
    cluster_name=mapreduce_cluster_name,
    main_jar=hadoop_job_jar_uri,
    arguments=[
        collisions_dataset_uri,
        f'{hadoop_job_output_bucket}/{exec_dt}'
    ])

hive_job = dataproc_operator.DataProcHiveOperator(
    task_id='hive_job',
    cluster_name=mapreduce_cluster_name,
    dataproc_hive_jars=[hive_hcatalog_jar_uri],
    query_uri=hive_job_hql_uri,
    variables={
        'collisions_job_output_bucket': f'{hadoop_job_output_bucket}/{exec_dt}',
        'hive_job_output_bucket': f'{hive_job_output_bucket}/{exec_dt}',
        'hive_hcatalog_jar': hive_hcatalog_jar_uri,
        'zips_boroughs_bucket': zips_boroughs_bucket_uri
    }
)

delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name=mapreduce_cluster_name,
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

create_dataproc_cluster >> hadoop_job >> hive_job >> delete_dataproc_cluster
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG('composer_hadoop_wordcount',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
    # network_uri='default',
    subnetwork_uri=subnet,
    properties=cluster_properties,
    on_success_callback=task_success_slack_alert,
    trigger_rule=trigger_rule.TriggerRule.ALL_SUCCESS
)

# Run spark job on the above cluster.
run_spark_job = BashOperator(
    task_id='run_spark_job',
    bash_command=bash_command,
    dag=dag,
    on_success_callback=task_success_slack_alert,
    trigger_rule=trigger_rule.TriggerRule.ALL_SUCCESS
)

# Delete the cluster.
delete_dataproc_cluster = dpo.DataprocClusterDeleteOperator(
    project_id=projectID,
    task_id='delete_dataproc_cluster',
    cluster_name=cluster_name,
    region=region,
    # zone='us-central1-a',
    # network_uri='default',
    subnetwork_uri=subnet,
    on_success_callback=task_success_slack_alert,
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE
)

request_job >> create_dataproc_cluster >> run_spark_job >> delete_dataproc_cluster
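# Hypothetical illustration only: `bash_command` is defined elsewhere in this DAG file.
# One plausible shape, submitting the Spark job through the gcloud CLI (the main class
# and jar path below are assumptions, not taken from the source):
bash_command = (
    'gcloud dataproc jobs submit spark '
    '--cluster={cluster} --region={region} '
    '--class=com.example.SparkJob '                    # hypothetical main class
    '--jars=gs://example-bucket/jars/spark-job.jar'    # hypothetical jar location
).format(cluster=cluster_name, region=region)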
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    # Give the cluster a unique name by appending the date scheduled.
    # See https://airflow.apache.org/code.html#default-variables
    cluster_name='parquetconverter2',
    num_workers=3,
    zone='europe-west1-b',
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

# Run the pyspark CSV2PARQUET example
run_dataproc_csv2parquet = dataproc_operator.DataProcPySparkOperator(
    task_id='run_dataproc_parquetconvert',
    cluster_name='parquetconverter2',
    main='gs://alex-code/convert.py')

# Delete Cloud Dataproc cluster.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='parquetconverter2',
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# [START composer_quickstart_steps]
# Define DAG dependencies.
create_dataproc_cluster >> run_dataproc_csv2parquet >> delete_dataproc_cluster
# [END composer_quickstart_steps]
# [END composer_quickstart]
"INSERT INTO TABLE default.chicago_taxi_trips_parquet_autotestbq9 SELECT * FROM default.chicago_taxi_trips_csv_autotestbq9;", cluster_name='dataproc', region='us-west1', dag=dag) dataproc_hive_count_table_csv = DataProcHiveOperator( task_id='dataproc_hive_count_table_csv', gcp_conn_id='google_cloud_default', query="select count(*) from default.chicago_taxi_trips_csv_autotestbq9", cluster_name='dataproc', region='us-west1', dag=dag) delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator( task_id='delete_dataproc_cluster', cluster_name='dataproc', region='us-west1', trigger_rule=trigger_rule.TriggerRule.ALL_DONE) load_parquet_bqt = GoogleCloudStorageToBigQueryOperator( task_id='load_parquet_bqt', bucket='dphivedb', source_objects=['HQL/PARQUET/*'], schema_fields=None, schema_object=None, source_format='parquet', destination_project_dataset_table='bqdataset.test3', bigquery_conn_id='bigquery_default', google_cloud_storage_conn_id='google_cloud_default', write_disposition='WRITE_TRUNCATE', autodetect=True,
    cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
    arguments=["--local", "false", "--subprogram", "integration"])

offer_integration = dataproc_operator.DataProcSparkOperator(
    task_id='offer_integration',
    main_jar=SPARK_JOBS_JAR,
    cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
    arguments=["--local", "false", "--subprogram", "offerIntegration"])

es_refresh = dataproc_operator.DataProcSparkOperator(
    task_id='es_refresh',
    main_jar=SPARK_JOBS_JAR,
    cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
    arguments=["--local", "false", "--subprogram", "refreshEs"])

price_diff = dataproc_operator.DataProcSparkOperator(
    task_id='price_diff',
    main_jar=SPARK_JOBS_JAR,
    cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
    arguments=["--local", "false", "--subprogram", "priceDiff"])

delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

create_dataproc_cluster >> [run_integration_job, offer_integration]
run_integration_job >> price_diff
[run_integration_job, offer_integration] >> es_refresh
[price_diff, es_refresh] >> delete_dataproc_cluster