    object='flights/{{ execution_date.format("%Y/%m/%d/%H") }}/_SUCCESS')

# Create a Cloud Dataproc cluster.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    cluster_name='lab8-work-cluster-{{ ds_nodash }}',
    num_workers=2,
    zone=models.Variable.get('gce_zone'),
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

# Run Spark job - Popular airports
run_dataproc_spark_t1 = dataproc_operator.DataProcSparkOperator(
    task_id='run_dataproc_spark_task1',
    dataproc_spark_jars=[TASK1_JAR],
    job_name='popular_airports',
    main_class='com.github.rmdarth.bdpclab6.PopularAirportsDF',
    cluster_name='lab8-work-cluster-{{ ds_nodash }}',
    arguments=task1_args)

# Run Spark job - Canceled flights
run_dataproc_spark_t2 = dataproc_operator.DataProcSparkOperator(
    task_id='run_dataproc_spark_task2',
    dataproc_spark_jars=[TASK2_JAR],
    job_name='canceled_flights',
    main_class='com.github.rmdarth.bdpclab6.CanceledFlightsDF',
    cluster_name='lab8-work-cluster-{{ ds_nodash }}',
    arguments=task2_args)

# Delete Cloud Dataproc cluster.
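The excerpt is cut off right after the deletion comment. Following the pattern used in the other examples on this page, the teardown and task wiring would plausibly look like the sketch below; the ALL_DONE trigger rule and the serial ordering of the two Spark jobs are assumptions, not part of the original excerpt.

# Sketch (assumed continuation): tear the cluster down even if a job fails,
# then chain the tasks. The serial ordering of the two Spark jobs is a guess.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='lab8-work-cluster-{{ ds_nodash }}',
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

create_dataproc_cluster >> run_dataproc_spark_t1 >> run_dataproc_spark_t2 >> delete_dataproc_cluster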
task_id="create_dataproc_cluster", cluster_name="gcp-data-platform", num_workers=0, zone="us-west1a", master_machine_type="n1-highmem-4", ) run_dataproc_spark = dataproc_operator.DataProcSparkOperator( task_id="events_dataproc", cluster_name="gcp-data-platform", region=REGION, main_class="io.dagster.events.EventPipeline", dataproc_spark_jars=[ "%s/events-assembly-%s.jar" % (DEPLOY_BUCKET_PREFIX, LATEST_JAR_HASH) ], arguments=[ "--gcs-input-bucket", INPUT_BUCKET, "--gcs-output-bucket", OUTPUT_BUCKET, "--date", "{{ ds }}", ], ) delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator( project_id=PROJECT_ID, task_id="delete_dataproc_cluster", cluster_name="gcp-data-platform", trigger_rule=trigger_rule.TriggerRule.ALL_DONE, )
    'retries': 1,
    'retry_delay': dt.timedelta(seconds=30),
    'project_id': models.Variable.get('gcp_project')
}

with DAG('dataproc_spark_submit',
         schedule_interval='0 17 * * *',
         default_args=default_dag_args) as dag:

    create_dataproc_cluster = dpo.DataprocClusterCreateOperator(
        project_id=default_dag_args['project_id'],
        task_id='create_dataproc_cluster',
        cluster_name=CLUSTER_NAME,
        num_workers=2,
        zone=models.Variable.get('gce_zone')
    )

    run_spark_job = dpo.DataProcSparkOperator(
        task_id='run_spark_job',
        # main_jar=MAIN_JAR,
        main_class=MAIN_CLASS,
        cluster_name=CLUSTER_NAME
    )

    delete_dataproc_cluster = dpo.DataprocClusterDeleteOperator(
        project_id=default_dag_args['project_id'],
        task_id='delete_dataproc_cluster',
        cluster_name=CLUSTER_NAME,
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE
    )

    create_dataproc_cluster >> run_spark_job >> delete_dataproc_cluster
    object=full_path,
    timeout=3600)

# Create a Cloud Dataproc cluster.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    # Give the cluster a unique name by appending the date scheduled.
    # See https://airflow.apache.org/code.html#default-variables
    cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
    num_workers=2,
    zone=models.Variable.get('gce_zone'),
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

run_dataproc_spark = dataproc_operator.DataProcSparkOperator(
    task_id='run_dataproc_spark',
    main_jar=SPRARK_JAR,
    cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
    arguments=spark_args)

# Delete Cloud Dataproc cluster.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# Define DAG dependencies.
gcs_file_sensor >> create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster
# Create a Cloud Dataproc cluster.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    # Give the cluster a unique name by appending the date scheduled.
    # See https://airflow.apache.org/code.html#default-variables
    cluster_name='macys-cluster-{{ ds_nodash }}',
    num_workers=2,
    zone=models.Variable.get('gce_zone'),
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

# Run the Spark wordcount example installed on the Cloud Dataproc cluster
# master node.
run_dataproc_spark = dataproc_operator.DataProcSparkOperator(
    task_id='run_dataproc_spark',
    main_jar=WORDCOUNT_JAR,
    cluster_name='macys-cluster-{{ ds_nodash }}',
    arguments=input_file)

# Delete Cloud Dataproc cluster.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='macys-cluster-{{ ds_nodash }}',
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# Define DAG dependencies.
create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster
# Create a Cloud Dataproc cluster.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
    storage_bucket='listery-staging',
    num_workers=2,
    master_disk_size=20,
    worker_disk_size=20,
    num_preemptible_workers=1,
    zone='us-east1-c',
    master_machine_type='n1-standard-4',
    worker_machine_type='n1-standard-4')

run_integration_job = dataproc_operator.DataProcSparkOperator(
    task_id='run_integration_job',
    main_jar=SPARK_JOBS_JAR,
    cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
    arguments=["--local", "false", "--subprogram", "integration"])

offer_integration = dataproc_operator.DataProcSparkOperator(
    task_id='offer_integration',
    main_jar=SPARK_JOBS_JAR,
    cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
    arguments=["--local", "false", "--subprogram", "offerIntegration"])

es_refresh = dataproc_operator.DataProcSparkOperator(
    task_id='es_refresh',
    main_jar=SPARK_JOBS_JAR,
    cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
    arguments=["--local", "false", "--subprogram", "refreshEs"])
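This excerpt stops after the three Spark tasks and does not show cluster teardown or task dependencies. A minimal sketch of how that is wired in the other examples on this page follows; the sequential ordering of the three jobs and the ALL_DONE trigger rule are assumptions, not part of the original excerpt.

# Sketch (assumptions): delete the cluster regardless of job outcome,
# and run the three integration jobs one after another.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

create_dataproc_cluster >> run_integration_job >> offer_integration >> es_refresh >> delete_dataproc_cluster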