        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
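The tasks above assume the usual quickstart preamble: the imports, the wordcount jar path and its arguments, and the `default_dag_args` dict that the `with models.DAG(...)` statement consumes. A minimal sketch of what that preamble could look like follows; the DAG id, Airflow Variable keys, bucket layout, and jar path are illustrative assumptions, not values taken from the snippet itself.

# Sketch of an assumed preamble for the quickstart snippet above.
# DAG id, Variable keys, jar path, and output layout are illustrative.
import datetime

from airflow import models
from airflow.contrib.operators import dataproc_operator
# trigger_rule is used by the delete task in the snippet above.
from airflow.utils import trigger_rule

# Hadoop MapReduce examples jar preinstalled on Dataproc master nodes
# (assumed path).
WORDCOUNT_JAR = 'file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar'

# Wordcount input and a per-run, timestamped output prefix in a bucket you own.
input_file = 'gs://pub/shakespeare/rose.txt'
output_path = '{}/wordcount/{}'.format(
    models.Variable.get('gcs_bucket'),
    datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
wordcount_args = ['wordcount', input_file, output_path]

# Start the schedule yesterday so the first run is triggered promptly.
yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

default_dag_args = {
    'start_date': yesterday,
    # Project in which the Dataproc cluster is created.
    'project_id': models.Variable.get('gcp_project'),
}

with models.DAG(
        'dataproc_wordcount_quickstart',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # Task definitions as in the snippet above.
    pass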
        default_args=default_dag_args) as dag:

    mapreduce_cluster_name = 'airflow-mapreduce-cluster'

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name=mapreduce_cluster_name,
        num_workers=2,
        zone=gce_zone,
        master_machine_type=machine_type,
        worker_machine_type=machine_type)

    hadoop_job = dataproc_operator.DataProcHadoopOperator(
        task_id='hadoop_job',
        cluster_name=mapreduce_cluster_name,
        main_jar=hadoop_job_jar_uri,
        arguments=[
            collisions_dataset_uri,
            f'{hadoop_job_output_bucket}/{exec_dt}'
        ])

    hive_job = dataproc_operator.DataProcHiveOperator(
        task_id='hive_job',
        cluster_name=mapreduce_cluster_name,
        dataproc_hive_jars=[hive_hcatalog_jar_uri],
        query_uri=hive_job_hql_uri,
        variables={
            'collisions_job_output_bucket': f'{hadoop_job_output_bucket}/{exec_dt}',
            'hive_job_output_bucket': f'{hive_job_output_bucket}/{exec_dt}',
            'hive_hcatalog_jar': hive_hcatalog_jar_uri,
            'zips_boroughs_bucket': zips_boroughs_bucket_uri
        })
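The task definitions above rely on names (`gce_zone`, `machine_type`, the jar/HQL/bucket URIs, and `exec_dt`) that are configured elsewhere in the DAG file. A minimal sketch of one way to supply them through Airflow Variables follows; the Variable keys and the use of the `ds_nodash` template are illustrative assumptions, not values taken from the snippet itself.

# Illustrative configuration for the snippet above; the Airflow Variable keys
# are assumptions.
from airflow import models

gce_zone = models.Variable.get('gce_zone')
machine_type = models.Variable.get('machine_type')

# Cloud Storage locations of the MapReduce jar, the Hive script, the input
# datasets, and the per-job output prefixes.
hadoop_job_jar_uri = models.Variable.get('hadoop_job_jar_uri')
collisions_dataset_uri = models.Variable.get('collisions_dataset_uri')
hadoop_job_output_bucket = models.Variable.get('hadoop_job_output_bucket')
hive_hcatalog_jar_uri = models.Variable.get('hive_hcatalog_jar_uri')
hive_job_hql_uri = models.Variable.get('hive_job_hql_uri')
hive_job_output_bucket = models.Variable.get('hive_job_output_bucket')
zips_boroughs_bucket_uri = models.Variable.get('zips_boroughs_bucket_uri')

# Per-run output partition. Using the execution-date template keeps each DAG
# run's output in its own prefix; Airflow renders it where the operator field
# is templated.
exec_dt = '{{ ds_nodash }}'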
            'enable-cloud-sql-hive-metastore': 'false',
            'additional-cloud-sql-instances': 'dannydataproc:us-central1:sqooptest',
            'hive-metastore-instance': 'dannydataproc:us-central1:sqooptest'
        })

    # Run the Sqoop import as a Hadoop job on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_class='org.apache.sqoop.Sqoop',
        dataproc_hadoop_jars=[
            'gs://dannydataproc/sqoop-1.4.7-hadoop260.jar',
            'file:///usr/share/java/mysql-connector-java-5.1.42.jar',
            'gs://dannydataproc/avro-tools-1.8.2.jar'
        ],
        arguments=[
            'import',
            '-Dmapreduce.job.user.classpath.first=true',
            '--connect=jdbc:mysql://127.0.0.1/guestbook',
            '--username=sqoop',
            '--password-file=gs://dannydataproc/passwordFile.txt',
            '--target-dir=gs://dannydataproc/entities',
            '--table=entries',
            '--as-avrodatafile',
            '--delete-target-dir'
        ],
        cluster_name='sqoop-cluster-{{ ds_nodash }}')

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='sqoop-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)
    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        num_workers=2,
        region='us-central1',
        zone=models.Variable.get('gce_zone'),
        image_version='2.0',
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        region='us-central1',
        main_jar=WORDCOUNT_JAR,
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        region='us-central1',
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name=naming_cluster + '-{{ ds_nodash }}',
        num_workers=2,
        zone=var_zone,
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Submit the Spark job jar (via the Hadoop operator) on the Cloud Dataproc
    # cluster master node.
    run_dataproc_spark = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=SPARK_JAR,
        cluster_name=naming_cluster + '-{{ ds_nodash }}',
        arguments=executor_args)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name=naming_cluster + '-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # [START composer_hadoop_steps]
    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster
    # [END composer_hadoop_steps]