def test_execute(self, mock_hook):
    template_id = "template_id"
    version = 6
    parameters = {}

    op = DataprocInstantiateWorkflowTemplateOperator(
        task_id=TASK_ID,
        template_id=template_id,
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        version=version,
        parameters=parameters,
        request_id=REQUEST_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
        gcp_conn_id=GCP_CONN_ID,
        impersonation_chain=IMPERSONATION_CHAIN,
    )
    op.execute(context={})
    mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN)
    mock_hook.return_value.instantiate_workflow_template.assert_called_once_with(
        template_name=template_id,
        location=GCP_LOCATION,
        project_id=GCP_PROJECT,
        version=version,
        parameters=parameters,
        request_id=REQUEST_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
    )
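# NOTE (assumed context, not part of the original snippet): the test above relies on a
# mock patch of DataprocHook and on module-level constants defined elsewhere in the test
# file. A minimal sketch of that assumed scaffolding might look like this; the constant
# values and the patch target are illustrative.
from unittest import mock

TASK_ID = "task-id"
GCP_PROJECT = "test-project"
GCP_LOCATION = "us-central1"
GCP_CONN_ID = "google_cloud_default"
REQUEST_ID = "request-id"
RETRY = None
TIMEOUT = 120
METADATA = ()
IMPERSONATION_CHAIN = ["impersonated-account@test-project.iam.gserviceaccount.com"]

# The surrounding test method would be decorated so that mock_hook replaces the hook class:
# @mock.patch("airflow.providers.google.cloud.operators.dataproc.DataprocHook")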
)
# [END how_to_cloud_dataproc_update_cluster_operator]

# [START how_to_cloud_dataproc_create_workflow_template]
create_workflow_template = DataprocCreateWorkflowTemplateOperator(
    task_id="create_workflow_template",
    template=WORKFLOW_TEMPLATE,
    project_id=PROJECT_ID,
    region=REGION,
)
# [END how_to_cloud_dataproc_create_workflow_template]

# [START how_to_cloud_dataproc_trigger_workflow_template]
trigger_workflow = DataprocInstantiateWorkflowTemplateOperator(
    task_id="trigger_workflow", region=REGION, project_id=PROJECT_ID, template_id=WORKFLOW_NAME
)
# [END how_to_cloud_dataproc_trigger_workflow_template]

pig_task = DataprocSubmitJobOperator(
    task_id="pig_task", job=PIG_JOB, region=REGION, project_id=PROJECT_ID
)
spark_sql_task = DataprocSubmitJobOperator(
    task_id="spark_sql_task", job=SPARK_SQL_JOB, region=REGION, project_id=PROJECT_ID
)
spark_task = DataprocSubmitJobOperator(
    task_id="spark_task", job=SPARK_JOB,
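# NOTE (assumed, for illustration): WORKFLOW_TEMPLATE above is defined elsewhere in the
# example DAG. A Dataproc workflow template passed to DataprocCreateWorkflowTemplateOperator
# is a dict with an id, a placement (for example a managed cluster), and a list of job
# steps. A minimal sketch, using placeholder names and a trivial Pig job, could be:
WORKFLOW_TEMPLATE = {
    "id": WORKFLOW_NAME,
    "placement": {
        "managed_cluster": {
            "cluster_name": CLUSTER_NAME,
            "config": CLUSTER_CONFIG,
        }
    },
    "jobs": [
        {
            "step_id": "pig_job_1",
            "pig_job": {"query_list": {"queries": ["define sin HiveUDF('sin');"]}},
        }
    ],
}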
default_args = {
    # Tell airflow to start one day ago, so that it runs as soon as you upload it
    "start_date": days_ago(1),
    "project_id": project_id,
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
    # The id you will see in the DAG airflow page
    "dataproc_workflow_dag",
    default_args=default_args,
    # The interval with which to schedule the DAG
    schedule_interval=datetime.timedelta(days=1),  # Override to match your needs
) as dag:
    start_template_job = DataprocInstantiateWorkflowTemplateOperator(
        # The task id of your job
        task_id="dataproc_workflow_dag",
        # The template id of your workflow
        template_id="sparkpi",
        project_id=project_id,
        # The region for the template
        region="us-central1",
    )
# [END composer_dataproc_workflow_instantiate_operator_tutorial]
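# NOTE (assumed context): the tutorial snippet above depends on imports and a project_id
# value defined earlier in the same file. A sketch of what that preamble typically looks
# like; the Airflow Variable name used for project_id is an assumption.
import datetime

from airflow import models
from airflow.providers.google.cloud.operators.dataproc import (
    DataprocInstantiateWorkflowTemplateOperator,
)
from airflow.utils.dates import days_ago

project_id = models.Variable.get("project_id")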
    task_id='transfer_grabbed_data',
    source_bucket=os.environ['GCP_GCS_BUCKET_LANDING'],
    destination_bucket=os.environ['GCP_GCS_BUCKET_WORKING'],
    gcp_conn_id='gr_storage_conn',
    source_object='{{ run_id }}',
    destination_object='{{ run_id }}',
)

spark_etl = DataprocInstantiateWorkflowTemplateOperator(
    task_id='spark_etl',
    template_id=os.environ['GCP_DATAPROC_TEMPLATE_ID'],
    project_id=os.environ['GCP_PROJECT_ID'],
    region=os.environ['GCP_REGION'],
    parameters={
        'PATH_TO_ETL_FILE': f"gs://{os.environ['GCP_GCS_BUCKET_ROUTINE']}/etl/etl.py",
        'CLUSTER_NAME': 'goodreads-etl',
        'ARG_SOURCE_BUCKET': os.environ['GCP_GCS_BUCKET_WORKING'],
        'ARG_DESTINATION_BUCKET': os.environ['GCP_GCS_BUCKET_PROCESSED'],
        'ARG_OBJECT_PREFIX': '{{ run_id }}',
    },
    gcp_conn_id='gr_dataproc_conn',
)

drop_stage_user_data = BigQueryDeleteTableOperator(
    task_id="drop_stage_user_data",
    deletion_dataset_table=f"{GCP_PROJECT_ID}.{GCP_BQ_DATASET_STAGE}.{GCP_BQ_TABLE_USERS}",
    gcp_conn_id='gr_bigquery_conn',
    ignore_if_missing=True,
)
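# NOTE (assumed, for illustration): the keys passed to `parameters` on spark_etl above must
# match parameters declared on the Dataproc workflow template itself, where each parameter
# names the template field path(s) it substitutes into. A sketch of what that declaration
# block could look like in the template resource; the step id and field paths are
# illustrative, not taken from the original project.
TEMPLATE_PARAMETERS = [
    {"name": "CLUSTER_NAME", "fields": ["placement.managedCluster.clusterName"]},
    {"name": "PATH_TO_ETL_FILE", "fields": ["jobs['spark-etl'].pysparkJob.mainPythonFileUri"]},
    {"name": "ARG_SOURCE_BUCKET", "fields": ["jobs['spark-etl'].pysparkJob.args[0]"]},
    {"name": "ARG_DESTINATION_BUCKET", "fields": ["jobs['spark-etl'].pysparkJob.args[1]"]},
    {"name": "ARG_OBJECT_PREFIX", "fields": ["jobs['spark-etl'].pysparkJob.args[2]"]},
]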