def test_workflow(self):
    with patch(HOOK) as MockHook:
        hook = MockHook()
        hook.get_conn.return_value = self.mock_conn
        hook.wait.return_value = None

        dataproc_task = DataprocWorkflowTemplateInstantiateOperator(
            task_id=TASK_ID,
            project_id=PROJECT_ID,
            region=REGION,
            template_id=TEMPLATE_ID,
            dag=self.dag)

        dataproc_task.execute(None)

        template_name = ('projects/test-project-id/regions/test-region/'
                         'workflowTemplates/template-id')
        self.mock_workflows.instantiate.assert_called_once_with(
            name=template_name,
            body=mock.ANY)
        hook.wait.assert_called_once_with(self.operation)
def test_workflow(self):
    with patch(HOOK) as MockHook:
        hook = MockHook()
        hook.get_conn.return_value = self.mock_conn
        hook.wait.return_value = None

        dataproc_task = DataprocWorkflowTemplateInstantiateOperator(
            task_id=TASK_ID,
            project_id=GCP_PROJECT_ID,
            region=GCP_REGION,
            template_id=TEMPLATE_ID,
            dag=self.dag
        )

        dataproc_task.execute(None)

        template_name = (
            'projects/test-project-id/regions/test-region/'
            'workflowTemplates/template-id')
        self.mock_workflows.instantiate.assert_called_once_with(
            name=template_name,
            body=mock.ANY)
        hook.wait.assert_called_once_with(self.operation)
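Both tests above rely on fixtures built in the test case's setUp (self.dag, self.mock_conn, self.mock_workflows, self.operation). Below is a minimal sketch of what that wiring could look like; the constant values, the patch target in HOOK, and the exact mock chain are illustrative assumptions, not the verbatim Airflow fixture (the second test uses GCP_PROJECT_ID / GCP_REGION for the same values).

import unittest
from datetime import datetime
from unittest import mock
from unittest.mock import patch

from airflow import DAG
from airflow.contrib.operators.dataproc_operator import (
    DataprocWorkflowTemplateInstantiateOperator,
)

# Assumed patch target: the DataProcHook as referenced by the operator module.
HOOK = 'airflow.contrib.operators.dataproc_operator.DataProcHook'
TASK_ID = 'test-task-id'
PROJECT_ID = 'test-project-id'
REGION = 'test-region'
TEMPLATE_ID = 'template-id'


class DataprocWorkflowTemplateInstantiateOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dag = DAG(
            'test_dag',
            default_args={'start_date': datetime(2020, 1, 1)},
        )
        # Long-running operation returned by the instantiate call; the
        # operator hands it to hook.wait().
        self.operation = {'name': 'operations/test-operation', 'done': True}
        # Wire mock_conn so that
        # conn.projects().regions().workflowTemplates().instantiate(...).execute()
        # returns self.operation, matching the assertions in the tests.
        self.mock_execute = mock.Mock()
        self.mock_execute.execute.return_value = self.operation
        self.mock_workflows = mock.Mock()
        self.mock_workflows.instantiate.return_value = self.mock_execute
        self.mock_regions = mock.Mock()
        self.mock_regions.workflowTemplates.return_value = self.mock_workflows
        self.mock_projects = mock.Mock()
        self.mock_projects.regions.return_value = self.mock_regions
        self.mock_conn = mock.Mock()
        self.mock_conn.projects.return_value = self.mock_projects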
# Minimal DAG that instantiates an existing Dataproc workflow template once.
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.dataproc_operator import (
    DataprocWorkflowTemplateInstantiateOperator,
)

default_args = {
    'start_date': datetime(2020, 5, 12),
}

dag = DAG(
    'workflow_template',
    default_args=default_args,
    schedule_interval="@once")

t1 = DataprocWorkflowTemplateInstantiateOperator(
    task_id="execute",
    gcp_conn_id='google_cloud_default',
    project_id='sincere-bongo-264115',
    region='southamerica-east1',
    template_id='example',
    dag=dag)
from datetime import datetime, timedelta

from airflow import DAG
from airflow.contrib.operators.dataproc_operator import (
    DataprocWorkflowTemplateInstantiateOperator,
)
from airflow.operators.dummy_operator import DummyOperator

start_date = datetime(2019, 1, 1)

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': start_date,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# PROJECT_ID, REGION_ID, and TEMPLATE_ID are assumed to be defined
# elsewhere in the file.
with DAG(dag_id='simple_dataproc_workflow',
         default_args=default_args,
         start_date=start_date,
         schedule_interval=None) as dag:

    run_hive = DataprocWorkflowTemplateInstantiateOperator(
        task_id='RunHiveWorkflow',
        project_id=PROJECT_ID,
        region=REGION_ID,
        template_id=TEMPLATE_ID,
    )

    load_data = DummyOperator(
        task_id='DummyTask',
    )

    # Run the dummy load step first, then instantiate the Hive workflow.
    load_data >> run_hive
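If the workflow template declares parameters, values can be supplied at instantiation time. A sketch of that, using the providers-package operator that replaces the deprecated contrib import used above; the 'INPUT_PATH' key is a hypothetical parameter name and must match a parameter declared in your template.

from airflow.providers.google.cloud.operators.dataproc import (
    DataprocInstantiateWorkflowTemplateOperator,
)

run_hive_parameterized = DataprocInstantiateWorkflowTemplateOperator(
    task_id='RunHiveWorkflowParameterized',
    project_id=PROJECT_ID,
    region=REGION_ID,
    template_id=TEMPLATE_ID,
    # Values substituted into the template's declared parameters;
    # 'INPUT_PATH' is illustrative, not part of the examples above.
    parameters={'INPUT_PATH': 'gs://example-bucket/input/'},
)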
from datetime import datetime, timedelta

from airflow import models
from airflow.contrib.operators.dataproc_operator import (
    DataprocWorkflowTemplateInstantiateOperator,
)

SCHEDULE_INTERVAL = timedelta(minutes=60)
START_DATE = datetime.now() - SCHEDULE_INTERVAL

# [START composer_simple_define_dag_airflow_1]
default_dag_args = {
    'start_date': START_DATE,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'project_id': PROJECT_ID,
}

# Define a DAG (directed acyclic graph) of tasks.
with models.DAG(
        # The id you will see in the Airflow DAG page.
        'dataproc_workflow_clouddq',
        schedule_interval=SCHEDULE_INTERVAL,
        default_args=default_dag_args
) as dag:
    start_template_job = DataprocWorkflowTemplateInstantiateOperator(
        # The task id of your job.
        task_id="dataproc_workflow_clouddq",
        # The template id of your workflow.
        template_id=TEMPLATE_ID,
        project_id=PROJECT_ID,
        # The region for the template; it must match the region in which
        # the workflow template was created (see the Dataproc docs for
        # available regions).
        region=REGION,
    )
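The Composer example assumes PROJECT_ID, REGION, and TEMPLATE_ID are defined elsewhere in the file. One common pattern, shown here as an illustrative sketch rather than part of the original sample, is to pull them from Airflow Variables so the DAG file stays environment-agnostic.

from airflow.models import Variable

# Variable keys here are hypothetical; set them in the Airflow UI or via
# the airflow CLI before the DAG is parsed.
PROJECT_ID = Variable.get('gcp_project')
REGION = Variable.get('gcp_region', default_var='us-central1')
TEMPLATE_ID = Variable.get('dataproc_template_id')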