class TestDataflowTemplateOperator(unittest.TestCase):
    def setUp(self):
        self.dataflow = DataflowTemplatedJobStartOperator(
            task_id=TASK_ID,
            template=TEMPLATE,
            job_name=JOB_NAME,
            parameters=PARAMETERS,
            dataflow_default_options=DEFAULT_OPTIONS_TEMPLATE,
            poll_sleep=POLL_SLEEP,
            location=TEST_LOCATION,
        )

    def test_init(self):
        """Test DataflowTemplatedJobStartOperator instance is properly initialized."""
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.job_name, JOB_NAME)
        self.assertEqual(self.dataflow.template, TEMPLATE)
        self.assertEqual(self.dataflow.parameters, PARAMETERS)
        self.assertEqual(self.dataflow.poll_sleep, POLL_SLEEP)
        self.assertEqual(self.dataflow.dataflow_default_options, DEFAULT_OPTIONS_TEMPLATE)

    @mock.patch('airflow.providers.google.cloud.operators.dataflow.DataflowHook')
    def test_exec(self, dataflow_mock):
        """Test that DataflowHook is created and that the right args are
        passed to start_template_dataflow.
        """
        start_template_hook = dataflow_mock.return_value.start_template_dataflow
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        expected_options = {
            'project': 'test',
            'stagingLocation': 'gs://test/staging',
            'tempLocation': 'gs://test/temp',
            'zone': 'us-central1-f',
        }
        start_template_hook.assert_called_once_with(
            job_name=JOB_NAME,
            variables=expected_options,
            parameters=PARAMETERS,
            dataflow_template=TEMPLATE,
            on_new_job_id_callback=mock.ANY,
            project_id=None,
            location=TEST_LOCATION,
        )
class TestDataflowTemplateOperator(unittest.TestCase):
    def setUp(self):
        self.dataflow = DataflowTemplatedJobStartOperator(
            task_id=TASK_ID,
            template=TEMPLATE,
            job_name=JOB_NAME,
            parameters=PARAMETERS,
            options=DEFAULT_OPTIONS_TEMPLATE,
            dataflow_default_options={"EXTRA_OPTION": "TEST_A"},
            poll_sleep=POLL_SLEEP,
            location=TEST_LOCATION,
            environment={"maxWorkers": 2},
        )

    @mock.patch('airflow.providers.google.cloud.operators.dataflow.DataflowHook')
    def test_exec(self, dataflow_mock):
        """Test that DataflowHook is created and that the right args,
        including the merged options, are passed to start_template_dataflow.
        """
        start_template_hook = dataflow_mock.return_value.start_template_dataflow
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        expected_options = {
            'project': 'test',
            'stagingLocation': 'gs://test/staging',
            'tempLocation': 'gs://test/temp',
            'zone': 'us-central1-f',
            'EXTRA_OPTION': 'TEST_A',
        }
        start_template_hook.assert_called_once_with(
            job_name=JOB_NAME,
            variables=expected_options,
            parameters=PARAMETERS,
            dataflow_template=TEMPLATE,
            on_new_job_id_callback=mock.ANY,
            project_id=None,
            location=TEST_LOCATION,
            environment={'maxWorkers': 2},
        )
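# Both test variants above rely on module-level fixtures that are not shown.
# A minimal sketch that makes them self-contained: every value below is an
# assumption, except the DEFAULT_OPTIONS_TEMPLATE entries, which are pinned
# down by expected_options in test_exec.
import unittest
from unittest import mock

from airflow.providers.google.cloud.operators.dataflow import (
    DataflowTemplatedJobStartOperator,
)

TASK_ID = 'task-id'                                      # assumed
TEMPLATE = 'gs://dataflow-templates/wordcount-template'  # assumed
JOB_NAME = 'test-dataflow-pipeline'                      # assumed
PARAMETERS = {'inputFile': 'gs://test/input', 'output': 'gs://test/output'}  # assumed
POLL_SLEEP = 30                                          # assumed
TEST_LOCATION = 'custom-location'                        # assumed
DEFAULT_OPTIONS_TEMPLATE = {
    'project': 'test',
    'stagingLocation': 'gs://test/staging',
    'tempLocation': 'gs://test/temp',
    'zone': 'us-central1-f',
}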
job_id= "{{task_instance.xcom_pull('start-python-job-async')['job_id']}}", location='europe-west3', callback=check_autoscaling_event, ) # [END howto_sensor_wait_for_job_autoscaling_event] start_python_job_async >> wait_for_python_job_async_done start_python_job_async >> wait_for_python_job_async_metric start_python_job_async >> wait_for_python_job_async_message start_python_job_async >> wait_for_python_job_async_autoscaling_event with models.DAG( "example_gcp_dataflow_template", default_args=default_args, start_date=days_ago(1), schedule_interval=None, # Override to match your needs tags=['example'], ) as dag_template: # [START howto_operator_start_template_job] start_template_job = DataflowTemplatedJobStartOperator( task_id="start-template-job", template='gs://dataflow-templates/latest/Word_Count', parameters={ 'inputFile': "gs://dataflow-samples/shakespeare/kinglear.txt", 'output': GCS_OUTPUT }, location='europe-west3', ) # [END howto_operator_start_template_job]
with models.DAG(
    # The id you will see in the Airflow DAG page
    "composer_dataflow_dag",
    default_args=default_args,
    # The interval with which to schedule the DAG
    schedule_interval=datetime.timedelta(days=1),  # Override to match your needs
) as dag:
    start_template_job = DataflowTemplatedJobStartOperator(
        # The task id of your job
        task_id="dataflow_operator_transform_csv_to_bq",
        # The name of the template that you're using.
        # The link below lists all the templates you can use.
        # For versions in non-production environments, use the subfolder 'latest'.
        # https://cloud.google.com/dataflow/docs/guides/templates/provided-batch#gcstexttobigquery
        template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery",
        # Use the link above to specify the correct parameters for your template.
        parameters={
            "javascriptTextTransformFunctionName": "transformCSVtoJSON",
            "JSONPath": bucket_path + "/jsonSchema.json",
            "javascriptTextTransformGcsPath": bucket_path + "/transformCSVtoJSON.js",
            "inputFilePattern": bucket_path + "/inputFile.txt",
            "outputTable": project_id + ":average_weather.average_weather",
            "bigQueryLoadingTemporaryDirectory": bucket_path + "/tmp/",
        },
    )
# [END composer_dataflow_dag]
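# The Composer snippet above assumes bucket_path, project_id, and
# default_args are defined near the top of the file, typically read from
# Airflow Variables. A sketch of that setup; the Variable keys and the
# default_args contents are assumptions.
import datetime

from airflow import models
from airflow.utils.dates import days_ago

bucket_path = models.Variable.get("bucket_path")  # e.g. "gs://my-bucket"
project_id = models.Variable.get("project_id")

default_args = {
    "start_date": days_ago(1),
    # Inherited by every Dataflow task in the DAG unless overridden per task.
    "dataflow_default_options": {
        "project": project_id,
        "temp_location": bucket_path + "/tmp/",
    },
}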
    default_args=default_args,
    schedule_interval=None,
) as dag:

    start = dummy.DummyOperator(task_id='start', trigger_rule='all_success')
    end = dummy.DummyOperator(task_id='end', trigger_rule='all_success')

    # BigQuery tables are created automatically for demo purposes.
    # Consider a dedicated pipeline or tool for a real-life scenario.
    customers_import = DataflowTemplatedJobStartOperator(
        task_id="dataflow_customers_import",
        template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery",
        project_id=LOD_PRJ,
        location=DF_REGION,
        parameters={
            "javascriptTextTransformFunctionName": "transform",
            "JSONPath": ORC_GCS + "/customers_schema.json",
            "javascriptTextTransformGcsPath": ORC_GCS + "/customers_udf.js",
            "inputFilePattern": DRP_GCS + "/customers.csv",
            "outputTable": DWH_LAND_PRJ + ":" + DWH_LAND_BQ_DATASET + ".customers",
            "bigQueryLoadingTemporaryDirectory": LOD_GCS_STAGING + "/tmp/bq/",
        },
    )

    purchases_import = DataflowTemplatedJobStartOperator(
        task_id="dataflow_purchases_import",
        template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery",
        project_id=LOD_PRJ,
        location=DF_REGION,
        parameters={
            "javascriptTextTransformFunctionName": "transform",
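            # The original snippet is truncated at this point. A plausible
            # completion mirroring customers_import above; the purchases_*
            # object names are assumptions inferred from that pattern.
            "JSONPath": ORC_GCS + "/purchases_schema.json",
            "javascriptTextTransformGcsPath": ORC_GCS + "/purchases_udf.js",
            "inputFilePattern": DRP_GCS + "/purchases.csv",
            "outputTable": DWH_LAND_PRJ + ":" + DWH_LAND_BQ_DATASET + ".purchases",
            "bigQueryLoadingTemporaryDirectory": LOD_GCS_STAGING + "/tmp/bq/",
        },
    )

    # Assumed wiring, matching the start/end dummy tasks declared above.
    start >> [customers_import, purchases_import] >> end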