def execute(self, context):
    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        poll_sleep=self.poll_sleep)

    hook.start_template_dataflow(self.job_name, self.dataflow_default_options,
                                 self.parameters, self.template)
def execute(self, context):
    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to)

    dataflow_options = copy.copy(self.dataflow_default_options)
    dataflow_options.update(self.options)

    hook.start_java_dataflow(self.task_id, dataflow_options, self.jar)
def execute(self, context):
    bucket_helper = GoogleCloudBucketHelper(self.gcp_conn_id, self.delegate_to)
    self.jar = bucket_helper.google_cloud_to_local(self.jar)

    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to)

    dataflow_options = copy.copy(self.dataflow_default_options)
    dataflow_options.update(self.options)

    hook.start_java_dataflow(self.task_id, dataflow_options, self.jar)
def execute(
    self,
    # Some context about the context: https://bcb.github.io/airflow/execute-context
    context: Dict[str, Any]  # pylint: disable=unused-argument
) -> None:
    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        poll_sleep=self.poll_sleep)

    # In DataflowTemplateOperator, start_template_dataflow defaults
    # append_job_name to True, which appends a unique id to the job name.
    # Passing append_job_name=False overrides that default.
    hook.start_template_dataflow(self.task_id,
                                 self.dataflow_default_options,
                                 self.parameters,
                                 self.template,
                                 append_job_name=False)
def __init__(self, task_run):
    super(DataFlowJobCtrl, self).__init__(task_run=task_run)

    self.dataflow_config = task_run.task.beam_engine  # type: DataflowConfig

    gcp_conn_id = self.task_env.conn_id

    from airflow.contrib.hooks.gcp_dataflow_hook import DataFlowHook

    self._gcp_dataflow_hook = DataFlowHook(
        gcp_conn_id=gcp_conn_id, delegate_to=self.task_env.delegate_to)

    if self.dataflow_config.temp_location:
        # override sync location with temp_location
        self.remote_sync_root = self.dataflow_config.temp_location

    self.current_dataflow_job_id = None
def execute(self, context): """Execute the python dataflow job.""" hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to) dataflow_options = self.dataflow_default_options.copy() dataflow_options.update(self.options) # Convert argument names from lowerCamelCase to snake case. camel_to_snake = lambda name: re.sub( r'[A-Z]', lambda x: '_' + x.group(0).lower(), name) formatted_options = { camel_to_snake(key): dataflow_options[key] for key in dataflow_options } hook.start_python_dataflow(self.task_id, formatted_options, self.py_file, self.py_options)
def execute(self, context): """Execute the python dataflow job.""" bucket_helper = GoogleCloudBucketHelper( self.gcp_conn_id, self.delegate_to) self.py_file = bucket_helper.google_cloud_to_local(self.py_file) hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, poll_sleep=self.poll_sleep) dataflow_options = self.dataflow_default_options.copy() dataflow_options.update(self.options) # Convert argument names from lowerCamelCase to snake case. camel_to_snake = lambda name: re.sub( r'[A-Z]', lambda x: '_' + x.group(0).lower(), name) formatted_options = {camel_to_snake(key): dataflow_options[key] for key in dataflow_options} hook.start_python_dataflow( self.job_name, formatted_options, self.py_file, self.py_options)
def execute(self, context):
    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        poll_sleep=self.poll_sleep)
    dataflow_options = copy.copy(self.dataflow_default_options)
    dataflow_options.update(self.options)

    is_running = False
    if self.check_if_running != CheckJobRunning.IgnoreJob:
        is_running = hook.is_job_dataflow_running(self.job_name, dataflow_options)
        while is_running and self.check_if_running == CheckJobRunning.WaitForRun:
            is_running = hook.is_job_dataflow_running(self.job_name, dataflow_options)

    if not is_running:
        bucket_helper = GoogleCloudBucketHelper(
            self.gcp_conn_id, self.delegate_to)
        self.jar = bucket_helper.google_cloud_to_local(self.jar)
        hook.start_java_dataflow(self.job_name, dataflow_options,
                                 self.jar, self.job_class, True, self.multiple_jobs)
def execute(self, context):
    bucket_helper = GoogleCloudBucketHelper(self.gcp_conn_id, self.delegate_to)
    self.jar = bucket_helper.google_cloud_to_local(self.jar)

    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        poll_sleep=self.poll_sleep)
    dataflow_options = copy.copy(self.dataflow_default_options)
    dataflow_options.update(self.options)

    # Legacy code for a single xcom key.
    if 'xcom_key' in dataflow_options:
        value = context['task_instance'].xcom_pull(
            key=dataflow_options['xcom_key'])
        dataflow_options['queryParameters'] = value
        del dataflow_options['xcom_key']

    # Pull each configured xcom element and map it onto a Dataflow option.
    if self.xcom_element_list is not None:
        for xcom_element in self.xcom_element_list:
            # Sanity check: every xcom element must define all three fields.
            if all(key in xcom_element
                   for key in ['xcom_key', 'task_id', 'dataflow_par_name']):
                pulled_xcom_value = context['task_instance'].xcom_pull(
                    key=xcom_element['xcom_key'],
                    task_ids=xcom_element['task_id'])
                dataflow_options[xcom_element['dataflow_par_name']] = pulled_xcom_value
            else:
                raise Exception(
                    "ERROR: one of the fields "
                    "['xcom_key', 'task_id', 'dataflow_par_name'] is missing")

    self.log.info("dataflow_options: %s", dataflow_options)
    hook.start_java_dataflow(self.job_name, dataflow_options,
                             self.jar, self.job_class)
def setUp(self):
    with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                    new=mock_init):
        self.dataflow_hook = DataFlowHook(gcp_conn_id='test')