Example #1
    def execute(self, context):
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)

        hook.start_template_dataflow(self.job_name, self.dataflow_default_options,
                                     self.parameters, self.template)

    def execute(self, context):
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to)

        dataflow_options = copy.copy(self.dataflow_default_options)
        dataflow_options.update(self.options)

        hook.start_java_dataflow(self.task_id, dataflow_options, self.jar)

    def execute(self, context):
        bucket_helper = GoogleCloudBucketHelper(self.gcp_conn_id,
                                                self.delegate_to)
        self.jar = bucket_helper.google_cloud_to_local(self.jar)
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)

        dataflow_options = copy.copy(self.dataflow_default_options)
        dataflow_options.update(self.options)

        hook.start_java_dataflow(self.task_id, dataflow_options, self.jar)

    def execute(
            self,
            # Some context about the context: https://bcb.github.io/airflow/execute-context
            context: Dict[str, Any]  # pylint: disable=unused-argument
    ) -> None:
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)

        # In DataflowTemplateOperator, start_template_dataflow defaults append_job_name to True,
        # which appends a unique id to the end of the job name. Passing append_job_name=False overrides that default.
        hook.start_template_dataflow(self.task_id, self.dataflow_default_options,
                                     self.parameters, self.template, append_job_name=False)
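
The template examples above reduce to a single hook call. A rough standalone sketch, assuming placeholder values for the connection id, job name, pipeline options, template parameters, and template path (none of these come from the examples themselves):

from airflow.contrib.hooks.gcp_dataflow_hook import DataFlowHook

# All values below are placeholders for illustration only.
hook = DataFlowHook(gcp_conn_id='google_cloud_default', poll_sleep=10)
hook.start_template_dataflow(
    'example-wordcount',                                  # job name
    {'project': 'my-project', 'zone': 'europe-west1-b'},  # dataflow options
    {'inputFile': 'gs://my-bucket/input.txt',             # runtime parameters
     'output': 'gs://my-bucket/output'},
    'gs://dataflow-templates/latest/Word_Count',          # template path
    append_job_name=False,  # keep the exact job name, as in the last snippet
)
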
Example #5
    def __init__(self, task_run):
        super(DataFlowJobCtrl, self).__init__(task_run=task_run)
        self.dataflow_config = task_run.task.beam_engine  # type: DataflowConfig

        gcp_conn_id = self.task_env.conn_id

        from airflow.contrib.hooks.gcp_dataflow_hook import DataFlowHook

        self._gcp_dataflow_hook = DataFlowHook(
            gcp_conn_id=gcp_conn_id, delegate_to=self.task_env.delegate_to)
        if self.dataflow_config.temp_location:
            # override sync location with temp_location
            self.remote_sync_root = self.dataflow_config.temp_location

        self.current_dataflow_job_id = None
Example #6
    def execute(self, context):
        """Execute the python dataflow job."""
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        dataflow_options = self.dataflow_default_options.copy()
        dataflow_options.update(self.options)
        # Convert argument names from lowerCamelCase to snake case.
        camel_to_snake = lambda name: re.sub(
            r'[A-Z]', lambda x: '_' + x.group(0).lower(), name)
        formatted_options = {
            camel_to_snake(key): dataflow_options[key]
            for key in dataflow_options
        }
        hook.start_python_dataflow(self.task_id, formatted_options,
                                   self.py_file, self.py_options)
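
The lowerCamelCase to snake_case conversion used above is easy to check in isolation; the option names below are made-up placeholders:

import re

camel_to_snake = lambda name: re.sub(
    r'[A-Z]', lambda x: '_' + x.group(0).lower(), name)

# Placeholder options, not taken from a real pipeline.
options = {'tempLocation': 'gs://my-bucket/tmp', 'maxNumWorkers': 5}
print({camel_to_snake(key): value for key, value in options.items()})
# prints: {'temp_location': 'gs://my-bucket/tmp', 'max_num_workers': 5}
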
Example #7
    def execute(self, context):
        """Execute the python dataflow job."""
        bucket_helper = GoogleCloudBucketHelper(
            self.gcp_conn_id, self.delegate_to)
        self.py_file = bucket_helper.google_cloud_to_local(self.py_file)
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)
        dataflow_options = self.dataflow_default_options.copy()
        dataflow_options.update(self.options)
        # Convert argument names from lowerCamelCase to snake case.
        camel_to_snake = lambda name: re.sub(
            r'[A-Z]', lambda x: '_' + x.group(0).lower(), name)
        formatted_options = {camel_to_snake(key): dataflow_options[key]
                             for key in dataflow_options}
        hook.start_python_dataflow(
            self.job_name, formatted_options,
            self.py_file, self.py_options)
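
Example #7 first stages the gs:// file locally before handing it to the hook. A hedged standalone sketch of that step, assuming the Airflow 1.x contrib module layout and placeholder connection id and paths:

from airflow.contrib.operators.dataflow_operator import GoogleCloudBucketHelper

# Placeholder connection id and GCS path.
bucket_helper = GoogleCloudBucketHelper('google_cloud_default', delegate_to=None)
local_py_file = bucket_helper.google_cloud_to_local('gs://my-bucket/pipelines/wordcount.py')
# local_py_file now points at a local copy that start_python_dataflow can run.
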
Example #8
    def execute(self, context):
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)
        dataflow_options = copy.copy(self.dataflow_default_options)
        dataflow_options.update(self.options)
        is_running = False
        if self.check_if_running != CheckJobRunning.IgnoreJob:
            is_running = hook.is_job_dataflow_running(self.job_name, dataflow_options)
            while is_running and self.check_if_running == CheckJobRunning.WaitForRun:
                is_running = hook.is_job_dataflow_running(self.job_name, dataflow_options)

        if not is_running:
            bucket_helper = GoogleCloudBucketHelper(
                self.gcp_conn_id, self.delegate_to)
            self.jar = bucket_helper.google_cloud_to_local(self.jar)
            hook.start_java_dataflow(self.job_name, dataflow_options,
                                     self.jar, self.job_class, True, self.multiple_jobs)
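
The first variant of Example #8 submits only when no job with the same name is already running. A hedged standalone sketch of that pre-flight check, with placeholder job name, options, and jar path (a second variant that pulls parameters from XCom follows below):

from airflow.contrib.hooks.gcp_dataflow_hook import DataFlowHook

hook = DataFlowHook(gcp_conn_id='google_cloud_default', poll_sleep=10)
options = {'project': 'my-project', 'zone': 'europe-west1-b'}

# Submit only if no job with this name is currently running.
if not hook.is_job_dataflow_running('example-java-job', options):
    hook.start_java_dataflow('example-java-job', options, '/tmp/pipeline.jar')
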
    def execute(self, context):
        bucket_helper = GoogleCloudBucketHelper(self.gcp_conn_id,
                                                self.delegate_to)
        self.jar = bucket_helper.google_cloud_to_local(self.jar)
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)

        dataflow_options = copy.copy(self.dataflow_default_options)
        dataflow_options.update(self.options)
        # Legacy code for xcom key
        if 'xcom_key' in dataflow_options:
            value = context['task_instance'].xcom_pull(
                key=dataflow_options['xcom_key'])
            dataflow_options['queryParameters'] = value
            del dataflow_options['xcom_key']

        # Code for xcom_element_list: pull each referenced XCom value into the
        # Dataflow options (basic sanity check included).
        if self.xcom_element_list is not None:
            for xcom_element in self.xcom_element_list:
                # Sanity check: every element must carry all three keys.
                if all(key in xcom_element for key in
                       ['xcom_key', 'task_id', 'dataflow_par_name']):
                    pulled_xcom_value = \
                        context['task_instance'].xcom_pull(key=xcom_element['xcom_key'],
                                                           task_ids=xcom_element['task_id'])
                    dataflow_options[
                        xcom_element['dataflow_par_name']] = pulled_xcom_value
                else:
                    raise Exception(
                        "ERROR: one of the fields ['xcom_key', 'task_id', 'dataflow_par_name']"
                        " is missing from the xcom element")

        self.log.info("dataflow_options: %s", dataflow_options)
        hook.start_java_dataflow(self.job_name, dataflow_options, self.jar,
                                 self.job_class)
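
The xcom_element_list handling above expects every element to carry three keys. An illustrative, hypothetical shape for that list (task id, key, and parameter name are placeholders):

# Hypothetical value for xcom_element_list; the keys mirror the sanity check above.
xcom_element_list = [
    {
        'xcom_key': 'generated_query',           # key the upstream task pushed under
        'task_id': 'build_query_task',           # task that pushed the value
        'dataflow_par_name': 'queryParameters',  # Dataflow option to populate
    },
]
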
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')
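
A minimal companion test in the same style as the setUp above; it assumes, as an illustration only, that DataFlowHook keeps its poll_sleep argument as an instance attribute with a default of 10:

    # Illustrative test only; assumes DataFlowHook stores poll_sleep on the
    # instance with a default of 10.
    def test_default_poll_sleep(self):
        self.assertEqual(self.dataflow_hook.poll_sleep, 10)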