Example #1
    def execute(self, context):
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)

        hook.start_template_dataflow(self.task_id, self.dataflow_default_options,
                                     self.parameters, self.template)
Example #2
    def execute(self, context):
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)

        hook.start_template_dataflow(self.job_name, self.dataflow_default_options,
                                     self.parameters, self.template)
Example #3
    def execute(self, context):
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to)

        dataflow_options = copy.copy(self.dataflow_default_options)
        dataflow_options.update(self.options)

        hook.start_java_dataflow(self.task_id, dataflow_options, self.jar)
Example #4
    def execute(self, context):
        bucket_helper = GoogleCloudBucketHelper(
            self.gcp_conn_id, self.delegate_to)
        self.jar = bucket_helper.google_cloud_to_local(self.jar)
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)

        dataflow_options = copy.copy(self.dataflow_default_options)
        dataflow_options.update(self.options)

        hook.start_java_dataflow(self.task_id, dataflow_options, self.jar)
Example #5
    def execute(self, context):
        bucket_helper = GoogleCloudBucketHelper(self.gcp_conn_id,
                                                self.delegate_to)
        self.jar = bucket_helper.google_cloud_to_local(self.jar)
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)

        dataflow_options = copy.copy(self.dataflow_default_options)
        dataflow_options.update(self.options)

        hook.start_java_dataflow(self.task_id, dataflow_options, self.jar)
Example #6
    def execute(
            self,
            # Some context about the context: https://bcb.github.io/airflow/execute-context
            context: Dict[str, Any]  # pylint: disable=unused-argument
    ) -> None:
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)

        # In DataflowTemplateOperator, start_template_dataflow defaults append_job_name to True,
        # which adds a unique ID to the end of the job name. Passing append_job_name=False overrides that default.
        hook.start_template_dataflow(self.task_id, self.dataflow_default_options,
                                     self.parameters, self.template, append_job_name=False)
Example #7
class DataFlowTemplateHookTest(unittest.TestCase):

    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('DataFlowHook._start_template_dataflow'))
    def test_start_template_dataflow(self, internal_dataflow_mock):
        self.dataflow_hook.start_template_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_TEMPLATE, parameters=PARAMETERS,
            dataflow_template=TEMPLATE)
        internal_dataflow_mock.assert_called_once_with(
            mock.ANY, DATAFLOW_OPTIONS_TEMPLATE, PARAMETERS, TEMPLATE)
Example #8
class DataFlowHookTest(unittest.TestCase):

    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('DataFlowHook._start_dataflow'))
    def test_start_python_dataflow(self, internal_dataflow_mock):
        self.dataflow_hook.start_python_dataflow(
            task_id=TASK_ID, variables=OPTIONS,
            dataflow=PY_FILE, py_options=PY_OPTIONS)
        internal_dataflow_mock.assert_called_once_with(
            TASK_ID, OPTIONS, PY_FILE, mock.ANY, ['python'] + PY_OPTIONS)
Example #9
class DataFlowTemplateHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('DataFlowHook._start_template_dataflow'))
    def test_start_template_dataflow(self, internal_dataflow_mock):
        self.dataflow_hook.start_template_dataflow(
            job_name=JOB_NAME,
            variables=DATAFLOW_OPTIONS_TEMPLATE,
            parameters=PARAMETERS,
            dataflow_template=TEMPLATE)
        options_with_region = {'region': 'us-central1'}
        options_with_region.update(DATAFLOW_OPTIONS_TEMPLATE)
        internal_dataflow_mock.assert_called_once_with(mock.ANY,
                                                       options_with_region,
                                                       PARAMETERS, TEMPLATE)

    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_template_dataflow_with_runtime_env(self, mock_conn,
                                                      mock_dataflowjob):
        dataflow_options_template = copy.deepcopy(DATAFLOW_OPTIONS_TEMPLATE)
        options_with_runtime_env = copy.deepcopy(RUNTIME_ENV)
        options_with_runtime_env.update(dataflow_options_template)

        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        method = (mock_conn.return_value
                  .projects.return_value
                  .locations.return_value
                  .templates.return_value
                  .launch)

        self.dataflow_hook.start_template_dataflow(
            job_name=JOB_NAME,
            variables=options_with_runtime_env,
            parameters=PARAMETERS,
            dataflow_template=TEMPLATE)
        body = {
            "jobName": mock.ANY,
            "parameters": PARAMETERS,
            "environment": RUNTIME_ENV
        }
        method.assert_called_once_with(
            projectId=options_with_runtime_env['project'],
            location='us-central1',
            gcsPath=TEMPLATE,
            body=body,
        )
Example #10
    def execute(self, context):
        """Execute the python dataflow job."""
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        dataflow_options = self.dataflow_default_options.copy()
        dataflow_options.update(self.options)
        # Convert argument names from lowerCamelCase to snake case.
        camel_to_snake = lambda name: re.sub(
            r'[A-Z]', lambda x: '_' + x.group(0).lower(), name)
        formatted_options = {
            camel_to_snake(key): dataflow_options[key]
            for key in dataflow_options
        }
        hook.start_python_dataflow(self.task_id, formatted_options,
                                   self.py_file, self.py_options)
Example #11
    def __init__(self, task_run):
        super(DataFlowJobCtrl, self).__init__(task_run=task_run)
        self.dataflow_config = task_run.task.beam_engine  # type: DataflowConfig

        gcp_conn_id = self.task_env.conn_id

        from airflow.contrib.hooks.gcp_dataflow_hook import DataFlowHook

        self._gcp_dataflow_hook = DataFlowHook(
            gcp_conn_id=gcp_conn_id, delegate_to=self.task_env.delegate_to)
        if self.dataflow_config.temp_location:
            # override sync location with temp_location
            self.remote_sync_root = self.dataflow_config.temp_location

        self.current_dataflow_job_id = None
Example #12
class DataFlowTemplateHookTest(unittest.TestCase):

    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('DataFlowHook._start_template_dataflow'))
    def test_start_template_dataflow(self, internal_dataflow_mock):
        self.dataflow_hook.start_template_dataflow(
            job_name=JOB_NAME, variables=DATAFLOW_OPTIONS_TEMPLATE, parameters=PARAMETERS,
            dataflow_template=TEMPLATE)
        options_with_region = {'region': 'us-central1'}
        options_with_region.update(DATAFLOW_OPTIONS_TEMPLATE)
        internal_dataflow_mock.assert_called_once_with(
            mock.ANY, options_with_region, PARAMETERS, TEMPLATE)

    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_template_dataflow_with_runtime_env(self, mock_conn, mock_dataflowjob):
        dataflow_options_template = copy.deepcopy(DATAFLOW_OPTIONS_TEMPLATE)
        options_with_runtime_env = copy.deepcopy(RUNTIME_ENV)
        options_with_runtime_env.update(dataflow_options_template)

        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        method = (mock_conn.return_value
                  .projects.return_value
                  .locations.return_value
                  .templates.return_value
                  .launch)

        self.dataflow_hook.start_template_dataflow(
            job_name=JOB_NAME,
            variables=options_with_runtime_env,
            parameters=PARAMETERS,
            dataflow_template=TEMPLATE
        )
        body = {"jobName": mock.ANY,
                "parameters": PARAMETERS,
                "environment": RUNTIME_ENV
                }
        method.assert_called_once_with(
            projectId=options_with_runtime_env['project'],
            location='us-central1',
            gcsPath=TEMPLATE,
            body=body,
        )
Example #13
class DataFlowTemplateHookTest(unittest.TestCase):

    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('DataFlowHook._start_template_dataflow'))
    def test_start_template_dataflow(self, internal_dataflow_mock):
        self.dataflow_hook.start_template_dataflow(
            job_name=JOB_NAME, variables=DATAFLOW_OPTIONS_TEMPLATE, parameters=PARAMETERS,
            dataflow_template=TEMPLATE)
        options_with_region = {'region': 'us-central1'}
        options_with_region.update(DATAFLOW_OPTIONS_TEMPLATE)
        internal_dataflow_mock.assert_called_once_with(
            mock.ANY, options_with_region, PARAMETERS, TEMPLATE)
Example #14
class DataFlowTemplateHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('DataFlowHook._start_template_dataflow'))
    def test_start_template_dataflow(self, internal_dataflow_mock):
        self.dataflow_hook.start_template_dataflow(
            task_id=TASK_ID,
            variables=DATAFLOW_OPTIONS_TEMPLATE,
            parameters=PARAMETERS,
            dataflow_template=TEMPLATE)
        internal_dataflow_mock.assert_called_once_with(
            mock.ANY, DATAFLOW_OPTIONS_TEMPLATE, PARAMETERS, TEMPLATE)
Example #15
class DataFlowJobCtrl(ApacheBeamJobCtrl):
    def __init__(self, task_run):
        super(DataFlowJobCtrl, self).__init__(task_run=task_run)
        self.dataflow_config = task_run.task.beam_engine  # type: DataflowConfig

        gcp_conn_id = self.task_env.conn_id

        from airflow.contrib.hooks.gcp_dataflow_hook import DataFlowHook

        self._gcp_dataflow_hook = DataFlowHook(
            gcp_conn_id=gcp_conn_id, delegate_to=self.task_env.delegate_to)
        if self.dataflow_config.temp_location:
            # override sync location with temp_location
            self.remote_sync_root = self.dataflow_config.temp_location

        self.current_dataflow_job_id = None

    def _get_base_options(self):
        options = super(DataFlowJobCtrl, self)._get_base_options()
        dfc = self.dataflow_config

        options.update(dfc.options)

        options.setdefault("runner", dfc.runner)
        options.setdefault("region", dfc.region)
        options.setdefault("project", dfc.project)
        options.setdefault("tempLocation", dfc.temp_location)

        return options

    def _process_dataflow_log(self, msg):
        msg = msg.strip()
        if self.current_dataflow_job_id is None:
            matched_job = _DATAFLOW_ID_REGEXP.search(msg)
            if matched_job:
                self.current_dataflow_job_id = matched_job.group(1)
                logger.info("Found dataflow job id '%s'",
                            self.current_dataflow_job_id)
        logger.info(msg)

    def _run_cmd(self, cmd):
        dfc = self.dataflow_config

        from airflow.contrib.hooks.gcp_dataflow_hook import _DataflowJob

        run_cmd(
            cmd,
            name="dataflow %s" % self.task_run.job_name,
            stdout_handler=self._process_dataflow_log,
        )

        _DataflowJob(
            self._gcp_dataflow_hook.get_conn(),
            dfc.project,
            self.task_run.job_id,
            dfc.region,
            dfc.poll_sleep,
            self.current_dataflow_job_id,
        ).wait_for_done()
Example #16
    def execute(self, context):
        """Execute the python dataflow job."""
        bucket_helper = GoogleCloudBucketHelper(
            self.gcp_conn_id, self.delegate_to)
        self.py_file = bucket_helper.google_cloud_to_local(self.py_file)
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        dataflow_options = self.dataflow_default_options.copy()
        dataflow_options.update(self.options)
        # Convert argument names from lowerCamelCase to snake case.
        camel_to_snake = lambda name: re.sub(
            r'[A-Z]', lambda x: '_' + x.group(0).lower(), name)
        formatted_options = {camel_to_snake(key): dataflow_options[key]
                             for key in dataflow_options}
        hook.start_python_dataflow(
            self.task_id, formatted_options,
            self.py_file, self.py_options)
Example #17
    def execute(self, context):
        """Execute the python dataflow job."""
        bucket_helper = GoogleCloudBucketHelper(
            self.gcp_conn_id, self.delegate_to)
        self.py_file = bucket_helper.google_cloud_to_local(self.py_file)
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)
        dataflow_options = self.dataflow_default_options.copy()
        dataflow_options.update(self.options)
        # Convert argument names from lowerCamelCase to snake case.
        camel_to_snake = lambda name: re.sub(
            r'[A-Z]', lambda x: '_' + x.group(0).lower(), name)
        formatted_options = {camel_to_snake(key): dataflow_options[key]
                             for key in dataflow_options}
        hook.start_python_dataflow(
            self.job_name, formatted_options,
            self.py_file, self.py_options)
Example #18
class DataFlowHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('DataFlowHook._start_dataflow'))
    def test_start_python_dataflow(self, internal_dataflow_mock):
        self.dataflow_hook.start_python_dataflow(task_id=TASK_ID,
                                                 variables=OPTIONS,
                                                 dataflow=PY_FILE,
                                                 py_options=PY_OPTIONS)
        internal_dataflow_mock.assert_called_once_with(TASK_ID, OPTIONS,
                                                       PY_FILE, mock.ANY,
                                                       ['python'] + PY_OPTIONS)

    @mock.patch('airflow.contrib.hooks.gcp_dataflow_hook._Dataflow.log')
    @mock.patch('subprocess.Popen')
    @mock.patch('select.select')
    def test_dataflow_wait_for_done_logging(self, mock_select, mock_popen,
                                            mock_logging):
        mock_logging.info = MagicMock()
        mock_logging.warning = MagicMock()
        mock_proc = MagicMock()
        mock_proc.stderr = MagicMock()
        mock_proc.stderr.readlines = MagicMock(
            return_value=['test\n', 'error\n'])
        mock_stderr_fd = MagicMock()
        mock_proc.stderr.fileno = MagicMock(return_value=mock_stderr_fd)
        mock_proc_poll = MagicMock()
        mock_select.return_value = [[mock_stderr_fd]]

        def poll_resp_error():
            mock_proc.return_code = 1
            return True

        mock_proc_poll.side_effect = [None, poll_resp_error]
        mock_proc.poll = mock_proc_poll
        mock_popen.return_value = mock_proc
        dataflow = _Dataflow(['test', 'cmd'])
        mock_logging.info.assert_called_with('Running command: %s', 'test cmd')
        self.assertRaises(Exception, dataflow.wait_for_done)
        mock_logging.warning.assert_has_calls([call('test'), call('error')])
Example #19
class DataFlowHookTest(unittest.TestCase):

    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('DataFlowHook._start_dataflow'))
    def test_start_python_dataflow(self, internal_dataflow_mock):
        self.dataflow_hook.start_python_dataflow(
            task_id=TASK_ID, variables=OPTIONS,
            dataflow=PY_FILE, py_options=PY_OPTIONS)
        internal_dataflow_mock.assert_called_once_with(
            TASK_ID, OPTIONS, PY_FILE, mock.ANY, ['python'] + PY_OPTIONS)

    @mock.patch('airflow.contrib.hooks.gcp_dataflow_hook._Dataflow.log')
    @mock.patch('subprocess.Popen')
    @mock.patch('select.select')
    def test_dataflow_wait_for_done_logging(self, mock_select, mock_popen, mock_logging):
        mock_logging.info = MagicMock()
        mock_logging.warning = MagicMock()
        mock_proc = MagicMock()
        mock_proc.stderr = MagicMock()
        mock_proc.stderr.readlines = MagicMock(
            return_value=['test\n', 'error\n'])
        mock_stderr_fd = MagicMock()
        mock_proc.stderr.fileno = MagicMock(return_value=mock_stderr_fd)
        mock_proc_poll = MagicMock()
        mock_select.return_value = [[mock_stderr_fd]]

        def poll_resp_error():
            mock_proc.return_code = 1
            return True

        mock_proc_poll.side_effect = [None, poll_resp_error]
        mock_proc.poll = mock_proc_poll
        mock_popen.return_value = mock_proc
        dataflow = _Dataflow(['test', 'cmd'])
        mock_logging.info.assert_called_with('Running command: %s', 'test cmd')
        self.assertRaises(Exception, dataflow.wait_for_done)
        mock_logging.warning.assert_has_calls([call('test'), call('error')])
Example #20
    def execute(self, context):
        bucket_helper = GoogleCloudBucketHelper(self.gcp_conn_id,
                                                self.delegate_to)
        self.jar = bucket_helper.google_cloud_to_local(self.jar)
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)

        dataflow_options = copy.copy(self.dataflow_default_options)
        dataflow_options.update(self.options)
        # Legacy code for xcom key
        if 'xcom_key' in dataflow_options:
            value = context['task_instance'].xcom_pull(
                key=dataflow_options['xcom_key'])
            dataflow_options['queryParameters'] = value
            del dataflow_options['xcom_key']

        # Code for xcom_keys (with sanity check below)
        if self.xcom_element_list is not None:
            for xcom_element in self.xcom_element_list:
                # Sanity check: all three fields must be present.
                if all(key in xcom_element for key in
                       ['xcom_key', 'task_id', 'dataflow_par_name']):

                    pulled_xcom_value = \
                        context['task_instance'].xcom_pull(key=xcom_element['xcom_key'],
                                                           task_ids=xcom_element['task_id'])
                    dataflow_options[
                        xcom_element['dataflow_par_name']] = pulled_xcom_value
                else:
                    raise Exception(
                        "ERROR: one of the fields ['xcom_key', 'task_id', 'dataflow_par_name']"
                        " is missing")

        print("dataflow_options: ", dataflow_options)
        hook.start_java_dataflow(self.job_name, dataflow_options, self.jar,
                                 self.job_class)
Example #21
    def execute(self, context):
        hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            poll_sleep=self.poll_sleep)
        dataflow_options = copy.copy(self.dataflow_default_options)
        dataflow_options.update(self.options)
        is_running = False
        if self.check_if_running != CheckJobRunning.IgnoreJob:
            is_running = hook.is_job_dataflow_running(self.job_name, dataflow_options)
            while is_running and self.check_if_running == CheckJobRunning.WaitForRun:
                is_running = hook.is_job_dataflow_running(self.job_name, dataflow_options)

        if not is_running:
            bucket_helper = GoogleCloudBucketHelper(
                self.gcp_conn_id, self.delegate_to)
            self.jar = bucket_helper.google_cloud_to_local(self.jar)
            hook.start_java_dataflow(self.job_name, dataflow_options,
                                     self.jar, self.job_class, True, self.multiple_jobs)
Example #22
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')
Example #23
class DataFlowPythonHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_python_dataflow(self, mock_conn, mock_dataflow,
                                   mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_python_dataflow(task_id=TASK_ID,
                                                 variables=DATAFLOW_OPTIONS_PY,
                                                 dataflow=PY_FILE,
                                                 py_options=PY_OPTIONS)
        EXPECTED_CMD = [
            'python', '-m', PY_FILE, '--runner=DataflowRunner',
            '--project=test', '--labels=foo=bar',
            '--staging_location=gs://test/staging',
            '--job_name={}-{}'.format(TASK_ID, MOCK_UUID)
        ]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow(self, mock_conn, mock_dataflow,
                                 mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(task_id=TASK_ID,
                                               variables=DATAFLOW_OPTIONS_JAVA,
                                               dataflow=JAR_FILE)
        EXPECTED_CMD = [
            'java', '-jar', JAR_FILE, '--runner=DataflowRunner',
            '--project=test', '--stagingLocation=gs://test/staging',
            '--labels={"foo":"bar"}',
            '--jobName={}-{}'.format(TASK_ID, MOCK_UUID)
        ]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch('airflow.contrib.hooks.gcp_dataflow_hook._Dataflow.log')
    @mock.patch('subprocess.Popen')
    @mock.patch('select.select')
    def test_dataflow_wait_for_done_logging(self, mock_select, mock_popen,
                                            mock_logging):
        mock_logging.info = MagicMock()
        mock_logging.warning = MagicMock()
        mock_proc = MagicMock()
        mock_proc.stderr = MagicMock()
        mock_proc.stderr.readlines = MagicMock(
            return_value=['test\n', 'error\n'])
        mock_stderr_fd = MagicMock()
        mock_proc.stderr.fileno = MagicMock(return_value=mock_stderr_fd)
        mock_proc_poll = MagicMock()
        mock_select.return_value = [[mock_stderr_fd]]

        def poll_resp_error():
            mock_proc.return_code = 1
            return True

        mock_proc_poll.side_effect = [None, poll_resp_error]
        mock_proc.poll = mock_proc_poll
        mock_popen.return_value = mock_proc
        dataflow = _Dataflow(['test', 'cmd'])
        mock_logging.info.assert_called_with('Running command: %s', 'test cmd')
        self.assertRaises(Exception, dataflow.wait_for_done)
        mock_logging.warning.assert_has_calls([call('test'), call('error')])
Example #24
class DataFlowHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid4'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_python_dataflow(self, mock_conn, mock_dataflow,
                                   mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_python_dataflow(job_name=JOB_NAME,
                                                 variables=DATAFLOW_OPTIONS_PY,
                                                 dataflow=PY_FILE,
                                                 py_options=PY_OPTIONS)
        EXPECTED_CMD = [
            'python2', '-m', PY_FILE, '--region=us-central1',
            '--runner=DataflowRunner', '--project=test', '--labels=foo=bar',
            '--staging_location=gs://test/staging',
            '--job_name={}-{}'.format(JOB_NAME, MOCK_UUID)
        ]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid4'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow(self, mock_conn, mock_dataflow,
                                 mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(job_name=JOB_NAME,
                                               variables=DATAFLOW_OPTIONS_JAVA,
                                               dataflow=JAR_FILE)
        EXPECTED_CMD = [
            'java', '-jar', JAR_FILE, '--region=us-central1',
            '--runner=DataflowRunner', '--project=test',
            '--stagingLocation=gs://test/staging', '--labels={"foo":"bar"}',
            '--jobName={}-{}'.format(JOB_NAME, MOCK_UUID)
        ]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid4'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow_with_job_class(self, mock_conn, mock_dataflow,
                                                mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(job_name=JOB_NAME,
                                               variables=DATAFLOW_OPTIONS_JAVA,
                                               dataflow=JAR_FILE,
                                               job_class=JOB_CLASS)
        EXPECTED_CMD = [
            'java', '-cp', JAR_FILE, JOB_CLASS, '--region=us-central1',
            '--runner=DataflowRunner', '--project=test',
            '--stagingLocation=gs://test/staging', '--labels={"foo":"bar"}',
            '--jobName={}-{}'.format(JOB_NAME, MOCK_UUID)
        ]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch('airflow.contrib.hooks.gcp_dataflow_hook._Dataflow.log')
    @mock.patch('subprocess.Popen')
    @mock.patch('select.select')
    def test_dataflow_wait_for_done_logging(self, mock_select, mock_popen,
                                            mock_logging):
        mock_logging.info = MagicMock()
        mock_logging.warning = MagicMock()
        mock_proc = MagicMock()
        mock_proc.stderr = MagicMock()
        mock_proc.stderr.readlines = MagicMock(
            return_value=['test\n', 'error\n'])
        mock_stderr_fd = MagicMock()
        mock_proc.stderr.fileno = MagicMock(return_value=mock_stderr_fd)
        mock_proc_poll = MagicMock()
        mock_select.return_value = [[mock_stderr_fd]]

        def poll_resp_error():
            mock_proc.return_code = 1
            return True

        mock_proc_poll.side_effect = [None, poll_resp_error]
        mock_proc.poll = mock_proc_poll
        mock_popen.return_value = mock_proc
        dataflow = _Dataflow(['test', 'cmd'])
        mock_logging.info.assert_called_with('Running command: %s', 'test cmd')
        self.assertRaises(Exception, dataflow.wait_for_done)

    def test_valid_dataflow_job_name(self):
        job_name = self.dataflow_hook._build_dataflow_job_name(
            job_name=JOB_NAME, append_job_name=False)

        self.assertEqual(job_name, JOB_NAME)

    def test_fix_underscore_in_job_name(self):
        job_name_with_underscore = 'test_example'
        fixed_job_name = job_name_with_underscore.replace('_', '-')
        job_name = self.dataflow_hook._build_dataflow_job_name(
            job_name=job_name_with_underscore, append_job_name=False)

        self.assertEqual(job_name, fixed_job_name)

    def test_invalid_dataflow_job_name(self):
        invalid_job_name = '9test_invalid_name'
        fixed_name = invalid_job_name.replace('_', '-')

        with self.assertRaises(ValueError) as e:
            self.dataflow_hook._build_dataflow_job_name(
                job_name=invalid_job_name, append_job_name=False)
        # Test whether the job_name is present in the error message
        self.assertIn('Invalid job_name ({})'.format(fixed_name),
                      str(e.exception))

    def test_dataflow_job_regex_check(self):

        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(job_name='df-job-1',
                                                        append_job_name=False),
            'df-job-1')

        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(job_name='df-job',
                                                        append_job_name=False),
            'df-job')

        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(job_name='dfjob',
                                                        append_job_name=False),
            'dfjob')

        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(job_name='dfjob1',
                                                        append_job_name=False),
            'dfjob1')

        self.assertRaises(ValueError,
                          self.dataflow_hook._build_dataflow_job_name,
                          job_name='1dfjob',
                          append_job_name=False)

        self.assertRaises(ValueError,
                          self.dataflow_hook._build_dataflow_job_name,
                          job_name='dfjob@',
                          append_job_name=False)

        self.assertRaises(ValueError,
                          self.dataflow_hook._build_dataflow_job_name,
                          job_name='df^jo',
                          append_job_name=False)
Example #25
class DataFlowHookTest(unittest.TestCase):

    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_python_dataflow(self, mock_conn,
                                   mock_dataflow, mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_python_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_PY,
            dataflow=PY_FILE, py_options=PY_OPTIONS)
        EXPECTED_CMD = ['python', '-m', PY_FILE,
                        '--runner=DataflowRunner', '--project=test',
                        '--labels=foo=bar',
                        '--staging_location=gs://test/staging',
                        '--job_name={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow(self, mock_conn,
                                 mock_dataflow, mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_JAVA,
            dataflow=JAR_FILE)
        EXPECTED_CMD = ['java', '-jar', JAR_FILE,
                        '--runner=DataflowRunner', '--project=test',
                        '--stagingLocation=gs://test/staging',
                        '--labels={"foo":"bar"}',
                        '--jobName={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow_with_job_class(
            self, mock_conn, mock_dataflow, mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_JAVA,
            dataflow=JAR_FILE, job_class=JOB_CLASS)
        EXPECTED_CMD = ['java', '-cp', JAR_FILE, JOB_CLASS,
                        '--runner=DataflowRunner', '--project=test',
                        '--stagingLocation=gs://test/staging',
                        '--labels={"foo":"bar"}',
                        '--jobName={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))


    @mock.patch('airflow.contrib.hooks.gcp_dataflow_hook._Dataflow.log')
    @mock.patch('subprocess.Popen')
    @mock.patch('select.select')
    def test_dataflow_wait_for_done_logging(self, mock_select, mock_popen, mock_logging):
        mock_logging.info = MagicMock()
        mock_logging.warning = MagicMock()
        mock_proc = MagicMock()
        mock_proc.stderr = MagicMock()
        mock_proc.stderr.readlines = MagicMock(
            return_value=['test\n', 'error\n'])
        mock_stderr_fd = MagicMock()
        mock_proc.stderr.fileno = MagicMock(return_value=mock_stderr_fd)
        mock_proc_poll = MagicMock()
        mock_select.return_value = [[mock_stderr_fd]]

        def poll_resp_error():
            mock_proc.return_code = 1
            return True

        mock_proc_poll.side_effect = [None, poll_resp_error]
        mock_proc.poll = mock_proc_poll
        mock_popen.return_value = mock_proc
        dataflow = _Dataflow(['test', 'cmd'])
        mock_logging.info.assert_called_with('Running command: %s', 'test cmd')
        self.assertRaises(Exception, dataflow.wait_for_done)
        mock_logging.warning.assert_has_calls([call('test'), call('error')])
Example #26
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')
Example #27
class DataFlowHookTest(unittest.TestCase):

    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_python_dataflow(self, mock_conn,
                                   mock_dataflow, mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_python_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_PY,
            dataflow=PY_FILE, py_options=PY_OPTIONS)
        EXPECTED_CMD = ['python', '-m', PY_FILE,
                        '--region=us-central1',
                        '--runner=DataflowRunner', '--project=test',
                        '--labels=foo=bar',
                        '--staging_location=gs://test/staging',
                        '--job_name={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow(self, mock_conn,
                                 mock_dataflow, mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_JAVA,
            dataflow=JAR_FILE)
        EXPECTED_CMD = ['java', '-jar', JAR_FILE,
                        '--region=us-central1',
                        '--runner=DataflowRunner', '--project=test',
                        '--stagingLocation=gs://test/staging',
                        '--labels={"foo":"bar"}',
                        '--jobName={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow_with_job_class(
            self, mock_conn, mock_dataflow, mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_JAVA,
            dataflow=JAR_FILE, job_class=JOB_CLASS)
        EXPECTED_CMD = ['java', '-cp', JAR_FILE, JOB_CLASS,
                        '--region=us-central1',
                        '--runner=DataflowRunner', '--project=test',
                        '--stagingLocation=gs://test/staging',
                        '--labels={"foo":"bar"}',
                        '--jobName={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))


    @mock.patch('airflow.contrib.hooks.gcp_dataflow_hook._Dataflow.log')
    @mock.patch('subprocess.Popen')
    @mock.patch('select.select')
    def test_dataflow_wait_for_done_logging(self, mock_select, mock_popen, mock_logging):
        mock_logging.info = MagicMock()
        mock_logging.warning = MagicMock()
        mock_proc = MagicMock()
        mock_proc.stderr = MagicMock()
        mock_proc.stderr.readlines = MagicMock(
            return_value=['test\n', 'error\n'])
        mock_stderr_fd = MagicMock()
        mock_proc.stderr.fileno = MagicMock(return_value=mock_stderr_fd)
        mock_proc_poll = MagicMock()
        mock_select.return_value = [[mock_stderr_fd]]

        def poll_resp_error():
            mock_proc.return_code = 1
            return True

        mock_proc_poll.side_effect = [None, poll_resp_error]
        mock_proc.poll = mock_proc_poll
        mock_popen.return_value = mock_proc
        dataflow = _Dataflow(['test', 'cmd'])
        mock_logging.info.assert_called_with('Running command: %s', 'test cmd')
        self.assertRaises(Exception, dataflow.wait_for_done)
        mock_logging.warning.assert_has_calls([call('test'), call('error')])

    def test_valid_dataflow_job_name(self):
        job_name = self.dataflow_hook._build_dataflow_job_name(
            task_id=TASK_ID, append_job_name=False
        )

        self.assertEqual(job_name, TASK_ID)

    def test_fix_underscore_in_task_id(self):
        task_id_with_underscore = 'test_example'
        fixed_job_name = task_id_with_underscore.replace(
            '_', '-'
        )
        job_name = self.dataflow_hook._build_dataflow_job_name(
            task_id=task_id_with_underscore, append_job_name=False
        )

        self.assertEqual(job_name, fixed_job_name)

    def test_invalid_dataflow_job_name(self):
        invalid_job_name = '9test_invalid_name'
        fixed_name = invalid_job_name.replace(
            '_', '-')

        with self.assertRaises(AssertionError) as e:
            self.dataflow_hook._build_dataflow_job_name(
                task_id=invalid_job_name, append_job_name=False
            )
        # Test whether the job_name is present in the error message
        self.assertIn('Invalid job_name ({})'.format(fixed_name),
                      str(e.exception))

    def test_dataflow_job_regex_check(self):

        self.assertEqual(self.dataflow_hook._build_dataflow_job_name(
            task_id='df-job-1', append_job_name=False
        ), 'df-job-1')

        self.assertEqual(self.dataflow_hook._build_dataflow_job_name(
            task_id='df-job', append_job_name=False
        ), 'df-job')

        self.assertEqual(self.dataflow_hook._build_dataflow_job_name(
            task_id='dfjob', append_job_name=False
        ), 'dfjob')

        self.assertEqual(self.dataflow_hook._build_dataflow_job_name(
            task_id='dfjob1', append_job_name=False
        ), 'dfjob1')

        self.assertRaises(
            AssertionError,
            self.dataflow_hook._build_dataflow_job_name,
            task_id='1dfjob', append_job_name=False
        )

        self.assertRaises(
            AssertionError,
            self.dataflow_hook._build_dataflow_job_name,
            task_id='dfjob@', append_job_name=False
        )

        self.assertRaises(
            AssertionError,
            self.dataflow_hook._build_dataflow_job_name,
            task_id='df^jo', append_job_name=False
        )