Example #1
0
    def test_get_input_output_data_sources(self, ds):
        """Data source ids set on the job execution should resolve to the
        mocked data source objects returned by the conductor."""
        def _fake_data_source_get(ctx, ds_id):
            return mock.Mock(id=ds_id, url="hdfs://obj_" + ds_id,
                             type='hdfs')

        ds.side_effect = _fake_data_source_get

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG)
        job_exec.input_id = 's1'
        job_exec.output_id = 's2'

        sources = job_utils.get_input_output_data_sources(job_exec, job, {})
        input_source, output_source = sources

        self.assertEqual('hdfs://obj_s1', input_source.url)
        self.assertEqual('hdfs://obj_s2', output_source.url)
Example #2
0
    def test_get_input_output_data_sources(self, ds):
        """The resolved input/output sources should carry the urls built by
        the stubbed conductor lookup."""
        ds.side_effect = (
            lambda ctx, ds_id: mock.Mock(id=ds_id,
                                         url="hdfs://obj_" + ds_id,
                                         type='hdfs'))

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG)

        job_exec.input_id = 's1'
        job_exec.output_id = 's2'

        input_source, output_source = job_utils.get_input_output_data_sources(
            job_exec, job, {})

        self.assertEqual('hdfs://obj_s1', input_source.url)
        self.assertEqual('hdfs://obj_s2', output_source.url)
Example #3
0
    def test_get_input_output_data_sources_with_null_id(self):
        """When both input_id and output_id are unset, both resolved data
        sources should be None."""
        swift_creds = {sw.HADOOP_SWIFT_USERNAME: '******',
                       sw.HADOOP_SWIFT_PASSWORD: '******'}
        configs = {'configs': swift_creds,
                   'args': ['hdfs://ex/i', 'output_path']}

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs)

        job_exec.input_id = None
        job_exec.output_id = None

        sources = job_utils.get_input_output_data_sources(job_exec, job, {})
        input_source, output_source = sources

        self.assertIsNone(input_source)
        self.assertIsNone(output_source)
Example #4
0
    def test_get_input_output_data_sources_with_null_id(self):
        """A Java job with no input/output data source ids should yield
        None for both resolved sources."""
        configs = {
            'configs': {
                sw.HADOOP_SWIFT_USERNAME: '******',
                sw.HADOOP_SWIFT_PASSWORD: '******',
            },
            'args': ['hdfs://ex/i', 'output_path'],
        }

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs)
        job_exec.input_id = None
        job_exec.output_id = None

        input_source, output_source = job_utils.get_input_output_data_sources(
            job_exec, job, {})

        self.assertIsNone(input_source)
        self.assertIsNone(output_source)
Example #5
0
File: engine.py  Project: madar010/mad
    def _prepare_run_job(self, job_execution):
        """Resolve data sources, build the workflow XML and stage it to
        HDFS, returning everything needed to launch the job via Oozie.

        :param job_execution: the job execution to prepare; it is re-read
            from and updated through the conductor during preparation
        :returns: dict with keys 'context', 'hdfs_user',
            'path_to_workflow', 'use_hbase_lib', 'job_execution',
            'oozie_params', 'wf_dir' and 'oozie_server'
        """
        ctx = context.ctx()

        # This will be a dictionary of tuples, (native_url, runtime_url)
        # keyed by data_source id
        data_source_urls = {}

        prepared_job_params = {}

        job = conductor.job_get(ctx, job_execution.job_id)

        # Resolve the explicit input/output data sources referenced by the
        # job execution (either may come back as None).
        input_source, output_source = job_utils.get_input_output_data_sources(
            job_execution, job, data_source_urls, self.cluster)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls,
                                                     self.cluster)
        )

        # Persist the native urls on the job execution before switching the
        # local mapping to runtime urls below.
        job_execution = conductor.job_execution_update(
            ctx, job_execution,
            {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

        # Now that we've recorded the native urls, we can switch to the
        # runtime urls
        data_source_urls = job_utils.to_url_dict(data_source_urls,
                                                 runtime=True)

        data_sources = additional_sources + [input_source, output_source]
        job_utils.prepare_cluster_for_ds(data_sources,
                                         self.cluster, updated_job_configs,
                                         data_source_urls)

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})
        # NOTE(review): the default here is {} (falsy); presumably
        # use_hbase_lib is consumed as a boolean flag downstream -- confirm.
        use_hbase_lib = configs.get('edp.hbase_common_lib', {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith('oozie.'):
                oozie_params[k] = configs[k]

        # Make sure the cluster can reach any external HDFS referenced in
        # the original (pre-substitution) job configs.
        external_hdfs_urls = self._resolve_external_hdfs_urls(
            job_execution.job_configs)
        for url in external_hdfs_urls:
            h.configure_cluster_for_hdfs(self.cluster, url)

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        # Stage the job binaries and the generated workflow into a fresh
        # workflow directory on HDFS.
        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs,
            input_source, output_source,
            hdfs_user, data_source_urls)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        prepared_job_params['context'] = ctx
        prepared_job_params['hdfs_user'] = hdfs_user
        prepared_job_params['path_to_workflow'] = path_to_workflow
        prepared_job_params['use_hbase_lib'] = use_hbase_lib
        prepared_job_params['job_execution'] = job_execution
        prepared_job_params['oozie_params'] = oozie_params
        prepared_job_params['wf_dir'] = wf_dir
        prepared_job_params['oozie_server'] = oozie_server

        return prepared_job_params