def run_job(self, job_execution):
    """Submit an Oozie workflow for the given job execution.

    Resolves data sources, configures the cluster for any HDFS sources,
    uploads the job files and generated workflow XML to HDFS, and submits
    the job through the Oozie client.

    :param job_execution: the job execution object to run
    :returns: a (oozie_job_id, status, None) tuple; (None,
              edp.JOB_STATUS_KILLED, None) if the execution was marked
              to-be-killed before submission completed
    """
    ctx = context.ctx()
    job = conductor.job_get(ctx, job_execution.job_id)
    input_source, output_source = job_utils.get_data_sources(
        job_execution, job)

    # Updated_job_configs will be a copy of job_execution.job_configs with
    # any name or uuid references to data_sources resolved to paths
    # assuming substitution is enabled.
    # If substitution is not enabled then updated_job_configs will
    # just be a reference to job_execution.job_configs to avoid a copy.
    # Additional_sources will be a list of any data_sources found.
    additional_sources, updated_job_configs = (
        job_utils.resolve_data_source_references(
            job_execution.job_configs))

    proxy_configs = updated_job_configs.get('proxy_configs')
    configs = updated_job_configs.get('configs', {})

    # One HDFS configuration pass covers the whole cluster, so stop at
    # the first HDFS data source found.
    for data_source in [input_source, output_source] + additional_sources:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(self.cluster, data_source)
            break

    hdfs_user = self.get_hdfs_user()

    # TODO(tmckay): this should probably be "get_namenode"
    # but that call does not exist in the oozie engine api now.
    oozie_server = self.get_oozie_server(self.cluster)

    wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
    self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                   proxy_configs)

    wf_xml = workflow_factory.get_workflow_xml(
        job, self.cluster, updated_job_configs, input_source, output_source,
        hdfs_user)

    path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                  wf_xml, hdfs_user)

    job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow)

    client = self._get_client()
    oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                  job_execution)

    # Re-read the execution: a kill request may have arrived while the
    # workflow was being prepared, in which case we never start the job.
    job_execution = conductor.job_execution_get(ctx, job_execution.id)
    if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
        return (None, edp.JOB_STATUS_KILLED, None)

    client.run_job(job_execution, oozie_job_id)
    try:
        status = client.get_job_status(job_execution,
                                       oozie_job_id)['status']
    except Exception:
        # Best-effort status read; the job was submitted successfully,
        # so return None status rather than failing the submission.
        status = None
    return (oozie_job_id, status, None)
def test_get_data_sources(self, ds):
    """Input/output ids on the execution resolve through the conductor."""
    job, job_exec = _create_all_stack(edp.JOB_TYPE_PIG)
    job_exec.input_id = 's1'
    job_exec.output_id = 's2'
    ds.side_effect = _conductor_data_source_get

    src_in, src_out = job_utils.get_data_sources(job_exec, job)

    self.assertEqual('obj_s1', src_in)
    self.assertEqual('obj_s2', src_out)
def test_get_data_sources_with_null_id(self):
    """No data sources are returned when input/output ids are unset."""
    creds = {sw.HADOOP_SWIFT_USERNAME: "******",
             sw.HADOOP_SWIFT_PASSWORD: "******"}
    exec_configs = {"configs": creds,
                    "args": ["swift://ex/i", "output_path"]}
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, exec_configs)
    job_exec.input_id = None
    job_exec.output_id = None

    src_in, src_out = job_utils.get_data_sources(job_exec, job, {})

    self.assertIsNone(src_in)
    self.assertIsNone(src_out)
def test_get_data_sources(self, ds):
    """Data source ids resolve to conductor objects carrying urls."""
    ds.side_effect = lambda ctx, id: mock.Mock(id=id, url="obj_" + id)

    job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG)
    job_exec.input_id = "s1"
    job_exec.output_id = "s2"

    src_in, src_out = job_utils.get_data_sources(job_exec, job, {})

    self.assertEqual("obj_s1", src_in.url)
    self.assertEqual("obj_s2", src_out.url)
def test_get_data_sources_java(self):
    """Java job executions carry no input/output data sources."""
    configs = {
        sw.HADOOP_SWIFT_USERNAME: '******',
        sw.HADOOP_SWIFT_PASSWORD: '******'
    }
    configs = {'configs': configs, 'args': ['swift://ex/i', 'output_path']}
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs)
    input_source, output_source = (job_utils.get_data_sources(
        job_exec, job))
    # assertIsNone gives a clearer failure message than
    # assertEqual(None, ...) and is required by OpenStack hacking (H203).
    self.assertIsNone(input_source)
    self.assertIsNone(output_source)
def test_get_data_sources(self, ds):
    """Input/output ids are resolved into data source objects."""
    def _fake_data_source_get(ctx, id):
        return mock.Mock(id=id, url="obj_" + id)

    ds.side_effect = _fake_data_source_get
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG)
    job_exec.input_id = 's1'
    job_exec.output_id = 's2'

    source_in, source_out = job_utils.get_data_sources(job_exec, job, {})

    self.assertEqual('obj_s1', source_in.url)
    self.assertEqual('obj_s2', source_out.url)
def test_get_data_sources(self, ds):
    """get_data_sources returns whatever the conductor hands back."""
    ds.side_effect = lambda ctx, id: "obj_" + id

    job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG)
    job_exec.input_id = 's1'
    job_exec.output_id = 's2'

    src_in, src_out = job_utils.get_data_sources(job_exec, job)

    self.assertEqual('obj_s1', src_in)
    self.assertEqual('obj_s2', src_out)
def test_get_data_sources_java(self):
    """Java job executions carry no input/output data sources."""
    configs = {sw.HADOOP_SWIFT_USERNAME: '******',
               sw.HADOOP_SWIFT_PASSWORD: '******'}
    configs = {
        'configs': configs,
        'args': ['swift://ex/i', 'output_path']
    }
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs)
    input_source, output_source = (
        job_utils.get_data_sources(job_exec, job))
    # assertIsNone over assertEqual(None, ...): clearer failure output
    # and mandated by OpenStack hacking check H203.
    self.assertIsNone(input_source)
    self.assertIsNone(output_source)
def run_job(self, job_execution):
    """Submit an Oozie workflow for the given job execution.

    Configures the cluster for HDFS data sources, uploads job files and
    the generated workflow XML to HDFS, then submits and starts the job
    via the Oozie client.

    :param job_execution: the job execution object to run
    :returns: a (oozie_job_id, status, None) tuple; (None,
              edp.JOB_STATUS_KILLED, None) if a kill was requested
              before the job was started
    """
    ctx = context.ctx()
    job = conductor.job_get(ctx, job_execution.job_id)
    input_source, output_source = job_utils.get_data_sources(job_execution,
                                                             job)
    proxy_configs = job_execution.job_configs.get('proxy_configs')

    # One HDFS configuration pass covers the cluster, so stop at the
    # first HDFS data source found.
    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(self.cluster, data_source)
            break

    hdfs_user = self.get_hdfs_user()

    # TODO(tmckay): this should probably be "get_namenode"
    # but that call does not exist in the oozie engine api now.
    oozie_server = self.get_oozie_server(self.cluster)

    wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
    self._upload_job_files_to_hdfs(oozie_server, wf_dir, job,
                                   proxy_configs)

    wf_xml = workflow_factory.get_workflow_xml(
        job, self.cluster, job_execution, input_source, output_source,
        hdfs_user)

    path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                  wf_xml, hdfs_user)

    job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow)

    client = self._get_client()
    oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                  job_execution)

    # Re-read the execution: a kill request may have arrived during
    # preparation, in which case the job is never started.
    job_execution = conductor.job_execution_get(ctx, job_execution.id)
    if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
        return (None, edp.JOB_STATUS_KILLED, None)

    client.run_job(job_execution, oozie_job_id)
    try:
        status = client.get_job_status(job_execution,
                                       oozie_job_id)['status']
    except Exception:
        # Best-effort status read; submission already succeeded.
        status = None
    return (oozie_job_id, status, None)
def test_get_data_sources_with_null_id(self):
    """Null input/output ids yield no data sources."""
    swift_creds = {sw.HADOOP_SWIFT_USERNAME: '******',
                   sw.HADOOP_SWIFT_PASSWORD: '******'}
    configs = {'configs': swift_creds,
               'args': ['swift://ex/i', 'output_path']}
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs)
    job_exec.input_id = None
    job_exec.output_id = None

    input_src, output_src = job_utils.get_data_sources(job_exec, job, {})

    self.assertIsNone(input_src)
    self.assertIsNone(output_src)
def run_job(self, job_execution):
    """Submit an Oozie workflow for the given job execution.

    This variant delegates HDFS-user and Oozie-server lookup to the
    plugin, uploads the job files and workflow XML to HDFS, then submits
    and starts the job via the Oozie client.

    :param job_execution: the job execution object to run
    :returns: a (oozie_job_id, status, None) tuple; status is None if it
              could not be read after starting the job
    """
    ctx = context.ctx()
    job = conductor.job_get(ctx, job_execution.job_id)
    input_source, output_source = job_utils.get_data_sources(job_execution,
                                                             job)

    # One HDFS configuration pass covers the cluster, so stop at the
    # first HDFS data source found.
    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(self.cluster, data_source)
            break

    hdfs_user = self.plugin.get_hdfs_user()

    # TODO(tmckay): this should probably be "get_namenode"
    # but that call does not exist in the plugin api now.
    # However, other engines may need it.
    oozie_server = self.plugin.get_oozie_server(self.cluster)

    wf_dir = job_utils.create_hdfs_workflow_dir(oozie_server, job,
                                                hdfs_user)
    job_utils.upload_job_files_to_hdfs(oozie_server, wf_dir, job,
                                       hdfs_user)

    wf_xml = workflow_factory.get_workflow_xml(
        job, self.cluster, job_execution, input_source, output_source)

    path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                  wf_xml, hdfs_user)

    job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow)

    client = self._get_client()
    oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                  job_execution)
    client.run_job(job_execution, oozie_job_id)
    try:
        status = client.get_job_status(job_execution,
                                       oozie_job_id)['status']
    except Exception:
        # Best-effort status read; submission already succeeded.
        status = None
    return (oozie_job_id, status, None)
def test_get_data_sources_with_null_id(self):
    """Unset input/output ids produce (None, None)."""
    auth = {sw.HADOOP_SWIFT_USERNAME: '******',
            sw.HADOOP_SWIFT_PASSWORD: '******'}
    job, job_exec = u.create_job_exec(
        edp.JOB_TYPE_JAVA,
        {'configs': auth, 'args': ['swift://ex/i', 'output_path']})
    job_exec.input_id = None
    job_exec.output_id = None

    for source in job_utils.get_data_sources(job_exec, job, {}):
        self.assertIsNone(source)
def run_job(self, job_execution):
    """Submit an Oozie workflow for the given job execution.

    Resolves data source urls (recording them on the job execution),
    extracts Oozie-specific configs, configures the cluster for HDFS
    sources, uploads job files and workflow XML, then submits and starts
    the job.

    :param job_execution: the job execution object to run
    :returns: a (oozie_job_id, status, None) tuple; (None,
              edp.JOB_STATUS_KILLED, None) if the execution was marked
              to-be-killed before the job was started
    """
    ctx = context.ctx()
    # Maps data_source id -> resolved url; filled in by the calls below.
    data_source_urls = {}
    job = conductor.job_get(ctx, job_execution.job_id)
    input_source, output_source = job_utils.get_data_sources(
        job_execution, job, data_source_urls)

    # Updated_job_configs will be a copy of job_execution.job_configs with
    # any name or uuid references to data_sources resolved to paths
    # assuming substitution is enabled.
    # If substitution is not enabled then updated_job_configs will
    # just be a reference to job_execution.job_configs to avoid a copy.
    # Additional_sources will be a list of any data_sources found.
    additional_sources, updated_job_configs = (
        job_utils.resolve_data_source_references(
            job_execution.job_configs, job_execution.id, data_source_urls))

    # Persist the resolved urls on the execution record.
    job_execution = conductor.job_execution_update(
        ctx, job_execution, {"data_source_urls": data_source_urls})

    proxy_configs = updated_job_configs.get("proxy_configs")
    configs = updated_job_configs.get("configs", {})
    use_hbase_lib = configs.get("edp.hbase_common_lib", {})

    # Extract all the 'oozie.' configs so that they can be set in the
    # job properties file. These are config values for Oozie itself,
    # not the job code
    oozie_params = {}
    for k in list(configs):
        if k.startswith("oozie."):
            oozie_params[k] = configs[k]

    # One HDFS configuration pass covers the cluster, so stop at the
    # first HDFS data source found.
    for data_source in [input_source, output_source] + additional_sources:
        if data_source and data_source.type == "hdfs":
            h.configure_cluster_for_hdfs(
                self.cluster, data_source_urls[data_source.id])
            break

    hdfs_user = self.get_hdfs_user()

    # TODO(tmckay): this should probably be "get_namenode"
    # but that call does not exist in the oozie engine api now.
    oozie_server = self.get_oozie_server(self.cluster)

    wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
    self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                   proxy_configs)

    wf_xml = workflow_factory.get_workflow_xml(
        job, self.cluster, updated_job_configs, input_source, output_source,
        hdfs_user, data_source_urls)

    path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                  wf_xml, hdfs_user)

    job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow,
                                            oozie_params, use_hbase_lib)

    client = self._get_client()
    oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                  job_execution)

    # Re-read the execution: a kill request may have arrived during
    # preparation, in which case the job is never started.
    job_execution = conductor.job_execution_get(ctx, job_execution.id)
    if job_execution.info["status"] == edp.JOB_STATUS_TOBEKILLED:
        return (None, edp.JOB_STATUS_KILLED, None)

    client.run_job(job_execution, oozie_job_id)
    try:
        status = client.get_job_status(job_execution,
                                       oozie_job_id)["status"]
    except Exception:
        # Best-effort status read; submission already succeeded.
        status = None
    return (oozie_job_id, status, None)
def _prepare_run_job(self, job_execution):
    """Prepare everything needed to submit an Oozie workflow.

    Resolves data source urls (recording native urls on the execution,
    then switching to runtime urls), configures the cluster for HDFS
    sources, uploads job files and the generated workflow XML, and
    collects the submission parameters into a dict for the caller.

    :param job_execution: the job execution object being prepared
    :returns: dict with keys 'context', 'hdfs_user', 'path_to_workflow',
              'use_hbase_lib', 'job_execution', 'oozie_params', 'wf_dir'
              and 'oozie_server'
    """
    ctx = context.ctx()

    # This will be a dictionary of tuples, (native_url, runtime_url)
    # keyed by data_source id
    data_source_urls = {}

    prepared_job_params = {}

    job = conductor.job_get(ctx, job_execution.job_id)
    input_source, output_source = job_utils.get_data_sources(
        job_execution, job, data_source_urls, self.cluster)

    # Updated_job_configs will be a copy of job_execution.job_configs with
    # any name or uuid references to data_sources resolved to paths
    # assuming substitution is enabled.
    # If substitution is not enabled then updated_job_configs will
    # just be a reference to job_execution.job_configs to avoid a copy.
    # Additional_sources will be a list of any data_sources found.
    additional_sources, updated_job_configs = (
        job_utils.resolve_data_source_references(job_execution.job_configs,
                                                 job_execution.id,
                                                 data_source_urls,
                                                 self.cluster)
    )

    job_execution = conductor.job_execution_update(
        ctx, job_execution,
        {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

    # Now that we've recorded the native urls, we can switch to the
    # runtime urls
    data_source_urls = job_utils.to_url_dict(data_source_urls,
                                             runtime=True)

    proxy_configs = updated_job_configs.get('proxy_configs')
    configs = updated_job_configs.get('configs', {})
    use_hbase_lib = configs.get('edp.hbase_common_lib', {})

    # Extract all the 'oozie.' configs so that they can be set in the
    # job properties file. These are config values for Oozie itself,
    # not the job code
    oozie_params = {}
    for k in list(configs):
        if k.startswith('oozie.'):
            oozie_params[k] = configs[k]

    # One HDFS configuration pass covers the cluster, so stop at the
    # first HDFS data source found.
    for data_source in [input_source, output_source] + additional_sources:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(
                self.cluster, data_source_urls[data_source.id])
            break

    # Also configure the cluster for any external HDFS urls referenced
    # directly in the job configs.
    external_hdfs_urls = self._resolve_external_hdfs_urls(
        job_execution.job_configs)
    for url in external_hdfs_urls:
        h.configure_cluster_for_hdfs(self.cluster, url)

    hdfs_user = self.get_hdfs_user()

    # TODO(tmckay): this should probably be "get_namenode"
    # but that call does not exist in the oozie engine api now.
    oozie_server = self.get_oozie_server(self.cluster)

    wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
    self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                   proxy_configs)

    wf_xml = workflow_factory.get_workflow_xml(
        job, self.cluster, updated_job_configs, input_source, output_source,
        hdfs_user, data_source_urls)

    path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                  wf_xml, hdfs_user)

    prepared_job_params['context'] = ctx
    prepared_job_params['hdfs_user'] = hdfs_user
    prepared_job_params['path_to_workflow'] = path_to_workflow
    prepared_job_params['use_hbase_lib'] = use_hbase_lib
    prepared_job_params['job_execution'] = job_execution
    prepared_job_params['oozie_params'] = oozie_params
    prepared_job_params['wf_dir'] = wf_dir
    prepared_job_params['oozie_server'] = oozie_server

    return prepared_job_params
def run_job(self, job_execution):
    """Submit an Oozie workflow for the given job execution.

    Resolves native and runtime data source urls, configures the cluster
    for HDFS sources (including external HDFS urls from the configs),
    uploads job files and workflow XML, marks the execution READYTORUN
    with its engine job id, then starts the job.

    :param job_execution: the job execution object to run
    :returns: a (oozie_job_id, status, None) tuple; (None,
              edp.JOB_STATUS_KILLED, None) if the execution was marked
              to-be-killed before the job was started
    """
    ctx = context.ctx()

    # This will be a dictionary of tuples, (native_url, runtime_url)
    # keyed by data_source id
    data_source_urls = {}

    job = conductor.job_get(ctx, job_execution.job_id)
    input_source, output_source = job_utils.get_data_sources(
        job_execution, job, data_source_urls, self.cluster)

    # Updated_job_configs will be a copy of job_execution.job_configs with
    # any name or uuid references to data_sources resolved to paths
    # assuming substitution is enabled.
    # If substitution is not enabled then updated_job_configs will
    # just be a reference to job_execution.job_configs to avoid a copy.
    # Additional_sources will be a list of any data_sources found.
    additional_sources, updated_job_configs = (
        job_utils.resolve_data_source_references(job_execution.job_configs,
                                                 job_execution.id,
                                                 data_source_urls,
                                                 self.cluster)
    )

    job_execution = conductor.job_execution_update(
        ctx, job_execution,
        {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

    # Now that we've recorded the native urls, we can switch to the
    # runtime urls
    data_source_urls = job_utils.to_url_dict(data_source_urls,
                                             runtime=True)

    proxy_configs = updated_job_configs.get('proxy_configs')
    configs = updated_job_configs.get('configs', {})
    use_hbase_lib = configs.get('edp.hbase_common_lib', {})

    # Extract all the 'oozie.' configs so that they can be set in the
    # job properties file. These are config values for Oozie itself,
    # not the job code
    oozie_params = {}
    for k in list(configs):
        if k.startswith('oozie.'):
            oozie_params[k] = configs[k]

    # One HDFS configuration pass covers the cluster, so stop at the
    # first HDFS data source found.
    for data_source in [input_source, output_source] + additional_sources:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(
                self.cluster, data_source_urls[data_source.id])
            break

    # Also configure the cluster for any external HDFS urls referenced
    # directly in the job configs.
    external_hdfs_urls = self._resolve_external_hdfs_urls(
        job_execution.job_configs)
    for url in external_hdfs_urls:
        h.configure_cluster_for_hdfs(self.cluster, url)

    hdfs_user = self.get_hdfs_user()

    # TODO(tmckay): this should probably be "get_namenode"
    # but that call does not exist in the oozie engine api now.
    oozie_server = self.get_oozie_server(self.cluster)

    wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
    self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                   proxy_configs)

    wf_xml = workflow_factory.get_workflow_xml(
        job, self.cluster, updated_job_configs, input_source, output_source,
        hdfs_user, data_source_urls)

    path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                  wf_xml, hdfs_user)

    job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow,
                                            oozie_params, use_hbase_lib)

    client = self._get_client()
    oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                  job_execution)

    # Re-read the execution: a kill request may have arrived during
    # preparation, in which case the job is never started.
    job_execution = conductor.job_execution_get(ctx, job_execution.id)
    if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
        return (None, edp.JOB_STATUS_KILLED, None)

    # Record the engine job id before starting so a concurrent kill can
    # find the Oozie job.
    conductor.job_execution_update(
        context.ctx(), job_execution.id,
        {'info': {'status': edp.JOB_STATUS_READYTORUN},
         'engine_job_id': oozie_job_id})

    client.run_job(job_execution, oozie_job_id)
    try:
        status = client.get_job_info(job_execution,
                                     oozie_job_id)['status']
    except Exception:
        # Best-effort status read; submission already succeeded.
        status = None
    return (oozie_job_id, status, None)
def run_job(self, job_execution):
    """Submit an Oozie workflow for the given job execution.

    Resolves native and runtime data source urls, configures the cluster
    for HDFS sources (including external HDFS urls from the configs),
    uploads job files and workflow XML, marks the execution READYTORUN
    with its engine job id, then starts the job.

    :param job_execution: the job execution object to run
    :returns: a (oozie_job_id, status, None) tuple; (None,
              edp.JOB_STATUS_KILLED, None) if the execution was marked
              to-be-killed before the job was started
    """
    ctx = context.ctx()

    # This will be a dictionary of tuples, (native_url, runtime_url)
    # keyed by data_source id
    data_source_urls = {}

    job = conductor.job_get(ctx, job_execution.job_id)
    input_source, output_source = job_utils.get_data_sources(
        job_execution, job, data_source_urls, self.cluster)

    # Updated_job_configs will be a copy of job_execution.job_configs with
    # any name or uuid references to data_sources resolved to paths
    # assuming substitution is enabled.
    # If substitution is not enabled then updated_job_configs will
    # just be a reference to job_execution.job_configs to avoid a copy.
    # Additional_sources will be a list of any data_sources found.
    additional_sources, updated_job_configs = (
        job_utils.resolve_data_source_references(job_execution.job_configs,
                                                 job_execution.id,
                                                 data_source_urls,
                                                 self.cluster)
    )

    job_execution = conductor.job_execution_update(
        ctx, job_execution,
        {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

    # Now that we've recorded the native urls, we can switch to the
    # runtime urls
    data_source_urls = job_utils.to_url_dict(data_source_urls,
                                             runtime=True)

    proxy_configs = updated_job_configs.get('proxy_configs')
    configs = updated_job_configs.get('configs', {})
    use_hbase_lib = configs.get('edp.hbase_common_lib', {})

    # Extract all the 'oozie.' configs so that they can be set in the
    # job properties file. These are config values for Oozie itself,
    # not the job code
    oozie_params = {}
    for k in list(configs):
        if k.startswith('oozie.'):
            oozie_params[k] = configs[k]

    # One HDFS configuration pass covers the cluster, so stop at the
    # first HDFS data source found.
    for data_source in [input_source, output_source] + additional_sources:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(
                self.cluster, data_source_urls[data_source.id])
            break

    # Also configure the cluster for any external HDFS urls referenced
    # directly in the job configs.
    external_hdfs_urls = self._resolve_external_hdfs_urls(
        job_execution.job_configs)
    for url in external_hdfs_urls:
        h.configure_cluster_for_hdfs(self.cluster, url)

    hdfs_user = self.get_hdfs_user()

    # TODO(tmckay): this should probably be "get_namenode"
    # but that call does not exist in the oozie engine api now.
    oozie_server = self.get_oozie_server(self.cluster)

    wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
    self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                   proxy_configs)

    wf_xml = workflow_factory.get_workflow_xml(
        job, self.cluster, updated_job_configs, input_source, output_source,
        hdfs_user, data_source_urls)

    path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                  wf_xml, hdfs_user)

    job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow,
                                            oozie_params, use_hbase_lib)

    client = self._get_client()
    oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                  job_execution)

    # Re-read the execution: a kill request may have arrived during
    # preparation, in which case the job is never started.
    job_execution = conductor.job_execution_get(ctx, job_execution.id)
    if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
        return (None, edp.JOB_STATUS_KILLED, None)

    # Record the engine job id before starting so a concurrent kill can
    # find the Oozie job.
    conductor.job_execution_update(
        context.ctx(), job_execution.id,
        {'info': {'status': edp.JOB_STATUS_READYTORUN},
         'engine_job_id': oozie_job_id})

    client.run_job(job_execution, oozie_job_id)
    try:
        status = client.get_job_status(job_execution,
                                       oozie_job_id)['status']
    except Exception:
        # Best-effort status read; submission already succeeded.
        status = None
    return (oozie_job_id, status, None)