def test_build_workflow_for_job_java(self):
    # If args include swift paths, user and password values
    # will have to be supplied via configs instead of being
    # lifted from input or output data sources
    configs = {workflow_factory.swift_username: 'admin',
               workflow_factory.swift_password: 'admin1'}

    configs = {
        'configs': configs,
        'args': ['input_path', 'output_path']
    }

    job, job_exec = _create_all_stack('Java', configs)
    creator = workflow_factory.get_creator(job)
    res = creator.get_workflow_xml(job_exec)

    self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>
      <main-class>%s</main-class>
      <java-opts>%s</java-opts>
      <arg>input_path</arg>
      <arg>output_path</arg>""" % (_java_main_class, _java_opts), res)
def _build_workflow_with_conf_common(self, job_type):
    job, _ = _create_all_stack(job_type)

    input_data = _create_data_source('swift://ex.savanna/i')
    output_data = _create_data_source('swift://ex.savanna/o')

    job_exec = _create_job_exec(job.id, job_type,
                                configs={"configs": {'c': 'f'}})

    creator = workflow_factory.get_creator(job)

    res = creator.get_workflow_xml(job_exec,
                                   input_data, output_data)

    self.assertIn("""
      <property>
        <name>c</name>
        <value>f</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>mapred.input.dir</name>
        <value>swift://ex.savanna/i</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>mapred.output.dir</name>
        <value>swift://ex.savanna/o</value>
      </property>""", res)
def test_build_workflow_for_job_pig(self, job_binary):
    job, job_exec = _create_all_stack('Pig')
    job_binary.return_value = {"name": "script.pig"}

    input_data = _create_data_source('swift://ex.savanna/i')
    output_data = _create_data_source('swift://ex.savanna/o')

    creator = workflow_factory.get_creator(job)

    res = creator.get_workflow_xml(job_exec,
                                   input_data, output_data)

    self.assertIn("""
      <param>INPUT=swift://ex.savanna/i</param>
      <param>OUTPUT=swift://ex.savanna/o</param>""", res)

    self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

    self.assertIn("<script>script.pig</script>", res)
def test_build_workflow_for_job_jar_with_conf(self):
    job, _ = _create_all_stack('Jar')

    input_data = _create_data_source('swift://ex.savanna/i')
    output_data = _create_data_source('swift://ex.savanna/o')

    job_exec = _create_job_exec(job.id, configs={"configs": {'c': 'f'}})
    creator = workflow_factory.get_creator(job)

    res = creator.get_workflow_xml(job_exec.job_configs,
                                   input_data, output_data)

    self.assertIn("""
      <property>
        <name>c</name>
        <value>f</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>mapred.input.dir</name>
        <value>swift://ex.savanna/i</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>mapred.output.dir</name>
        <value>swift://ex.savanna/o</value>
      </property>""", res)
def test_build_workflow_for_job_hive(self, job_binary):
    job, origin = _create_all_stack("Hive")
    job_exec = _create_job_exec(job.id)
    job_binary.return_value = {"name": "script.q"}

    input_data = _create_data_source("swift://ex.savanna/i")
    output_data = _create_data_source("swift://ex.savanna/o")

    creator = workflow_factory.get_creator("Hive", origin)

    res = creator.get_workflow_xml(job_exec.job_configs,
                                   input_data, output_data)

    self.assertIn("""
      <job-xml>hive-site.xml</job-xml>
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>
      <script>script.q</script>
      <param>INPUT=swift://ex.savanna/i</param>
      <param>OUTPUT=swift://ex.savanna/o</param>""", res)
def _build_workflow_common(self, job_type):
    job, job_exec = _create_all_stack(job_type)

    input_data = _create_data_source('swift://ex.savanna/i')
    output_data = _create_data_source('swift://ex.savanna/o')

    creator = workflow_factory.get_creator(job)

    res = creator.get_workflow_xml(job_exec,
                                   input_data, output_data)

    self.assertIn("""
      <property>
        <name>mapred.output.dir</name>
        <value>swift://ex.savanna/o</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>mapred.input.dir</name>
        <value>swift://ex.savanna/i</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>fs.swift.service.savanna.password</name>
        <value>admin1</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>fs.swift.service.savanna.username</name>
        <value>admin</value>
      </property>""", res)
def test_build_workflow_for_job_pig(self, job_binary):
    job, job_exec = _create_all_stack('Pig')
    job_binary.return_value = {"name": "script.pig"}

    input_data = _create_data_source('swift://ex.savanna/i')
    output_data = _create_data_source('swift://ex.savanna/o')

    creator = workflow_factory.get_creator(job)

    res = creator.get_workflow_xml(job_exec.job_configs,
                                   input_data, output_data)

    self.assertIn("""
      <param>INPUT=swift://ex.savanna/i</param>
      <param>OUTPUT=swift://ex.savanna/o</param>""", res)

    self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

    self.assertIn("<script>script.pig</script>", res)
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    input_source = conductor.data_source_get(ctx, job_execution.input_id)
    output_source = conductor.data_source_get(ctx, job_execution.output_id)
    #TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    hdfs_user = plugin.get_hdfs_user()
    wf_dir = create_workflow_dir(u.get_jobtracker(cluster), job, hdfs_user)
    upload_job_files(u.get_jobtracker(cluster), wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    # Do other job type specific setup here, for example
    # uploading hive configuration
    creator.configure_workflow_if_needed(cluster, wf_dir)

    wf_xml = creator.get_workflow_xml(job_execution.job_configs,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(u.get_jobtracker(cluster),
                                            wf_dir, wf_xml, hdfs_user)

    jt_path = cluster['info']['MapReduce']['JobTracker']
    nn_path = cluster['info']['HDFS']['NameNode']

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/")
    job_parameters = {"jobTracker": jt_path,
                      "nameNode": nn_path,
                      "user.name": hdfs_user,
                      "oozie.wf.application.path":
                      "%s%s" % (nn_path, path_to_workflow),
                      "oozie.use.system.libpath": "true"}

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters))
    client.run_job(oozie_job_id)
    job_execution = conductor.job_execution_update(
        ctx, job_execution,
        {'oozie_job_id': oozie_job_id,
         'start_time': datetime.datetime.now()})

    return job_execution
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    input_source = conductor.data_source_get(ctx, job_execution.input_id)
    output_source = conductor.data_source_get(ctx, job_execution.output_id)
    #TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    hdfs_user = plugin.get_hdfs_user()
    wf_dir = create_workflow_dir(u.get_jobtracker(cluster), job, hdfs_user)
    upload_job_files(u.get_jobtracker(cluster), wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    # Do other job type specific setup here, for example
    # uploading hive configuration
    creator.configure_workflow_if_needed(cluster, wf_dir)

    wf_xml = creator.get_workflow_xml(job_execution.job_configs,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(u.get_jobtracker(cluster),
                                            wf_dir, wf_xml, hdfs_user)

    jt_path = '%s:8021' % u.get_jobtracker(cluster).hostname
    nn_path = 'hdfs://%s:8020' % u.get_namenode(cluster).hostname

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/")
    job_parameters = {
        "jobTracker": jt_path,
        "nameNode": nn_path,
        "user.name": "hadoop",
        "oozie.wf.application.path": "%s%s" % (nn_path, path_to_workflow),
        "oozie.use.system.libpath": "true"
    }

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters))
    client.run_job(oozie_job_id)
    job_execution = conductor.job_execution_update(
        ctx, job_execution,
        {
            'oozie_job_id': oozie_job_id,
            'start_time': datetime.datetime.now()
        })

    return job_execution
def _build_workflow_common(self, job_type, streaming=False):
    if streaming:
        configs = {'edp.streaming.mapper': '/usr/bin/cat',
                   'edp.streaming.reducer': '/usr/bin/wc'}
        configs = {'configs': configs}
    else:
        configs = {}

    job, job_exec = _create_all_stack(job_type, configs)

    input_data = _create_data_source('swift://ex.savanna/i')
    output_data = _create_data_source('swift://ex.savanna/o')

    creator = workflow_factory.get_creator(job)

    res = creator.get_workflow_xml(job_exec,
                                   input_data, output_data)

    if streaming:
        self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

    self.assertIn("""
      <property>
        <name>mapred.output.dir</name>
        <value>swift://ex.savanna/o</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>mapred.input.dir</name>
        <value>swift://ex.savanna/i</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>fs.swift.service.savanna.password</name>
        <value>admin1</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>fs.swift.service.savanna.username</name>
        <value>admin</value>
      </property>""", res)
def test_build_workflow_for_job_jar(self):
    job, origin = _create_all_stack("Jar")
    job_exec = _create_job_exec(job.id)

    input_data = _create_data_source("swift://ex.savanna/i")
    output_data = _create_data_source("swift://ex.savanna/o")

    creator = workflow_factory.get_creator("Jar", origin)

    res = creator.get_workflow_xml(job_exec.job_configs,
                                   input_data, output_data)

    self.assertIn("""
      <property>
        <name>mapred.output.dir</name>
        <value>swift://ex.savanna/o</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>mapred.input.dir</name>
        <value>swift://ex.savanna/i</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>fs.swift.service.savanna.password</name>
        <value>admin1</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>fs.swift.service.savanna.username</name>
        <value>admin</value>
      </property>""", res)
def run_job(ctx, job_execution):
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != "Active":
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    job_origin = conductor.job_origin_get(context.ctx(), job.job_origin_id)
    input_source = conductor.data_source_get(ctx, job_execution.input_id)
    output_source = conductor.data_source_get(ctx, job_execution.output_id)
    # TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    wf_dir = create_workflow_dir(u.get_jobtracker(cluster), job)
    upload_job_files(u.get_jobtracker(cluster), wf_dir, job_origin)

    creator = workflow_factory.get_creator(job.type, job_origin)

    # Do other job type specific setup here, for example
    # uploading hive configuration
    creator.configure_workflow_if_needed(cluster, wf_dir)

    wf_xml = creator.get_workflow_xml(job_execution.job_configs,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(u.get_jobtracker(cluster),
                                            wf_dir, wf_xml)

    jt_path = "%s:8021" % u.get_jobtracker(cluster).hostname
    nn_path = "hdfs://%s:8020" % u.get_namenode(cluster).hostname

    client = o.OozieClient(cluster["info"]["JobFlow"]["Oozie"] + "/oozie/")
    job_parameters = {
        "jobTracker": jt_path,
        "nameNode": nn_path,
        "user.name": "hadoop",
        "oozie.wf.application.path": "%s%s" % (nn_path, path_to_workflow),
        "oozie.use.system.libpath": "true",
    }

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters))
    client.run_job(oozie_job_id)
    job_execution = conductor.job_execution_update(
        ctx, job_execution,
        {"oozie_job_id": oozie_job_id,
         "start_time": datetime.datetime.now()})

    return job_execution
def test_build_workflow_for_job_jar(self):
    job, job_exec = _create_all_stack('Jar')

    input_data = _create_data_source('swift://ex.savanna/i')
    output_data = _create_data_source('swift://ex.savanna/o')

    creator = workflow_factory.get_creator(job)

    res = creator.get_workflow_xml(job_exec.job_configs,
                                   input_data, output_data)

    self.assertIn("""
      <property>
        <name>mapred.output.dir</name>
        <value>swift://ex.savanna/o</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>mapred.input.dir</name>
        <value>swift://ex.savanna/i</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>fs.swift.service.savanna.password</name>
        <value>admin1</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>fs.swift.service.savanna.username</name>
        <value>admin</value>
      </property>""", res)
def test_build_workflow_for_job_jar_with_conf(self):
    job, origin = _create_all_stack("Jar")

    input_data = _create_data_source("swift://ex.savanna/i")
    output_data = _create_data_source("swift://ex.savanna/o")

    job_exec = _create_job_exec(job.id, configs={"configs": {"c": "f"}})
    creator = workflow_factory.get_creator("Jar", origin)

    res = creator.get_workflow_xml(job_exec.job_configs,
                                   input_data, output_data)

    self.assertIn("""
      <property>
        <name>c</name>
        <value>f</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>mapred.input.dir</name>
        <value>swift://ex.savanna/i</value>
      </property>""", res)

    self.assertIn("""
      <property>
        <name>mapred.output.dir</name>
        <value>swift://ex.savanna/o</value>
      </property>""", res)
def test_build_workflow_swift_configs(self, job_binary):
    # Test that swift configs come from either input or output data sources
    job, job_exec = _create_all_stack('Pig')
    job_binary.return_value = {"name": "script.pig"}

    input_data = _create_data_source('swift://ex.savanna/i')
    output_data = _create_data_source('hdfs://user/hadoop/out')

    creator = workflow_factory.get_creator(job)
    res = creator.get_workflow_xml(job_exec, input_data, output_data)

    self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

    input_data = _create_data_source('hdfs://user/hadoop/in')
    output_data = _create_data_source('swift://ex.savanna/o')

    creator = workflow_factory.get_creator(job)
    res = creator.get_workflow_xml(job_exec, input_data, output_data)

    self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

    job, job_exec = _create_all_stack(
        'Pig', configs={'configs': {'dummy': 'value'}})
    input_data = _create_data_source('hdfs://user/hadoop/in')
    output_data = _create_data_source('hdfs://user/hadoop/out')

    creator = workflow_factory.get_creator(job)
    res = creator.get_workflow_xml(job_exec, input_data, output_data)

    self.assertIn("""
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""", res)
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, 'Java'):
        input_source = conductor.data_source_get(ctx, job_execution.input_id)
        output_source = conductor.data_source_get(ctx,
                                                  job_execution.output_id)
    else:
        input_source = None
        output_source = None
    #TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    # Do other job type specific setup here, for example
    # uploading hive configuration
    creator.configure_workflow_if_needed(cluster, wf_dir)

    wf_xml = creator.get_workflow_xml(job_execution,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server,
                                            wf_dir, wf_xml, hdfs_user)

    rm_path = _get_resource_manager_path(cluster)
    nn_path = cluster['info']['HDFS']['NameNode']

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/",
                           _get_oozie_server(cluster))
    job_parameters = {
        "jobTracker": rm_path,
        "nameNode": nn_path,
        "user.name": hdfs_user,
        "oozie.wf.application.path": "%s%s" % (nn_path, path_to_workflow),
        "oozie.use.system.libpath": "true"
    }

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(
        ctx, job_execution,
        {
            'oozie_job_id': oozie_job_id,
            'start_time': datetime.datetime.now()
        })
    client.run_job(job_execution, oozie_job_id)

    return job_execution
def test_jar_creator_is_mapreduce(self):
    # Ensure that we get the MapReduce workflow factory for 'Jar' jobs
    job, _ = _create_all_stack('Jar')
    creator = workflow_factory.get_creator(job)
    self.assertEqual(type(creator), workflow_factory.MapReduceFactory)