class Submission(object):
    """
    Represents one unique Oozie submission.

    Actions are:
    - submit
    - rerun
    """

    def __init__(self, user, job=None, fs=None, jt=None, properties=None, oozie_id=None, local_tz=None):
        """
        Store the submission context and normalise the submission properties.

        `properties` is kept by reference when given (it is mutated in place),
        otherwise a fresh dict is used. When `local_tz` is truthy and the job
        carries a dict `data`, the timezone is taken from the job's own
        properties instead. For Coordinator jobs, the 'start_date', 'end_date'
        and 'nominal_time' properties are passed through
        `convert_to_server_timezone`. The 'security_enabled' property is always
        set from the Oozie API object.
        """
        self.job = job
        self.user = user
        self.fs = fs
        self.jt = jt  # Deprecated with YARN, we now use logical names only for RM
        self.oozie_id = oozie_id
        self.api = get_oozie(self.user)

        if properties is not None:
            self.properties = properties
        else:
            self.properties = {}

        # getattr() keeps a submission without a job (job=None) from failing here.
        if local_tz and isinstance(getattr(self.job, 'data', None), dict):
            local_tz = self.job.data.get('properties')['timezone']

        # Modify start_date & end_date only when it's a coordinator
        from oozie.models2 import Coordinator
        if type(self.job) is Coordinator:
            for date_key in ('start_date', 'end_date', 'nominal_time'):
                if date_key in self.properties:
                    self.properties[date_key] = convert_to_server_timezone(self.properties[date_key], local_tz)

        self.properties['security_enabled'] = self.api.security_enabled

    def __str__(self):
        """Return a short description used in the log messages of this class."""
        if self.oozie_id:
            res = "Submission for job '%s'." % (self.oozie_id,)
        else:
            res = "Submission for job '%s' (id %s, owner %s)." % (self.job.name, self.job.id, self.user)
        if self.oozie_id:
            res += " -- " + self.oozie_id
        return res

    @submit_dryrun
    def run(self, deployment_dir=None):
        """
        Take care of all the actions of submitting a Oozie workflow.
        Returns the oozie job id if all goes well.

        `deployment_dir` is not read by this body; it is part of the signature
        seen by the `submit_dryrun` decorator and by callers.
        """
        # Default to the system libpath only when there are properties to extend.
        if self.properties and 'oozie.use.system.libpath' not in self.properties:
            self.properties['oozie.use.system.libpath'] = 'true'

        self.oozie_id = self.api.submit_job(self.properties)
        LOG.info("Submitted: %s" % (self,))

        if self._is_workflow():
            self.api.job_control(self.oozie_id, 'start')
            LOG.info("Started: %s" % (self,))

        return self.oozie_id

    def rerun(self, deployment_dir, fail_nodes=None, skip_nodes=None):
        """
        Rerun the workflow identified by `self.oozie_id` from `deployment_dir`.

        Exactly one rerun option is set: 'oozie.wf.rerun.failnodes' when
        `fail_nodes` is truthy, 'oozie.wf.rerun.failnodes' = 'false' when both
        arguments are empty, otherwise 'oozie.wf.rerun.skip.nodes'.
        Returns the oozie job id.
        """
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.wf.application.path': deployment_dir})

        # A workflow rerun must not carry coordinator or bundle application paths.
        self.properties.pop('oozie.coord.application.path', None)
        self.properties.pop('oozie.bundle.application.path', None)

        if fail_nodes:
            self.properties.update({'oozie.wf.rerun.failnodes': fail_nodes})
        elif not skip_nodes:
            self.properties.update({'oozie.wf.rerun.failnodes': 'false'})  # Case empty 'skip_nodes' list
        else:
            self.properties.update({'oozie.wf.rerun.skip.nodes': skip_nodes})

        self.api.rerun(self.oozie_id, properties=self.properties)

        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def rerun_coord(self, deployment_dir, params):
        """Send a 'coord-rerun' job control for `self.oozie_id`; returns the oozie job id."""
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.coord.application.path': deployment_dir})

        self.api.job_control(self.oozie_id, action='coord-rerun', properties=self.properties, parameters=params)
        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def update_coord(self):
        """Send an 'update' job control through the v2 Oozie API; returns the oozie job id."""
        # Replaces self.api with a client created for api_version "v2".
        self.api = get_oozie(self.user, api_version="v2")
        self.api.job_control(self.oozie_id, action='update', properties=self.properties, parameters=None)
        LOG.info("Update: %s" % (self,))

        return self.oozie_id

    def rerun_bundle(self, deployment_dir, params):
        """Send a 'bundle-rerun' job control for `self.oozie_id`; returns the oozie job id."""
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.bundle.application.path': deployment_dir})
        self.api.job_control(self.oozie_id, action='bundle-rerun', properties=self.properties, parameters=params)
        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def deploy(self, deployment_dir=None):
        """
        Write the job definition and the per-action helper files to the workspace.

        A deployment directory is created when none is given. For each node of
        the job (when the job has `nodes`), the files the action needs are
        generated: sub-workflows are deployed recursively, impala actions get a
        launcher shell script, document-based actions get their statements
        written out, and java/mapreduce jars are prepended to 'oozie.libpath'.
        Finally the job XML is generated and copied as the submitting user.

        Returns the deployment directory.
        Raises PopupException when the deployment directory cannot be created.
        """
        try:
            if not deployment_dir:
                deployment_dir = self._create_deployment_dir()
        except Exception as ex:
            # Interpolate after the translation lookup so the constant template is what gets translated.
            msg = _("Failed to create deployment directory: %s") % ex
            LOG.exception(msg)
            raise PopupException(message=msg, detail=str(ex))

        if self.api.security_enabled:
            jt_address = cluster.get_cluster_addr_for_job_submission()
            self._update_properties(jt_address)  # Needed for coordinator deploying workflows with credentials

        if hasattr(self.job, 'nodes'):
            for action in self.job.nodes:
                action_type = action.data['type']

                # Make sure XML is there
                # Don't support more than one level sub-workflow
                if action_type == 'subworkflow':
                    from oozie.models2 import Workflow
                    workflow = Workflow(document=Document2.objects.get_by_uuid(
                        user=self.user, uuid=action.data['properties']['workflow']))
                    sub_deploy = Submission(self.user, workflow, self.fs, self.jt, self.properties)
                    workspace = sub_deploy.deploy()

                    self.job.override_subworkflow_id(action, workflow.id)  # For displaying the correct graph
                    self.properties['workspace_%s' % workflow.uuid] = workspace  # For pointing to the correct workspace

                elif action_type in ('impala', 'impala-document'):
                    from oozie.models2 import _get_impala_url
                    from impala.impala_flags import get_ssl_server_certificate

                    if action_type == 'impala-document':
                        from notebook.models import Notebook
                        # NOTE(review): an 'impala-document' without a uuid leaves script_name unset
                        # for the shell script below -- confirm such actions cannot occur.
                        if action.data['properties'].get('uuid'):
                            notebook = Notebook(document=Document2.objects.get_by_uuid(
                                user=self.user, uuid=action.data['properties']['uuid']))
                            statements = notebook.get_str()
                            statements = Template(statements).safe_substitute(**self.properties)
                            script_name = action.data['name'] + '.sql'
                            self._create_file(deployment_dir, script_name, statements)
                    else:
                        script_name = os.path.basename(action.data['properties'].get('script_path'))

                    if self.api.security_enabled:
                        kinit = 'kinit -k -t *.keytab %(user_principal)s' % {
                            'user_principal': self.properties.get(
                                'user_principal', action.data['properties'].get('user_principal'))
                        }
                    else:
                        kinit = ''

                    shell_script = """#!/bin/bash

# Needed to launch impala shell in oozie
export PYTHON_EGG_CACHE=./myeggs

%(kinit)s

impala-shell %(kerberos_option)s %(ssl_option)s -i %(impalad_host)s -f %(query_file)s""" % {
                        'impalad_host': action.data['properties'].get('impalad_host') or _get_impala_url(),
                        'kerberos_option': '-k' if self.api.security_enabled else '',
                        'ssl_option': '--ssl' if get_ssl_server_certificate() else '',
                        'query_file': script_name,
                        'kinit': kinit
                    }

                    self._create_file(deployment_dir, action.data['name'] + '.sh', shell_script)

                elif action_type == 'hive-document':
                    from notebook.models import Notebook
                    # Reset per action so a notebook from a previous node is never reused.
                    notebook = None
                    if action.data['properties'].get('uuid'):
                        notebook = Notebook(document=Document2.objects.get_by_uuid(
                            user=self.user, uuid=action.data['properties']['uuid']))
                        statements = notebook.get_str()
                    else:
                        statements = action.data['properties'].get('statements')

                    if self.properties.get('send_result_path'):
                        if notebook is not None:
                            query = '\n\n\n'.join(
                                [snippet['statement_raw'] for snippet in notebook.get_data()['snippets']])
                        else:
                            # Inline statements: there is no notebook to read snippets from.
                            query = statements
                        if query is not None:
                            statements = """
INSERT OVERWRITE DIRECTORY '%s'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   "separatorChar" = "\t",
   "quoteChar"     = "'",
   "escapeChar"    = "\\"
)
STORED AS TEXTFILE
%s""" % (self.properties.get('send_result_path'), query)

                    if statements is not None:
                        self._create_file(deployment_dir, action.data['name'] + '.sql', statements)

                elif action_type in ('java-document', 'java', 'mapreduce-document'):
                    if action_type in ('java-document', 'mapreduce-document'):
                        from notebook.models import Notebook
                        notebook = Notebook(document=Document2.objects.get_by_uuid(
                            user=self.user, uuid=action.data['properties']['uuid']))
                        jar_properties = notebook.get_data()['snippets'][0]['properties']
                    else:
                        jar_properties = action.data['properties']

                    if jar_properties.get('app_jar'):
                        LOG.debug("Adding to oozie.libpath %s" % jar_properties['app_jar'])
                        # The application jar goes first, ahead of any existing libpath.
                        paths = [jar_properties['app_jar']]
                        if self.properties.get('oozie.libpath'):
                            paths.append(self.properties['oozie.libpath'])
                        self.properties['oozie.libpath'] = ','.join(paths)

                elif action_type == 'pig-document':
                    from notebook.models import Notebook
                    notebook = Notebook(document=Document2.objects.get_by_uuid(
                        user=self.user, uuid=action.data['properties']['uuid']))
                    statements = notebook.get_data()['snippets'][0]['statement_raw']

                    self._create_file(deployment_dir, action.data['name'] + '.pig', statements)

        oozie_xml = self.job.to_xml(self.properties)
        self._do_as(self.user.username, self._copy_files, deployment_dir, oozie_xml, self.properties)

        return deployment_dir
class Submission(object):
    """
    Represents one unique Oozie submission.

    Actions are:
    - submit
    - rerun
    """

    def __init__(self, user, job=None, fs=None, jt=None, properties=None, oozie_id=None, local_tz=None):
        """
        Store the submission context and normalise the submission properties.

        `properties` is kept by reference when given (it is mutated in place),
        otherwise a fresh dict is used. When `local_tz` is truthy and the job
        carries a dict `data`, the timezone is taken from the job's own
        properties instead. For Coordinator jobs, the 'start_date', 'end_date'
        and 'nominal_time' properties are passed through
        `convert_to_server_timezone`. The 'security_enabled' property is always
        set from the Oozie API object.
        """
        self.job = job
        self.user = user
        self.fs = fs
        self.jt = jt  # Deprecated with YARN, we now use logical names only for RM
        self.oozie_id = oozie_id
        self.api = get_oozie(self.user)

        if properties is not None:
            self.properties = properties
        else:
            self.properties = {}

        # getattr() keeps a submission without a job (job=None) from failing here.
        if local_tz and isinstance(getattr(self.job, 'data', None), dict):
            local_tz = self.job.data.get('properties')['timezone']

        # Modify start_date & end_date only when it's a coordinator
        from oozie.models2 import Coordinator
        if type(self.job) is Coordinator:
            for date_key in ('start_date', 'end_date', 'nominal_time'):
                if date_key in self.properties:
                    self.properties[date_key] = convert_to_server_timezone(self.properties[date_key], local_tz)

        self.properties['security_enabled'] = self.api.security_enabled

    def __str__(self):
        """Return a short description used in the log messages of this class."""
        if self.oozie_id:
            res = "Submission for job '%s'." % (self.oozie_id,)
        else:
            res = "Submission for job '%s' (id %s, owner %s)." % (self.job.name, self.job.id, self.user)
        if self.oozie_id:
            res += " -- " + self.oozie_id
        return res

    @submit_dryrun
    def run(self, deployment_dir=None):
        """
        Take care of all the actions of submitting a Oozie workflow.
        Returns the oozie job id if all goes well.

        `deployment_dir` is not read by this body; it is part of the signature
        seen by the `submit_dryrun` decorator and by callers.
        """
        # Default to the system libpath only when there are properties to extend.
        if self.properties and 'oozie.use.system.libpath' not in self.properties:
            self.properties['oozie.use.system.libpath'] = 'true'

        self.oozie_id = self.api.submit_job(self.properties)
        LOG.info("Submitted: %s" % (self,))

        if self._is_workflow():
            self.api.job_control(self.oozie_id, 'start')
            LOG.info("Started: %s" % (self,))

        return self.oozie_id

    def rerun(self, deployment_dir, fail_nodes=None, skip_nodes=None):
        """
        Rerun the workflow identified by `self.oozie_id` from `deployment_dir`.

        Exactly one rerun option is set: 'oozie.wf.rerun.failnodes' when
        `fail_nodes` is truthy, 'oozie.wf.rerun.failnodes' = 'false' when both
        arguments are empty, otherwise 'oozie.wf.rerun.skip.nodes'.
        Returns the oozie job id.
        """
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.wf.application.path': deployment_dir})

        # A workflow rerun must not carry coordinator or bundle application paths.
        self.properties.pop('oozie.coord.application.path', None)
        self.properties.pop('oozie.bundle.application.path', None)

        if fail_nodes:
            self.properties.update({'oozie.wf.rerun.failnodes': fail_nodes})
        elif not skip_nodes:
            self.properties.update({'oozie.wf.rerun.failnodes': 'false'})  # Case empty 'skip_nodes' list
        else:
            self.properties.update({'oozie.wf.rerun.skip.nodes': skip_nodes})

        self.api.rerun(self.oozie_id, properties=self.properties)

        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def rerun_coord(self, deployment_dir, params):
        """Send a 'coord-rerun' job control for `self.oozie_id`; returns the oozie job id."""
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.coord.application.path': deployment_dir})

        self.api.job_control(self.oozie_id, action='coord-rerun', properties=self.properties, parameters=params)
        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def update_coord(self):
        """Send an 'update' job control through the v2 Oozie API; returns the oozie job id."""
        # Replaces self.api with a client created for api_version "v2".
        self.api = get_oozie(self.user, api_version="v2")
        self.api.job_control(self.oozie_id, action='update', properties=self.properties, parameters=None)
        LOG.info("Update: %s" % (self,))

        return self.oozie_id

    def rerun_bundle(self, deployment_dir, params):
        """Send a 'bundle-rerun' job control for `self.oozie_id`; returns the oozie job id."""
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.bundle.application.path': deployment_dir})
        self.api.job_control(self.oozie_id, action='bundle-rerun', properties=self.properties, parameters=params)
        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def deploy(self, deployment_dir=None):
        """
        Write the job definition and the per-action helper files to the workspace.

        A deployment directory is created when none is given. For each node of
        the job (when the job has `nodes`), the files the action needs are
        generated: sub-workflows are deployed recursively, altus and impala
        actions get launcher scripts, document-based actions get their
        statements written out, java/mapreduce jars and the configured JDBC
        libs are prepended to 'oozie.libpath', and spark / hive-importing sqoop
        actions get a hive-site.xml in the workspace 'lib' directory when they
        do not ship one. Finally the job XML is generated and copied as the
        submitting user.

        Returns the deployment directory.
        Raises PopupException when the deployment directory cannot be created.
        """
        try:
            if not deployment_dir:
                deployment_dir = self._create_deployment_dir()
        except Exception as ex:
            # Interpolate after the translation lookup so the constant template is what gets translated.
            msg = _("Failed to create deployment directory: %s") % ex
            LOG.exception(msg)
            raise PopupException(message=msg, detail=str(ex))

        if self.api.security_enabled:
            jt_address = cluster.get_cluster_addr_for_job_submission()
            self._update_properties(jt_address)  # Needed for coordinator deploying workflows with credentials

        if hasattr(self.job, 'nodes'):
            for action in self.job.nodes:
                action_type = action.data['type']

                # Make sure XML is there
                # Don't support more than one level sub-workflow
                if action_type == 'subworkflow':
                    from oozie.models2 import Workflow
                    workflow = Workflow(document=Document2.objects.get_by_uuid(
                        user=self.user, uuid=action.data['properties']['workflow']))
                    sub_deploy = Submission(self.user, workflow, self.fs, self.jt, self.properties)
                    workspace = sub_deploy.deploy()

                    self.job.override_subworkflow_id(action, workflow.id)  # For displaying the correct graph
                    self.properties['workspace_%s' % workflow.uuid] = workspace  # For pointing to the correct workspace

                elif action_type == 'altus':
                    # Launcher script: runs the generated altus.py with the workspace on PYTHONPATH.
                    self._create_file(deployment_dir, action.data['name'] + '.sh', '''#!/usr/bin/env bash

export PYTHONPATH=`pwd`

echo 'Starting Altus command...'

python altus.py

''')

                    shell_script = self._generate_altus_action_script(
                        service=action.data['properties'].get('service'),
                        command=action.data['properties'].get('command'),
                        # Each argument is a 'key=value' string; only the first '=' splits.
                        arguments=dict(
                            arg.split('=', 1) for arg in action.data['properties'].get('arguments', [])),
                        auth_key_id=ALTUS.AUTH_KEY_ID.get(),
                        # Turn literal backslash-n sequences of the configured secret into newlines.
                        auth_key_secret=ALTUS.AUTH_KEY_SECRET.get().replace('\\n', '\n'))
                    self._create_file(deployment_dir, 'altus.py', shell_script)

                    ext_py_lib_path = os.path.join(get_desktop_root(), 'core', 'ext-py')
                    lib_dir_path = os.path.join(self.job.deployment_dir, 'lib')
                    libs = [
                        (os.path.join(ext_py_lib_path, 'navoptapi-0.1.0'), 'navoptapi'),
                        (os.path.join(ext_py_lib_path, 'navoptapi-0.1.0'), 'altuscli'),
                        (os.path.join(ext_py_lib_path, 'asn1crypto-0.24.0'), 'asn1crypto'),
                        (os.path.join(ext_py_lib_path, 'rsa-3.4.2'), 'rsa'),
                        (os.path.join(ext_py_lib_path, 'pyasn1-0.1.8'), 'pyasn1'),
                    ]
                    for source_path, name in libs:
                        destination_path = os.path.join(lib_dir_path, name)
                        if not self.fs.do_as_user(self.user, self.fs.exists, destination_path):
                            # Note: would be much faster to have only one zip archive
                            self.fs.do_as_user(self.user, self.fs.copyFromLocal,
                                               os.path.join(source_path, name), destination_path)

                elif action_type in ('impala', 'impala-document'):
                    from oozie.models2 import _get_impala_url
                    from impala.impala_flags import get_ssl_server_certificate

                    if action_type == 'impala-document':
                        from notebook.models import Notebook
                        # NOTE(review): an 'impala-document' without a uuid leaves script_name unset
                        # for the shell script below -- confirm such actions cannot occur.
                        if action.data['properties'].get('uuid'):
                            notebook = Notebook(document=Document2.objects.get_by_uuid(
                                user=self.user, uuid=action.data['properties']['uuid']))
                            statements = notebook.get_str()
                            statements = Template(statements).safe_substitute(**self.properties)
                            script_name = action.data['name'] + '.sql'
                            self._create_file(deployment_dir, script_name, statements)
                    else:
                        script_name = os.path.basename(action.data['properties'].get('script_path'))

                    if self.api.security_enabled:
                        kinit = 'kinit -k -t *.keytab %(user_principal)s' % {
                            'user_principal': self.properties.get(
                                'user_principal', action.data['properties'].get('user_principal'))
                        }
                    else:
                        kinit = ''

                    shell_script = """#!/bin/bash

# Needed to launch impala shell in oozie
export PYTHON_EGG_CACHE=./myeggs

%(kinit)s

impala-shell %(kerberos_option)s %(ssl_option)s -i %(impalad_host)s -f %(query_file)s""" % {
                        'impalad_host': action.data['properties'].get('impalad_host') or _get_impala_url(),
                        'kerberos_option': '-k' if self.api.security_enabled else '',
                        'ssl_option': '--ssl' if get_ssl_server_certificate() else '',
                        'query_file': script_name,
                        'kinit': kinit
                    }

                    self._create_file(deployment_dir, action.data['name'] + '.sh', shell_script)

                elif action_type == 'hive-document':
                    from notebook.models import Notebook
                    # Reset per action so a notebook from a previous node is never reused.
                    notebook = None
                    if action.data['properties'].get('uuid'):
                        notebook = Notebook(document=Document2.objects.get_by_uuid(
                            user=self.user, uuid=action.data['properties']['uuid']))
                        statements = notebook.get_str()
                    else:
                        statements = action.data['properties'].get('statements')

                    if self.properties.get('send_result_path'):
                        if notebook is not None:
                            query = '\n\n\n'.join(
                                [snippet['statement_raw'] for snippet in notebook.get_data()['snippets']])
                        else:
                            # Inline statements: there is no notebook to read snippets from.
                            query = statements
                        if query is not None:
                            statements = """
INSERT OVERWRITE DIRECTORY '%s'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   "separatorChar" = "\t",
   "quoteChar"     = "'",
   "escapeChar"    = "\\"
)
STORED AS TEXTFILE
%s""" % (self.properties.get('send_result_path'), query)

                    if statements is not None:
                        self._create_file(deployment_dir, action.data['name'] + '.sql', statements)

                elif action_type in ('java-document', 'java', 'mapreduce-document'):
                    if action_type in ('java-document', 'mapreduce-document'):
                        from notebook.models import Notebook
                        notebook = Notebook(document=Document2.objects.get_by_uuid(
                            user=self.user, uuid=action.data['properties']['uuid']))
                        jar_properties = notebook.get_data()['snippets'][0]['properties']
                    else:
                        jar_properties = action.data['properties']

                    if jar_properties.get('app_jar'):
                        LOG.debug("Adding to oozie.libpath %s" % jar_properties['app_jar'])
                        # The application jar goes first, ahead of any existing libpath.
                        paths = [jar_properties['app_jar']]
                        if self.properties.get('oozie.libpath'):
                            paths.append(self.properties['oozie.libpath'])
                        self.properties['oozie.libpath'] = ','.join(paths)

                elif action_type == 'pig-document':
                    from notebook.models import Notebook
                    notebook = Notebook(document=Document2.objects.get_by_uuid(
                        user=self.user, uuid=action.data['properties']['uuid']))
                    statements = notebook.get_data()['snippets'][0]['statement_raw']

                    self._create_file(deployment_dir, action.data['name'] + '.pig', statements)

                # Spark always qualifies; sqoop only when its statement contains the --hive-import flag.
                elif action_type in ('spark', 'spark-document') or (
                        action_type in ('sqoop', 'sqoop-document') and
                        '--hive-import' in action.data['properties']['statement']):
                    ships_hive_site = [
                        f for f in action.data.get('properties').get('files', [])
                        if f.get('value').endswith('hive-site.xml')
                    ]
                    if not ships_hive_site:
                        hive_site_lib = Hdfs.join(deployment_dir + '/lib/', 'hive-site.xml')
                        hive_site_content = get_hive_site_content()
                        # Only create the file when it is missing and there is content to write.
                        if not self.fs.do_as_user(self.user, self.fs.exists, hive_site_lib) and hive_site_content:
                            self.fs.do_as_user(self.user, self.fs.create, hive_site_lib, overwrite=True,
                                               permission=0o700, data=smart_str(hive_site_content))

                # Independent of the chain above: applies to every sqoop action.
                if action_type in ('sqoop', 'sqoop-document'):
                    jdbc_libs_path = CONFIG_JDBC_LIBS_PATH.get()
                    if jdbc_libs_path and jdbc_libs_path not in self.properties.get('oozie.libpath', ''):
                        LOG.debug("Adding to oozie.libpath %s" % jdbc_libs_path)
                        paths = [jdbc_libs_path]
                        if self.properties.get('oozie.libpath'):
                            paths.append(self.properties['oozie.libpath'])
                        self.properties['oozie.libpath'] = ','.join(paths)

        oozie_xml = self.job.to_xml(self.properties)
        self._do_as(self.user.username, self._copy_files, deployment_dir, oozie_xml, self.properties)

        return deployment_dir