示例#1
0
class Submission(object):
  """
  Represents one unique Oozie submission.

  Actions are:
  - submit
  - rerun

  The instance keeps the Oozie API client created for `user`, the (optional)
  job model to deploy and the `properties` dictionary that is sent to Oozie
  and updated in place by the methods below.
  """
  def __init__(self, user, job=None, fs=None, jt=None, properties=None, oozie_id=None, local_tz=None):
    """
    :param user: user the Oozie API client is created for.
    :param job: job model to deploy and submit; may be None when only acting on an existing `oozie_id`.
    :param fs: filesystem handle; stored and handed over to nested sub-workflow submissions.
    :param jt: deprecated with YARN, only kept and passed along.
    :param properties: dict of submission properties. It is kept by reference and modified in place.
    :param oozie_id: id of an already submitted Oozie job, if any.
    :param local_tz: timezone of the date properties. When set and the job carries a dict `data`,
                     it is replaced by the timezone stored in the job properties.
    """
    self.job = job
    self.user = user
    self.fs = fs
    self.jt = jt # Deprecated with YARN, we now use logical names only for RM
    self.oozie_id = oozie_id
    self.api = get_oozie(self.user)

    # Keep the caller's dict (not a copy): every update below is done on that same object
    self.properties = properties if properties is not None else {}

    # `job` is optional, so only look at its data when there is one
    if local_tz and isinstance(getattr(self.job, 'data', None), dict):
      local_tz = self.job.data.get('properties')['timezone']

    # Modify start_date & end_date only when it's a coordinator
    from oozie.models2 import Coordinator
    if type(self.job) is Coordinator:
      for date_name in ('start_date', 'end_date'):
        if date_name in self.properties:
          self.properties[date_name] = convert_to_server_timezone(self.properties[date_name], local_tz)

    if 'nominal_time' in self.properties:
      self.properties['nominal_time'] = convert_to_server_timezone(self.properties['nominal_time'], local_tz)

    self.properties['security_enabled'] = self.api.security_enabled

  def __str__(self):
    """Describe the submission by its Oozie id when there is one, by its job model otherwise."""
    if self.oozie_id:
      # The id is deliberately present twice, the text is the same as it always was
      return "Submission for job '%s'." % (self.oozie_id,) + " -- " + self.oozie_id

    return "Submission for job '%s' (id %s, owner %s)." % (self.job.name, self.job.id, self.user)

  @submit_dryrun
  def run(self, deployment_dir=None):
    """
    Take care of all the actions of submitting a Oozie workflow.
    Returns the oozie job id if all goes well.

    `deployment_dir` is not read in this body.
    NOTE(review): presumably it is consumed by the `submit_dryrun` decorator, to be confirmed.
    """

    if self.properties and 'oozie.use.system.libpath' not in self.properties:
      self.properties['oozie.use.system.libpath'] = 'true'

    self.oozie_id = self.api.submit_job(self.properties)
    LOG.info("Submitted: %s" % (self,))

    # Only workflows are explicitly started after the submission
    if self._is_workflow():
      self.api.job_control(self.oozie_id, 'start')
      LOG.info("Started: %s" % (self,))

    return self.oozie_id

  def rerun(self, deployment_dir, fail_nodes=None, skip_nodes=None):
    """
    Rerun the workflow `self.oozie_id` from `deployment_dir`.

    :param fail_nodes: when truthy, sent as 'oozie.wf.rerun.failnodes'.
    :param skip_nodes: sent as 'oozie.wf.rerun.skip.nodes' when `fail_nodes` is empty. If both are
                       empty, 'oozie.wf.rerun.failnodes' is set to 'false'.
    :returns: the Oozie job id.
    """
    jt_address = cluster.get_cluster_addr_for_job_submission()

    self._update_properties(jt_address, deployment_dir)
    self.properties['oozie.wf.application.path'] = deployment_dir

    # A workflow rerun must not carry the application path of a coordinator or of a bundle
    self.properties.pop('oozie.coord.application.path', None)
    self.properties.pop('oozie.bundle.application.path', None)

    if fail_nodes:
      self.properties['oozie.wf.rerun.failnodes'] = fail_nodes
    elif not skip_nodes:
      self.properties['oozie.wf.rerun.failnodes'] = 'false' # Case empty 'skip_nodes' list
    else:
      self.properties['oozie.wf.rerun.skip.nodes'] = skip_nodes

    self.api.rerun(self.oozie_id, properties=self.properties)

    LOG.info("Rerun: %s" % (self,))

    return self.oozie_id


  def rerun_coord(self, deployment_dir, params):
    """Send a 'coord-rerun' for `self.oozie_id` with `params` and return the Oozie job id."""
    jt_address = cluster.get_cluster_addr_for_job_submission()

    self._update_properties(jt_address, deployment_dir)
    self.properties['oozie.coord.application.path'] = deployment_dir

    self.api.job_control(self.oozie_id, action='coord-rerun', properties=self.properties, parameters=params)
    LOG.info("Rerun: %s" % (self,))

    return self.oozie_id

  def update_coord(self):
    """Send an 'update' for `self.oozie_id` with the current properties and return the Oozie job id."""
    # The client is replaced by a "v2" one, which is also kept for the following calls
    self.api = get_oozie(self.user, api_version="v2")
    self.api.job_control(self.oozie_id, action='update', properties=self.properties, parameters=None)
    LOG.info("Update: %s" % (self,))

    return self.oozie_id

  def rerun_bundle(self, deployment_dir, params):
    """Send a 'bundle-rerun' for `self.oozie_id` with `params` and return the Oozie job id."""
    jt_address = cluster.get_cluster_addr_for_job_submission()

    self._update_properties(jt_address, deployment_dir)
    self.properties['oozie.bundle.application.path'] = deployment_dir
    self.api.job_control(self.oozie_id, action='bundle-rerun', properties=self.properties, parameters=params)
    LOG.info("Rerun: %s" % (self,))

    return self.oozie_id


  def deploy(self, deployment_dir=None):
    """
    Create the files needed by the actions of the job, then copy them with the job XML.

    :param deployment_dir: directory to deploy into; when empty, one is created with `_create_deployment_dir()`.
    :returns: the deployment directory that was used.
    :raises PopupException: when the deployment directory cannot be created.
    """
    try:
      if not deployment_dir:
        deployment_dir = self._create_deployment_dir()
    except Exception as ex:
      # Interpolate after the translation, otherwise the lookup is done on an already formatted message
      msg = _("Failed to create deployment directory: %s") % ex
      LOG.exception(msg)
      raise PopupException(message=msg, detail=str(ex))

    if self.api.security_enabled:
      jt_address = cluster.get_cluster_addr_for_job_submission()
      self._update_properties(jt_address) # Needed for coordinator deploying workflows with credentials

    if hasattr(self.job, 'nodes'):
      for action in self.job.nodes:
        action_type = action.data['type']

        # Make sure XML is there
        # Don't support more than one level sub-workflow
        if action_type == 'subworkflow':
          from oozie.models2 import Workflow
          workflow = Workflow(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['workflow']))
          # The nested submission shares this very same properties dict
          sub_deploy = Submission(self.user, workflow, self.fs, self.jt, self.properties)
          workspace = sub_deploy.deploy()

          self.job.override_subworkflow_id(action, workflow.id) # For displaying the correct graph
          self.properties['workspace_%s' % workflow.uuid] = workspace # For pointing to the correct workspace

        elif action_type in ('impala', 'impala-document'):
          from oozie.models2 import _get_impala_url
          from impala.impala_flags import get_ssl_server_certificate

          if action_type == 'impala-document':
            from notebook.models import Notebook
            # Always name the query file. Without a notebook uuid nothing is written, but the name
            # used to be unbound (or left over from a previous action) when building the launcher.
            script_name = action.data['name'] + '.sql'
            if action.data['properties'].get('uuid'):
              notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
              statements = Template(notebook.get_str()).safe_substitute(**self.properties)
              self._create_file(deployment_dir, script_name, statements)
          else:
            script_name = os.path.basename(action.data['properties'].get('script_path'))

          if self.api.security_enabled:
            kinit = 'kinit -k -t *.keytab %(user_principal)s' % {
              'user_principal': self.properties.get('user_principal', action.data['properties'].get('user_principal'))
            }
          else:
            kinit = ''

          launcher_options = {
            'impalad_host': action.data['properties'].get('impalad_host') or _get_impala_url(),
            'kerberos_option': '-k' if self.api.security_enabled else '',
            'ssl_option': '--ssl' if get_ssl_server_certificate() else '',
            'query_file': script_name,
            'kinit': kinit
          }
          shell_script = """#!/bin/bash

# Needed to launch impala shell in oozie
export PYTHON_EGG_CACHE=./myeggs

%(kinit)s

impala-shell %(kerberos_option)s %(ssl_option)s -i %(impalad_host)s -f %(query_file)s""" % launcher_options

          self._create_file(deployment_dir, action.data['name'] + '.sh', shell_script)

        elif action_type == 'hive-document':
          from notebook.models import Notebook
          # Reset for each action: a notebook loaded for a previous action must never be reused
          notebook = None
          if action.data['properties'].get('uuid'):
            notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
            statements = notebook.get_str()
          else:
            statements = action.data['properties'].get('statements')

          if self.properties.get('send_result_path'):
            if notebook is not None:
              query = '\n\n\n'.join([snippet['statement_raw'] for snippet in notebook.get_data()['snippets']])
            else:
              # No notebook to read the snippets from: wrap the statements of the action itself
              query = statements

            if query is not None:
              # NOTE(review): "\t" and "\\" are interpreted by Python first, the statement gets a
              # literal tab and a single backslash. To be confirmed that Hive expects this.
              statements = """
INSERT OVERWRITE DIRECTORY '%s'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   "separatorChar" = "\t",
   "quoteChar"     = "'",
   "escapeChar"    = "\\"
)
STORED AS TEXTFILE %s""" % (self.properties.get('send_result_path'), query)

          if statements is not None:
            self._create_file(deployment_dir, action.data['name'] + '.sql', statements)

        elif action_type in ('java-document', 'java', 'mapreduce-document'):
          if action_type in ('java-document', 'mapreduce-document'):
            from notebook.models import Notebook
            notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
            # Document actions keep their settings in the first snippet of the notebook
            action_properties = notebook.get_data()['snippets'][0]['properties']
          else:
            action_properties = action.data['properties']

          if action_properties.get('app_jar'):
            LOG.debug("Adding to oozie.libpath %s" % action_properties['app_jar'])
            # The jar goes first, in front of the lib path that is already configured
            paths = [action_properties['app_jar']]
            if self.properties.get('oozie.libpath'):
              paths.append(self.properties['oozie.libpath'])
            self.properties['oozie.libpath'] = ','.join(paths)

        elif action_type == 'pig-document':
          from notebook.models import Notebook
          notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
          statements = notebook.get_data()['snippets'][0]['statement_raw']

          self._create_file(deployment_dir, action.data['name'] + '.pig', statements)

    oozie_xml = self.job.to_xml(self.properties)
    self._do_as(self.user.username, self._copy_files, deployment_dir, oozie_xml, self.properties)

    return deployment_dir
示例#2
0
class Submission(object):
    """
    Represents one unique Oozie submission.

    Actions are:
    - submit
    - rerun

    The instance keeps the Oozie API client created for `user`, the (optional)
    job model to deploy and the `properties` dictionary that is sent to Oozie
    and updated in place by the methods below.
    """
    def __init__(self,
                 user,
                 job=None,
                 fs=None,
                 jt=None,
                 properties=None,
                 oozie_id=None,
                 local_tz=None):
        """
        :param user: user the Oozie API client is created for.
        :param job: job model to deploy and submit; may be None when only
                    acting on an existing `oozie_id`.
        :param fs: filesystem client, used by `deploy()` to check and write
                   files as `user`.
        :param jt: deprecated with YARN, only kept and passed along.
        :param properties: dict of submission properties. It is kept by
                           reference and modified in place.
        :param oozie_id: id of an already submitted Oozie job, if any.
        :param local_tz: timezone of the date properties. When set and the job
                         carries a dict `data`, it is replaced by the timezone
                         stored in the job properties.
        """
        self.job = job
        self.user = user
        self.fs = fs
        self.jt = jt  # Deprecated with YARN, we now use logical names only for RM
        self.oozie_id = oozie_id
        self.api = get_oozie(self.user)

        # Keep the caller's dict (not a copy): every update below is done on
        # that same object
        self.properties = properties if properties is not None else {}

        # `job` is optional, so only look at its data when there is one
        if local_tz and isinstance(getattr(self.job, 'data', None), dict):
            local_tz = self.job.data.get('properties')['timezone']

        # Modify start_date & end_date only when it's a coordinator
        from oozie.models2 import Coordinator
        if type(self.job) is Coordinator:
            for date_name in ('start_date', 'end_date'):
                if date_name in self.properties:
                    self.properties[date_name] = convert_to_server_timezone(
                        self.properties[date_name], local_tz)

        if 'nominal_time' in self.properties:
            self.properties['nominal_time'] = convert_to_server_timezone(
                self.properties['nominal_time'], local_tz)

        self.properties['security_enabled'] = self.api.security_enabled

    def __str__(self):
        """Describe the submission by its Oozie id when there is one, by its job model otherwise."""
        if self.oozie_id:
            # The id is deliberately present twice, the text is the same as it
            # always was
            return "Submission for job '%s'." % (
                self.oozie_id, ) + " -- " + self.oozie_id

        return "Submission for job '%s' (id %s, owner %s)." % (
            self.job.name, self.job.id, self.user)

    @submit_dryrun
    def run(self, deployment_dir=None):
        """
        Take care of all the actions of submitting a Oozie workflow.
        Returns the oozie job id if all goes well.

        `deployment_dir` is not read in this body.
        NOTE(review): presumably it is consumed by the `submit_dryrun`
        decorator, to be confirmed.
        """

        if self.properties and 'oozie.use.system.libpath' not in self.properties:
            self.properties['oozie.use.system.libpath'] = 'true'

        self.oozie_id = self.api.submit_job(self.properties)
        LOG.info("Submitted: %s" % (self, ))

        # Only workflows are explicitly started after the submission
        if self._is_workflow():
            self.api.job_control(self.oozie_id, 'start')
            LOG.info("Started: %s" % (self, ))

        return self.oozie_id

    def rerun(self, deployment_dir, fail_nodes=None, skip_nodes=None):
        """
        Rerun the workflow `self.oozie_id` from `deployment_dir`.

        :param fail_nodes: when truthy, sent as 'oozie.wf.rerun.failnodes'.
        :param skip_nodes: sent as 'oozie.wf.rerun.skip.nodes' when
                           `fail_nodes` is empty. If both are empty,
                           'oozie.wf.rerun.failnodes' is set to 'false'.
        :returns: the Oozie job id.
        """
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties['oozie.wf.application.path'] = deployment_dir

        # A workflow rerun must not carry the application path of a
        # coordinator or of a bundle
        self.properties.pop('oozie.coord.application.path', None)
        self.properties.pop('oozie.bundle.application.path', None)

        if fail_nodes:
            self.properties['oozie.wf.rerun.failnodes'] = fail_nodes
        elif not skip_nodes:
            # Case empty 'skip_nodes' list
            self.properties['oozie.wf.rerun.failnodes'] = 'false'
        else:
            self.properties['oozie.wf.rerun.skip.nodes'] = skip_nodes

        self.api.rerun(self.oozie_id, properties=self.properties)

        LOG.info("Rerun: %s" % (self, ))

        return self.oozie_id

    def rerun_coord(self, deployment_dir, params):
        """Send a 'coord-rerun' for `self.oozie_id` with `params` and return the Oozie job id."""
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties['oozie.coord.application.path'] = deployment_dir

        self.api.job_control(self.oozie_id,
                             action='coord-rerun',
                             properties=self.properties,
                             parameters=params)
        LOG.info("Rerun: %s" % (self, ))

        return self.oozie_id

    def update_coord(self):
        """Send an 'update' for `self.oozie_id` with the current properties and return the Oozie job id."""
        # The client is replaced by a "v2" one, which is also kept for the
        # following calls
        self.api = get_oozie(self.user, api_version="v2")
        self.api.job_control(self.oozie_id,
                             action='update',
                             properties=self.properties,
                             parameters=None)
        LOG.info("Update: %s" % (self, ))

        return self.oozie_id

    def rerun_bundle(self, deployment_dir, params):
        """Send a 'bundle-rerun' for `self.oozie_id` with `params` and return the Oozie job id."""
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties['oozie.bundle.application.path'] = deployment_dir
        self.api.job_control(self.oozie_id,
                             action='bundle-rerun',
                             properties=self.properties,
                             parameters=params)
        LOG.info("Rerun: %s" % (self, ))

        return self.oozie_id

    def deploy(self, deployment_dir=None):
        """
        Create the files needed by the actions of the job, then copy them with the job XML.

        :param deployment_dir: directory to deploy into; when empty, one is
                               created with `_create_deployment_dir()`.
        :returns: the deployment directory that was used.
        :raises PopupException: when the deployment directory cannot be created.
        """
        try:
            if not deployment_dir:
                deployment_dir = self._create_deployment_dir()
        except Exception as ex:
            # Interpolate after the translation, otherwise the lookup is done
            # on an already formatted message
            msg = _("Failed to create deployment directory: %s") % ex
            LOG.exception(msg)
            raise PopupException(message=msg, detail=str(ex))

        if self.api.security_enabled:
            jt_address = cluster.get_cluster_addr_for_job_submission()
            # Needed for coordinator deploying workflows with credentials
            self._update_properties(jt_address)

        if hasattr(self.job, 'nodes'):
            for action in self.job.nodes:
                action_type = action.data['type']

                # Make sure XML is there
                # Don't support more than one level sub-workflow
                if action_type == 'subworkflow':
                    from oozie.models2 import Workflow
                    workflow = Workflow(document=Document2.objects.get_by_uuid(
                        user=self.user,
                        uuid=action.data['properties']['workflow']))
                    # The nested submission shares this very same properties dict
                    sub_deploy = Submission(self.user, workflow, self.fs,
                                            self.jt, self.properties)
                    workspace = sub_deploy.deploy()

                    # For displaying the correct graph
                    self.job.override_subworkflow_id(action, workflow.id)
                    # For pointing to the correct workspace
                    self.properties['workspace_%s' % workflow.uuid] = workspace

                elif action_type == 'altus':
                    # Launcher that runs the generated altus.py from the
                    # working directory of the action
                    self._create_file(
                        deployment_dir, action.data['name'] + '.sh',
                        '''#!/usr/bin/env bash

export PYTHONPATH=`pwd`

echo 'Starting Altus command...'

python altus.py

          ''')

                    shell_script = self._generate_altus_action_script(
                        service=action.data['properties'].get('service'),
                        command=action.data['properties'].get('command'),
                        # Each argument is a 'name=value' string, split on the
                        # first '=' only
                        arguments=dict(
                            arg.split('=', 1)
                            for arg in action.data['properties'].get(
                                'arguments', [])),
                        auth_key_id=ALTUS.AUTH_KEY_ID.get(),
                        # The configured secret has its new lines escaped
                        auth_key_secret=ALTUS.AUTH_KEY_SECRET.get().replace(
                            '\\n', '\n'))
                    self._create_file(deployment_dir, 'altus.py', shell_script)

                    ext_py_lib_path = os.path.join(get_desktop_root(), 'core',
                                                   'ext-py')
                    # NOTE(review): the libraries go to the deployment dir of
                    # the job, not to the `deployment_dir` argument. To be
                    # confirmed that both are always the same.
                    lib_dir_path = os.path.join(self.job.deployment_dir, 'lib')
                    # (local source directory, package to copy from it)
                    libs = [
                        (os.path.join(ext_py_lib_path,
                                      'navoptapi-0.1.0'), 'navoptapi'),
                        (os.path.join(ext_py_lib_path,
                                      'navoptapi-0.1.0'), 'altuscli'),
                        (os.path.join(ext_py_lib_path,
                                      'asn1crypto-0.24.0'), 'asn1crypto'),
                        (os.path.join(ext_py_lib_path, 'rsa-3.4.2'), 'rsa'),
                        (os.path.join(ext_py_lib_path,
                                      'pyasn1-0.1.8'), 'pyasn1'),
                    ]
                    for source_path, name in libs:
                        destination_path = os.path.join(lib_dir_path, name)
                        if not self.fs.do_as_user(self.user, self.fs.exists,
                                                  destination_path):
                            # Note: would be much faster to have only one zip archive
                            self.fs.do_as_user(self.user,
                                               self.fs.copyFromLocal,
                                               os.path.join(source_path, name),
                                               destination_path)

                elif action_type in ('impala', 'impala-document'):
                    from oozie.models2 import _get_impala_url
                    from impala.impala_flags import get_ssl_server_certificate

                    if action_type == 'impala-document':
                        from notebook.models import Notebook
                        # Always name the query file. Without a notebook uuid
                        # nothing is written, but the name used to be unbound
                        # (or left over from a previous action) when building
                        # the launcher.
                        script_name = action.data['name'] + '.sql'
                        if action.data['properties'].get('uuid'):
                            notebook = Notebook(
                                document=Document2.objects.get_by_uuid(
                                    user=self.user,
                                    uuid=action.data['properties']['uuid']))
                            statements = Template(
                                notebook.get_str()).safe_substitute(
                                    **self.properties)
                            self._create_file(deployment_dir, script_name,
                                              statements)
                    else:
                        script_name = os.path.basename(
                            action.data['properties'].get('script_path'))

                    if self.api.security_enabled:
                        kinit = 'kinit -k -t *.keytab %(user_principal)s' % {
                            'user_principal':
                            self.properties.get(
                                'user_principal',
                                action.data['properties'].get(
                                    'user_principal'))
                        }
                    else:
                        kinit = ''

                    launcher_options = {
                        'impalad_host':
                        action.data['properties'].get('impalad_host')
                        or _get_impala_url(),
                        'kerberos_option':
                        '-k' if self.api.security_enabled else '',
                        'ssl_option':
                        '--ssl' if get_ssl_server_certificate() else '',
                        'query_file':
                        script_name,
                        'kinit':
                        kinit
                    }
                    shell_script = """#!/bin/bash

# Needed to launch impala shell in oozie
export PYTHON_EGG_CACHE=./myeggs

%(kinit)s

impala-shell %(kerberos_option)s %(ssl_option)s -i %(impalad_host)s -f %(query_file)s""" % launcher_options

                    self._create_file(deployment_dir,
                                      action.data['name'] + '.sh',
                                      shell_script)

                elif action_type == 'hive-document':
                    from notebook.models import Notebook
                    # Reset for each action: a notebook loaded for a previous
                    # action must never be reused
                    notebook = None
                    if action.data['properties'].get('uuid'):
                        notebook = Notebook(
                            document=Document2.objects.get_by_uuid(
                                user=self.user,
                                uuid=action.data['properties']['uuid']))
                        statements = notebook.get_str()
                    else:
                        statements = action.data['properties'].get(
                            'statements')

                    if self.properties.get('send_result_path'):
                        if notebook is not None:
                            query = '\n\n\n'.join([
                                snippet['statement_raw']
                                for snippet in notebook.get_data()['snippets']
                            ])
                        else:
                            # No notebook to read the snippets from: wrap the
                            # statements of the action itself
                            query = statements

                        if query is not None:
                            # NOTE(review): "\t" and "\\" are interpreted by
                            # Python first, the statement gets a literal tab
                            # and a single backslash. To be confirmed that
                            # Hive expects this.
                            statements = """
INSERT OVERWRITE DIRECTORY '%s'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   "separatorChar" = "\t",
   "quoteChar"     = "'",
   "escapeChar"    = "\\"
)
STORED AS TEXTFILE %s""" % (self.properties.get('send_result_path'), query)

                    if statements is not None:
                        self._create_file(deployment_dir,
                                          action.data['name'] + '.sql',
                                          statements)

                elif action_type in ('java-document', 'java',
                                     'mapreduce-document'):
                    if action_type in ('java-document', 'mapreduce-document'):
                        from notebook.models import Notebook
                        notebook = Notebook(
                            document=Document2.objects.get_by_uuid(
                                user=self.user,
                                uuid=action.data['properties']['uuid']))
                        # Document actions keep their settings in the first
                        # snippet of the notebook
                        action_properties = notebook.get_data(
                        )['snippets'][0]['properties']
                    else:
                        action_properties = action.data['properties']

                    if action_properties.get('app_jar'):
                        LOG.debug("Adding to oozie.libpath %s" %
                                  action_properties['app_jar'])
                        # The jar goes first, in front of the lib path that is
                        # already configured
                        paths = [action_properties['app_jar']]
                        if self.properties.get('oozie.libpath'):
                            paths.append(self.properties['oozie.libpath'])
                        self.properties['oozie.libpath'] = ','.join(paths)

                elif action_type == 'pig-document':
                    from notebook.models import Notebook
                    notebook = Notebook(document=Document2.objects.get_by_uuid(
                        user=self.user,
                        uuid=action.data['properties']['uuid']))
                    statements = notebook.get_data(
                    )['snippets'][0]['statement_raw']

                    self._create_file(deployment_dir,
                                      action.data['name'] + '.pig', statements)

                # Spark actions, and Sqoop actions importing into Hive, need a
                # hive-site.xml. The test used to be reversed (`statement in
                # '--hive-import'`) and so was false for any real Sqoop
                # statement containing the option.
                elif action_type in ('spark', 'spark-document') or (
                        action_type in ('sqoop', 'sqoop-document')
                        and '--hive-import'
                        in action.data['properties']['statement']):
                    hive_site_files = [
                        f for f in action.data.get('properties').get(
                            'files', [])
                        if f.get('value').endswith('hive-site.xml')
                    ]
                    # Only provide one when the action does not ship its own
                    if not hive_site_files:
                        hive_site_lib = Hdfs.join(deployment_dir + '/lib/',
                                                  'hive-site.xml')
                        hive_site_content = get_hive_site_content()
                        if not self.fs.do_as_user(
                                self.user, self.fs.exists,
                                hive_site_lib) and hive_site_content:
                            # 0o700 is the same mode as the former 0700, in a
                            # spelling valid from Python 2.6 on
                            self.fs.do_as_user(
                                self.user,
                                self.fs.create,
                                hive_site_lib,
                                overwrite=True,
                                permission=0o700,
                                data=smart_str(hive_site_content))
                    # NOTE(review): being inside this branch, the JDBC libs are
                    # only added for the Sqoop actions that import into Hive.
                    # To be confirmed that the other ones do not need them.
                    if action_type in ('sqoop', 'sqoop-document'):
                        jdbc_libs_path = CONFIG_JDBC_LIBS_PATH.get()
                        if jdbc_libs_path and jdbc_libs_path not in self.properties.get(
                                'oozie.libpath', ''):
                            LOG.debug("Adding to oozie.libpath %s" %
                                      jdbc_libs_path)
                            paths = [jdbc_libs_path]
                            if self.properties.get('oozie.libpath'):
                                paths.append(self.properties['oozie.libpath'])
                            self.properties['oozie.libpath'] = ','.join(paths)

        oozie_xml = self.job.to_xml(self.properties)
        self._do_as(self.user.username, self._copy_files, deployment_dir,
                    oozie_xml, self.properties)

        return deployment_dir