Example No. 1
    def run_job(self, job_execution):
        ctx = context.ctx()

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(
            job_execution, job)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(
                job_execution.job_configs))

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(job, self.cluster,
                                                   updated_job_configs,
                                                   input_source, output_source,
                                                   hdfs_user)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
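The comment at the top of this example describes the contract of job_utils.resolve_data_source_references: the configs are copied only when a reference is actually substituted, otherwise the original object comes back. A minimal, self-contained sketch of that copy-versus-reference behaviour, with made-up source names and URLs (this is an illustration, not the Sahara helper itself):

import copy

# Toy stand-in for job_utils.resolve_data_source_references, illustrating the
# copy-versus-reference contract from the comment above; reference names and
# URLs here are made up.
def resolve_refs_sketch(job_configs, known_sources):
    found, updated = [], job_configs
    for i, arg in enumerate(job_configs.get('args', [])):
        if arg in known_sources:                      # a "name or uuid" reference
            if updated is job_configs:
                updated = copy.deepcopy(job_configs)  # copy only on first match
            updated['args'][i] = known_sources[arg]   # resolve to a path
            found.append(arg)
    return found, updated

configs = {'args': ['my-input', 'literal-arg']}
sources = {'my-input': 'hdfs://namenode/user/demo/input'}
found, updated = resolve_refs_sketch(configs, sources)
assert found == ['my-input'] and updated is not configs

same = {'args': ['literal-arg']}
assert resolve_refs_sketch(same, sources)[1] is same  # no copy when nothing matches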
Example No. 2
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, edp.JOB_TYPE_JAVA):
    input_source = conductor.data_source_get(ctx, job_execution.input_id)
        output_source = conductor.data_source_get(ctx, job_execution.output_id)
    else:
        input_source = None
        output_source = None
    # TODO(nprivalova): should be removed after all features are implemented
    validate(input_source, output_source, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    wf_xml = creator.get_workflow_xml(cluster, job_execution,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server,
                                            wf_dir, wf_xml, hdfs_user)

    rm_path = _get_resource_manager_path(cluster)
    nn_path = cluster['info']['HDFS']['NameNode']

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/",
                           _get_oozie_server(cluster))
    job_parameters = {"jobTracker": rm_path,
                      "nameNode": nn_path,
                      "user.name": hdfs_user,
                      "oozie.wf.application.path":
                      "%s%s" % (nn_path, path_to_workflow),
                      "oozie.use.system.libpath": "true"}

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(ctx, job_execution,
                                                   {'oozie_job_id':
                                                    oozie_job_id,
                                                    'start_time':
                                                    datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)

    return job_execution
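The job_parameters dict built above is handed to x.create_hadoop_xml before submission. The real Sahara helper is not shown in this listing; the following self-contained sketch only illustrates the standard Hadoop <configuration>/<property> format such a helper would plausibly emit:

from xml.sax.saxutils import escape

# Rough illustration of Hadoop-style configuration XML for a parameter dict;
# not the actual x.create_hadoop_xml implementation.
def hadoop_xml_sketch(params):
    props = "".join(
        "  <property>\n    <name>%s</name>\n    <value>%s</value>\n  </property>\n"
        % (escape(str(k)), escape(str(v)))
        for k, v in sorted(params.items()))
    return "<configuration>\n%s</configuration>\n" % props

print(hadoop_xml_sketch({"user.name": "hadoop",
                         "oozie.use.system.libpath": "true"}))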
Example No. 3
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, 'Java'):
    input_source = conductor.data_source_get(ctx, job_execution.input_id)
        output_source = conductor.data_source_get(ctx, job_execution.output_id)
    else:
        input_source = None
        output_source = None
    # TODO(nprivalova): should be removed after all features are implemented
    validate(input_source, output_source, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    wf_xml = creator.get_workflow_xml(cluster, job_execution,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server,
                                            wf_dir, wf_xml, hdfs_user)

    rm_path = _get_resource_manager_path(cluster)
    nn_path = cluster['info']['HDFS']['NameNode']

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/",
                           _get_oozie_server(cluster))
    job_parameters = {"jobTracker": rm_path,
                      "nameNode": nn_path,
                      "user.name": hdfs_user,
                      "oozie.wf.application.path":
                      "%s%s" % (nn_path, path_to_workflow),
                      "oozie.use.system.libpath": "true"}

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(ctx, job_execution,
                                                   {'oozie_job_id':
                                                    oozie_job_id,
                                                    'start_time':
                                                    datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)

    return job_execution
Example No. 4
    def test_configure_cluster_for_hdfs(self, mock_helper, mock_get, mock_six):
        inst = mock.MagicMock()
        inst.remote = mock.MagicMock()
        mock_six.return_value = 111
        str1 = '/tmp/etc-hosts-update.111'
        str2 = ('cat /tmp/etc-hosts-update.111 /etc/hosts | sort | uniq > '
                '/tmp/etc-hosts.111 && cat /tmp/etc-hosts.111 > '
                '/etc/hosts && rm -f /tmp/etc-hosts.111 '
                '/tmp/etc-hosts-update.111')
        mock_get.return_value = [inst]
        helper.configure_cluster_for_hdfs(self.cluster, "www.host.ru")
        inst.remote.assert_has_calls(
            [mock.call(), mock.call().__enter__(),
             mock.call().__enter__().write_file_to(str1, mock_helper()),
             mock.call().__enter__().execute_command(str2, run_as_root=True),
             mock.call().__exit__(None, None, None)])
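The test pins the unique suffix to 111 via mock_six and hard-codes the remote command that merges the uploaded hosts snippet into /etc/hosts. A small self-contained sketch of how those two strings could be built for an arbitrary suffix, just to make the sort|uniq merge intent explicit (this mirrors the asserted strings, it is not the helper under test):

# Builds the temp-file path and the /etc/hosts merge command asserted above.
def etc_hosts_update_commands(suffix):
    update = '/tmp/etc-hosts-update.%s' % suffix
    merged = '/tmp/etc-hosts.%s' % suffix
    merge_cmd = ('cat %(update)s /etc/hosts | sort | uniq > %(merged)s && '
                 'cat %(merged)s > /etc/hosts && '
                 'rm -f %(merged)s %(update)s'
                 % {'update': update, 'merged': merged})
    return update, merge_cmd

str1, str2 = etc_hosts_update_commands(111)
assert str1 == '/tmp/etc-hosts-update.111'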
Example No. 5
    def test_configure_cluster_for_hdfs(self, mock_helper, mock_get, mock_six):
        inst = mock.MagicMock()
        inst.remote = mock.MagicMock()
        mock_six.return_value = 111
        str1 = '/tmp/etc-hosts-update.111'
        str2 = ('cat /tmp/etc-hosts-update.111 /etc/hosts | sort | uniq > '
                '/tmp/etc-hosts.111 && cat /tmp/etc-hosts.111 > '
                '/etc/hosts && rm -f /tmp/etc-hosts.111 '
                '/tmp/etc-hosts-update.111')
        mock_get.return_value = [inst]
        helper.configure_cluster_for_hdfs(self.cluster, "www.host.ru")
        inst.remote.assert_has_calls(
            [mock.call(), mock.call().__enter__(),
             mock.call().__enter__().write_file_to(str1, mock_helper()),
             mock.call().__enter__().execute_command(str2, run_as_root=True),
             mock.call().__exit__(None, None, None)])
Example No. 6
    def run_job(self, job_execution):
        ctx = context.ctx()

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(job_execution,
                                                                 job)
        proxy_configs = job_execution.job_configs.get('proxy_configs')

        for data_source in [input_source, output_source]:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, job_execution, input_source, output_source,
            hdfs_user)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example No. 7
    def run_job(self, job_execution):
        ctx = context.ctx()

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(job_execution,
                                                                 job)

        for data_source in [input_source, output_source]:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        hdfs_user = self.plugin.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the plugin api now.
        # However, other engines may need it.
        oozie_server = self.plugin.get_oozie_server(self.cluster)

        wf_dir = job_utils.create_hdfs_workflow_dir(oozie_server,
                                                    job, hdfs_user)
        job_utils.upload_job_files_to_hdfs(oozie_server, wf_dir,
                                           job, hdfs_user)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, job_execution, input_source, output_source)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example No. 8
def _run_job(job_execution_id):
    ctx = context.ctx()

    job_execution = conductor.job_execution_get(ctx, job_execution_id)

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return

    job_execution = _update_job_execution_extra(job_execution, cluster)

    job = conductor.job_get(ctx, job_execution.job_id)
    input_source, output_source = _get_data_sources(job_execution, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    plugin = _get_plugin(cluster)
    hdfs_user = plugin.get_hdfs_user()
    oozie_server = plugin.get_oozie_server(cluster)

    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    wf_xml = workflow_factory.get_workflow_xml(
        job, cluster, job_execution, input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server,
                                            wf_dir, wf_xml, hdfs_user)

    client = _create_oozie_client(cluster)
    job_params = _get_oozie_job_params(cluster, hdfs_user, path_to_workflow)
    oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                  job_execution)
    job_execution = conductor.job_execution_update(
        ctx, job_execution, {'oozie_job_id': oozie_job_id,
                             'start_time': datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)
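Examples 2 and 3 build the Oozie job properties inline, while this version delegates to _get_oozie_job_params. A hedged sketch of what such a helper plausibly returns, modelled only on the inline job_parameters dict shown earlier in this listing (the helper name and signature here are illustrative, not the project's actual code):

# Hypothetical _get_oozie_job_params-style helper, shaped after the inline
# dict in examples 2 and 3.
def oozie_job_params_sketch(rm_path, nn_path, hdfs_user, path_to_workflow):
    return {"jobTracker": rm_path,
            "nameNode": nn_path,
            "user.name": hdfs_user,
            "oozie.wf.application.path": "%s%s" % (nn_path, path_to_workflow),
            "oozie.use.system.libpath": "true"}

params = oozie_job_params_sketch("rm-host:8032", "hdfs://nn-host:8020",
                                 "hadoop", "/user/hadoop/wf-dir")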
Example No. 9
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)
        indep_params = {}
        data_source_urls = {}
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(
                job_execution.job_configs, job_execution.id, data_source_urls)
        )

        job_execution = conductor.job_execution_update(
            ctx, job_execution, {"data_source_urls": data_source_urls})

        for data_source in additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        # This is needed in case we are working with the Spark plugin
        self.plugin_params['master'] = (
            self.plugin_params['master'] % {'host': self.master.hostname()})

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(self.master, '/tmp/spark-edp',
                                               job, job_execution.id, "700")
        paths, builtin_paths = self._upload_job_files(
            self.master, wf_dir, job, updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        indep_params["app_jar"] = paths.pop(0)
        indep_params["job_class"] = (
            updated_job_configs["configs"]["edp.java.main_class"])

        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the 'additional' jars
        if builtin_paths:
            indep_params["wrapper_jar"] = builtin_paths.pop(0)
            indep_params["wrapper_class"] = (
                'org.openstack.sahara.edp.SparkWrapper')
            wrapper_xml = self._upload_wrapper_xml(self.master,
                                                   wf_dir,
                                                   updated_job_configs)
            indep_params["wrapper_args"] = "%s %s" % (
                wrapper_xml, indep_params["job_class"])

            indep_params["addnl_jars"] = ",".join(
                [indep_params["app_jar"]] + paths + builtin_paths)

        else:
            indep_params["addnl_jars"] = ",".join(paths)

        # All additional jars are passed with the --jars option
        if indep_params["addnl_jars"]:
            indep_params["addnl_jars"] = (
                " --jars " + indep_params["addnl_jars"])

        # Launch the spark job using spark-submit and deploy_mode = client
        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        indep_params["args"] = updated_job_configs.get('args', [])
        indep_params["args"] = " ".join([su.inject_swift_url_suffix(arg)
                                         for arg in indep_params["args"]])
        if indep_params.get("args"):
            indep_params["args"] = (" " + indep_params["args"])

        mutual_dict = self.plugin_params.copy()
        mutual_dict.update(indep_params)
        if mutual_dict.get("wrapper_jar"):
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            cmd = (
                '%(spark-user)s%(spark-submit)s%(driver-class-path)s'
                ' --class %(wrapper_class)s%(addnl_jars)s'
                ' --master %(master)s'
                ' --deploy-mode %(deploy-mode)s'
                ' %(wrapper_jar)s %(wrapper_args)s%(args)s') % dict(
                mutual_dict)
        else:
            cmd = (
                '%(spark-user)s%(spark-submit)s'
                ' --class %(job_class)s%(addnl_jars)s'
                ' --master %(master)s'
                ' --deploy-mode %(deploy-mode)s'
                ' %(app_jar)s%(args)s') % dict(
                mutual_dict)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with remote.get_remote(self.master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod u+rwx,g+rx,o+rx %s" % wf_dir)
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
                % (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + self.master.id,
                    edp.JOB_STATUS_RUNNING,
                    {'spark-path': wf_dir})

        # Hmm, no exception but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(_("Spark job execution failed. Exit status = "
                           "%(status)s, stdout = %(stdout)s") %
                         {'status': ret, 'stdout': stdout})
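The cmd templates above are filled from plugin_params plus the per-job values. Rendering the non-wrapper template with made-up values shows the shape of the line that gets handed to ./launch_command (all values below are hypothetical, not taken from a real cluster):

# Same template as the non-wrapper branch above, rendered with example values.
template = ('%(spark-user)s%(spark-submit)s'
            ' --class %(job_class)s%(addnl_jars)s'
            ' --master %(master)s'
            ' --deploy-mode %(deploy-mode)s'
            ' %(app_jar)s%(args)s')
cmd = template % {
    'spark-user': 'sudo -u spark ',
    'spark-submit': '/opt/spark/bin/spark-submit',
    'job_class': 'org.example.WordCount',
    'addnl_jars': ' --jars lib1.jar,lib2.jar',
    'master': 'spark://master-host:7077',
    'deploy-mode': 'client',
    'app_jar': 'app.jar',
    'args': ' swift://container/input.txt',
}
print(cmd)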
Example No. 10
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)

        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs)
        )

        for data_source in additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        # We'll always run the driver program on the master
        master = plugin_utils.get_instance(self.cluster, "master")

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                               job_execution.id, "700")

        paths, builtin_paths = self._upload_job_files(
            master, wf_dir, job, updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        app_jar = paths.pop(0)
        job_class = updated_job_configs["configs"]["edp.java.main_class"]

        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the 'additional' jars
        if builtin_paths:
            wrapper_jar = builtin_paths.pop(0)
            wrapper_class = 'org.openstack.sahara.edp.SparkWrapper'
            wrapper_xml = self._upload_wrapper_xml(master,
                                                   wf_dir,
                                                   updated_job_configs)
            wrapper_args = "%s %s" % (wrapper_xml, job_class)

            additional_jars = ",".join([app_jar] + paths + builtin_paths)

        else:
            wrapper_jar = wrapper_class = wrapper_args = ""
            additional_jars = ",".join(paths)

        # All additional jars are passed with the --jars option
        if additional_jars:
            additional_jars = " --jars " + additional_jars

        # Launch the spark job using spark-submit and deploy_mode = client
        host = master.hostname()
        port = c_helper.get_config_value("Spark", "Master port", self.cluster)
        spark_submit = os.path.join(
            c_helper.get_config_value("Spark",
                                      "Spark home",
                                      self.cluster),
            "bin/spark-submit")

        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        args = updated_job_configs.get('args', [])
        args = " ".join([su.inject_swift_url_suffix(arg) for arg in args])
        if args:
            args = " " + args

        if wrapper_jar and wrapper_class:
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            cmd = (
                '%(spark_submit)s%(driver_cp)s'
                ' --class %(wrapper_class)s%(addnl_jars)s'
                ' --master spark://%(host)s:%(port)s'
                ' %(wrapper_jar)s %(wrapper_args)s%(args)s') % (
                {
                    "spark_submit": spark_submit,
                    "driver_cp": self.get_driver_classpath(),
                    "wrapper_class": wrapper_class,
                    "addnl_jars": additional_jars,
                    "host": host,
                    "port": port,
                    "wrapper_jar": wrapper_jar,
                    "wrapper_args": wrapper_args,
                    "args": args
                })
        else:
            cmd = (
                '%(spark_submit)s --class %(job_class)s%(addnl_jars)s'
                ' --master spark://%(host)s:%(port)s %(app_jar)s%(args)s') % (
                {
                    "spark_submit": spark_submit,
                    "job_class": job_class,
                    "addnl_jars": additional_jars,
                    "host": host,
                    "port": port,
                    "app_jar": app_jar,
                    "args": args
                })

        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with remote.get_remote(master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
                % (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + master.id,
                    edp.JOB_STATUS_RUNNING,
                    {'spark-path': wf_dir})

        # Hmm, no exception but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(_("Spark job execution failed. Exit status = "
                           "%(status)s, stdout = %(stdout)s") %
                         {'status': ret, 'stdout': stdout})
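On success the engine stores the backgrounded process as "<pid>@<instance_id>" and returns it as the job id. A trivial sketch of splitting that composite id back apart, as a later status or cancel call would presumably need to do (the id below is made up):

# Split the "<pid>@<instance_id>" job id used by the Spark engine above.
def split_spark_job_id(engine_job_id):
    pid, _, instance_id = engine_job_id.partition("@")
    return pid, instance_id

assert split_spark_job_id("12345@deadbeef-node-id") == ("12345", "deadbeef-node-id")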
Example No. 11
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)
        indep_params = {}
        data_source_urls = {}
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls))

        job_execution = conductor.job_execution_update(
            ctx, job_execution, {"data_source_urls": data_source_urls})

        for data_source in additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        # This is needed in case we are working with the Spark plugin
        self.plugin_params['master'] = (self.plugin_params['master'] % {
            'host': self.master.hostname()
        })

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(self.master, '/tmp/spark-edp',
                                               job, job_execution.id, "700")
        paths, builtin_paths = self._upload_job_files(self.master, wf_dir, job,
                                                      updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        indep_params["app_jar"] = paths.pop(0)
        indep_params["job_class"] = (
            updated_job_configs["configs"]["edp.java.main_class"])

        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the 'additional' jars
        if builtin_paths:
            indep_params["wrapper_jar"] = builtin_paths.pop(0)
            indep_params["wrapper_class"] = (
                'org.openstack.sahara.edp.SparkWrapper')
            wrapper_xml = self._upload_wrapper_xml(self.master, wf_dir,
                                                   updated_job_configs)
            indep_params["wrapper_args"] = "%s %s" % (
                wrapper_xml, indep_params["job_class"])

            indep_params["addnl_files"] = wrapper_xml

            indep_params["addnl_jars"] = ",".join(
                [indep_params["wrapper_jar"]] + paths + builtin_paths)

        else:
            indep_params["addnl_jars"] = ",".join(paths)

        # All additional jars are passed with the --jars option
        if indep_params["addnl_jars"]:
            indep_params["addnl_jars"] = (" --jars " +
                                          indep_params["addnl_jars"])

        # Launch the spark job using spark-submit and deploy_mode = client
        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        indep_params["args"] = updated_job_configs.get('args', [])
        indep_params["args"] = " ".join(
            [su.inject_swift_url_suffix(arg) for arg in indep_params["args"]])
        if indep_params.get("args"):
            indep_params["args"] = (" " + indep_params["args"])

        mutual_dict = self.plugin_params.copy()
        mutual_dict.update(indep_params)
        if mutual_dict.get("wrapper_jar"):
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            cmd = ('%(spark-user)s%(spark-submit)s%(driver-class-path)s'
                   ' --files %(addnl_files)s'
                   ' --class %(wrapper_class)s%(addnl_jars)s'
                   ' --master %(master)s'
                   ' --deploy-mode %(deploy-mode)s'
                   ' %(app_jar)s %(wrapper_args)s%(args)s') % dict(mutual_dict)
        else:
            cmd = ('%(spark-user)s%(spark-submit)s'
                   ' --class %(job_class)s%(addnl_jars)s'
                   ' --master %(master)s'
                   ' --deploy-mode %(deploy-mode)s'
                   ' %(app_jar)s%(args)s') % dict(mutual_dict)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with remote.get_remote(self.master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod u+rwx,g+rx,o+rx %s" % wf_dir)
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!" %
                (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + self.master.id,
                    edp.JOB_STATUS_RUNNING, {
                        'spark-path': wf_dir
                    })

        # Hmm, no exception but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(
            _("Spark job execution failed. Exit status = "
              "%(status)s, stdout = %(stdout)s") % {
                  'status': ret,
                  'stdout': stdout
              })
Example No. 12
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)
        # This will be a dictionary of tuples, (native_url, runtime_url)
        # keyed by data_source id
        data_source_urls = {}
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls,
                                                     self.cluster))

        job_execution = conductor.job_execution_update(
            ctx, job_execution,
            {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

        # Now that we've recorded the native urls, we can switch to the
        # runtime urls
        data_source_urls = job_utils.to_url_dict(data_source_urls,
                                                 runtime=True)

        for data_source in additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        # This is needed in case we are working with the Spark plugin
        self.plugin_params['master'] = (self.plugin_params['master'] % {
            'host': self.master.hostname()
        })

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(self.master, '/tmp/spark-edp',
                                               job, job_execution.id, "700")
        paths, builtin_paths = self._upload_job_files(self.master, wf_dir, job,
                                                      updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [
            os.path.basename(p) if p.startswith(wf_dir) else p for p in paths
        ]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        cmd = self._build_command(wf_dir, paths, builtin_paths,
                                  updated_job_configs)

        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with remote.get_remote(self.master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod u+rwx,g+rx,o+rx %s" % wf_dir)
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!" %
                (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id

            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + self.master.id,
                    edp.JOB_STATUS_RUNNING, {
                        'spark-path': wf_dir
                    })

        # Hmm, no exception but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(
            _("Spark job execution failed. Exit status = "
              "%(status)s, stdout = %(stdout)s") % {
                  'status': ret,
                  'stdout': stdout
              })
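This variant records data_source_urls as (native_url, runtime_url) tuples keyed by data_source id and then calls job_utils.to_url_dict twice, once to persist the native URLs and once with runtime=True. A self-contained toy version of what that helper appears to do, inferred only from the comment and the runtime=True call (the URLs are made up):

# Toy to_url_dict: pick the native or runtime element of each URL tuple.
def to_url_dict_sketch(data_source_urls, runtime=False):
    return {ds_id: urls[1] if runtime else urls[0]
            for ds_id, urls in data_source_urls.items()}

urls = {"ds-1": ("swift://container/path", "swift://container.sahara/path")}
assert to_url_dict_sketch(urls)["ds-1"] == "swift://container/path"
assert to_url_dict_sketch(urls, runtime=True)["ds-1"].endswith(".sahara/path")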
Example No. 13
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)

        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(
                job_execution.job_configs))

        for data_source in additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        # We'll always run the driver program on the master
        master = plugin_utils.get_instance(self.cluster, "master")

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                               job_execution.id, "700")

        paths, builtin_paths = self._upload_job_files(master, wf_dir, job,
                                                      updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        app_jar = paths.pop(0)
        job_class = updated_job_configs["configs"]["edp.java.main_class"]

        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the 'additional' jars
        if builtin_paths:
            wrapper_jar = builtin_paths.pop(0)
            wrapper_class = 'org.openstack.sahara.edp.SparkWrapper'
            wrapper_xml = self._upload_wrapper_xml(master, wf_dir,
                                                   updated_job_configs)
            wrapper_args = "%s %s" % (wrapper_xml, job_class)

            additional_jars = ",".join([app_jar] + paths + builtin_paths)

        else:
            wrapper_jar = wrapper_class = wrapper_args = ""
            additional_jars = ",".join(paths)

        # All additional jars are passed with the --jars option
        if additional_jars:
            additional_jars = " --jars " + additional_jars

        # Launch the spark job using spark-submit and deploy_mode = client
        host = master.hostname()
        port = c_helper.get_config_value("Spark", "Master port", self.cluster)
        spark_submit = os.path.join(
            c_helper.get_config_value("Spark", "Spark home", self.cluster),
            "bin/spark-submit")

        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        args = updated_job_configs.get('args', [])
        args = " ".join([su.inject_swift_url_suffix(arg) for arg in args])
        if args:
            args = " " + args

        if wrapper_jar and wrapper_class:
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            cmd = ('%(spark_submit)s%(driver_cp)s'
                   ' --class %(wrapper_class)s%(addnl_jars)s'
                   ' --master spark://%(host)s:%(port)s'
                   ' %(wrapper_jar)s %(wrapper_args)s%(args)s') % (
                       {
                           "spark_submit": spark_submit,
                           "driver_cp": self.get_driver_classpath(),
                           "wrapper_class": wrapper_class,
                           "addnl_jars": additional_jars,
                           "host": host,
                           "port": port,
                           "wrapper_jar": wrapper_jar,
                           "wrapper_args": wrapper_args,
                           "args": args
                       })
        else:
            cmd = ('%(spark_submit)s --class %(job_class)s%(addnl_jars)s'
                   ' --master spark://%(host)s:%(port)s %(app_jar)s%(args)s'
                   ) % ({
                       "spark_submit": spark_submit,
                       "job_class": job_class,
                       "addnl_jars": additional_jars,
                       "host": host,
                       "port": port,
                       "app_jar": app_jar,
                       "args": args
                   })

        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with remote.get_remote(master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!" %
                (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + master.id, edp.JOB_STATUS_RUNNING, {
                'spark-path': wf_dir
            })

        # Hmm, no exception but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(
            _("Spark job execution failed. Exit status = "
              "%(status)s, stdout = %(stdout)s") % {
                  'status': ret,
                  'stdout': stdout
              })
Example No. 14
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)
        # This will be a dictionary of tuples, (native_url, runtime_url)
        # keyed by data_source id
        data_source_urls = {}
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls,
                                                     self.cluster)
        )

        job_execution = conductor.job_execution_update(
            ctx, job_execution,
            {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

        # Now that we've recorded the native urls, we can switch to the
        # runtime urls
        data_source_urls = job_utils.to_url_dict(data_source_urls,
                                                 runtime=True)

        for data_source in additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        # This is needed in case we are working with the Spark plugin
        self.plugin_params['master'] = (
            self.plugin_params['master'] % {'host': self.master.hostname()})

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(self.master, '/tmp/spark-edp',
                                               job, job_execution.id, "700")
        paths, builtin_paths = self._upload_job_files(
            self.master, wf_dir, job, updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) if p.startswith(wf_dir) else p
                 for p in paths]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        cmd = self._build_command(wf_dir, paths, builtin_paths,
                                  updated_job_configs)

        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with remote.get_remote(self.master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod u+rwx,g+rx,o+rx %s" % wf_dir)
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
                % (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id

            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + self.master.id,
                    edp.JOB_STATUS_RUNNING,
                    {'spark-path': wf_dir})

        # Hmm, no exception but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(_("Spark job execution failed. Exit status = "
                           "%(status)s, stdout = %(stdout)s") %
                         {'status': ret, 'stdout': stdout})
Example No. 15
    def run_job(self, job_execution):
        ctx = context.ctx()

        # This will be a dictionary of tuples, (native_url, runtime_url)
        # keyed by data_source id
        data_source_urls = {}

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(
            job_execution, job, data_source_urls, self.cluster)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls,
                                                     self.cluster)
        )

        job_execution = conductor.job_execution_update(
            ctx, job_execution,
            {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

        # Now that we've recorded the native urls, we can switch to the
        # runtime urls
        data_source_urls = job_utils.to_url_dict(data_source_urls,
                                                 runtime=True)

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})
        use_hbase_lib = configs.get('edp.hbase_common_lib', {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith('oozie.'):
                oozie_params[k] = configs[k]

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(
                    self.cluster, data_source_urls[data_source.id])
                break

        external_hdfs_urls = self._resolve_external_hdfs_urls(
            job_execution.job_configs)
        for url in external_hdfs_urls:
            h.configure_cluster_for_hdfs(self.cluster, url)

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs,
            input_source, output_source,
            hdfs_user, data_source_urls)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow,
                                                oozie_params,
                                                use_hbase_lib)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        conductor.job_execution_update(
            context.ctx(), job_execution.id,
            {'info': {'status': edp.JOB_STATUS_READYTORUN},
             'engine_job_id': oozie_job_id})

        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_info(job_execution, oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
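The loop above that pulls out the 'oozie.'-prefixed keys is behaviourally equivalent to a one-line comprehension; those keys configure Oozie itself (for example oozie.use.system.libpath, which also appears in the job parameters of examples 2 and 3) rather than the job code, so they are routed into the job properties. A tiny self-contained illustration with a made-up config dict:

# Equivalent of the extraction loop above, on an illustrative config dict.
configs = {"edp.java.main_class": "org.example.Main",
           "oozie.wf.rerun.failnodes": "true"}
oozie_params = {k: v for k, v in configs.items() if k.startswith('oozie.')}
assert oozie_params == {"oozie.wf.rerun.failnodes": "true"}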
Example No. 16
    def run_job(self, job_execution):
        ctx = context.ctx()

        # This will be a dictionary of tuples, (native_url, runtime_url)
        # keyed by data_source id
        data_source_urls = {}

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(
            job_execution, job, data_source_urls, self.cluster)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls,
                                                     self.cluster)
        )

        job_execution = conductor.job_execution_update(
            ctx, job_execution,
            {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

        # Now that we've recorded the native urls, we can switch to the
        # runtime urls
        data_source_urls = job_utils.to_url_dict(data_source_urls,
                                                 runtime=True)

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})
        use_hbase_lib = configs.get('edp.hbase_common_lib', {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith('oozie.'):
                oozie_params[k] = configs[k]

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(
                    self.cluster, data_source_urls[data_source.id])
                break

        external_hdfs_urls = self._resolve_external_hdfs_urls(
            job_execution.job_configs)
        for url in external_hdfs_urls:
            h.configure_cluster_for_hdfs(self.cluster, url)

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs,
            input_source, output_source,
            hdfs_user, data_source_urls)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow,
                                                oozie_params,
                                                use_hbase_lib)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        conductor.job_execution_update(
            context.ctx(), job_execution.id,
            {'info': {'status': edp.JOB_STATUS_READYTORUN},
             'engine_job_id': oozie_job_id})

        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example No. 17
    def _prepare_run_job(self, job_execution):
        ctx = context.ctx()

        # This will be a dictionary of tuples, (native_url, runtime_url)
        # keyed by data_source id
        data_source_urls = {}

        prepared_job_params = {}

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(
            job_execution, job, data_source_urls, self.cluster)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls,
                                                     self.cluster)
        )

        job_execution = conductor.job_execution_update(
            ctx, job_execution,
            {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

        # Now that we've recorded the native urls, we can switch to the
        # runtime urls
        data_source_urls = job_utils.to_url_dict(data_source_urls,
                                                 runtime=True)
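        # Illustration only (hypothetical values): an entry such as
        #   {'<data-source-id>': ('<native_url>', '<runtime_url>')}
        # collapses to {'<data-source-id>': '<runtime_url>'} at this point.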

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})
        use_hbase_lib = configs.get('edp.hbase_common_lib', {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith('oozie.'):
                oozie_params[k] = configs[k]
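        # Illustration only (hypothetical values): given configs of
        #   {'oozie.libpath': '/user/oozie/share', 'mapred.reduce.tasks': '2'}
        # only the 'oozie.libpath' entry is copied into oozie_params.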

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(
                    self.cluster, data_source_urls[data_source.id])
                break

        external_hdfs_urls = self._resolve_external_hdfs_urls(
            job_execution.job_configs)
        for url in external_hdfs_urls:
            h.configure_cluster_for_hdfs(self.cluster, url)

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs,
            input_source, output_source,
            hdfs_user, data_source_urls)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        prepared_job_params['context'] = ctx
        prepared_job_params['hdfs_user'] = hdfs_user
        prepared_job_params['path_to_workflow'] = path_to_workflow
        prepared_job_params['use_hbase_lib'] = use_hbase_lib
        prepared_job_params['job_execution'] = job_execution
        prepared_job_params['oozie_params'] = oozie_params
        prepared_job_params['wf_dir'] = wf_dir
        prepared_job_params['oozie_server'] = oozie_server

        return prepared_job_params
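The dictionary returned by _prepare_run_job() is consumed by a run_job() on the same engine. A hedged sketch of that consumer, assembled only from calls shown elsewhere in these examples rather than copied from any single source file:

    def run_job(self, job_execution):
        # Prepare the workflow directory and files, then submit via Oozie.
        prepared = self._prepare_run_job(job_execution)
        job_params = self._get_oozie_job_params(
            prepared['hdfs_user'], prepared['path_to_workflow'],
            prepared['oozie_params'], prepared['use_hbase_lib'])
        client = self._get_client()
        xml = x.create_hadoop_xml(job_params)
        oozie_job_id = client.add_job(xml, prepared['job_execution'])
        client.run_job(prepared['job_execution'], oozie_job_id)
        return (oozie_job_id, None, None)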
Exemplo n.º 18
0
    def prepare_cluster(self, data_source, cluster, **kwargs):
        runtime_url = kwargs.pop('runtime_url')
        h.configure_cluster_for_hdfs(cluster, runtime_url)
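A minimal usage sketch for prepare_cluster(); the instance, data source, cluster, and URL below are placeholders, not values taken from the source:

# 'ds_type' stands for an instance of the class that defines
# prepare_cluster(); the runtime URL is hypothetical.
ds_type.prepare_cluster(data_source, cluster,
                        runtime_url='hdfs://namenode:8020/user/demo/input')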
Exemplo n.º 19
0
def run_job(job_execution_id):
    ctx = context.ctx()

    job_execution = conductor.job_execution_get(ctx,
                                                job_execution_id)

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return

    if CONF.use_namespaces and not CONF.use_floating_ips:
        plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
        oozie = plugin.get_oozie_server(cluster)

        info = oozie.remote().get_neutron_info()
        extra = job_execution.extra.copy()
        extra['neutron'] = info

        job_execution = conductor.job_execution_update(ctx,
                                                       job_execution_id,
                                                       {'extra': extra})

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, edp.JOB_TYPE_JAVA):
        input_source = conductor.data_source_get(ctx, job_execution.input_id)
        output_source = conductor.data_source_get(ctx,
                                                  job_execution.output_id)
    else:
        input_source = None
        output_source = None

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    wf_xml = creator.get_workflow_xml(cluster, job_execution,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server,
                                            wf_dir, wf_xml, hdfs_user)

    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    rm_path = plugin.get_resource_manager_uri(cluster)
    nn_path = plugin.get_name_node_uri(cluster)

    client = _create_oozie_client(cluster)
    job_parameters = {"jobTracker": rm_path,
                      "nameNode": nn_path,
                      "user.name": hdfs_user,
                      "oozie.wf.application.path":
                      "%s%s" % (nn_path, path_to_workflow),
                      "oozie.use.system.libpath": "true"}

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(ctx, job_execution,
                                                   {'oozie_job_id':
                                                    oozie_job_id,
                                                    'start_time':
                                                    datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)
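After run_job() stores oozie_job_id on the job execution, a caller can poll Oozie through the same client API. A hedged sketch built from calls already shown in these examples; the helper name is illustrative:

def poll_job_status(job_execution_id):
    # Re-read the execution, rebuild the Oozie client, and ask for status.
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    client = _create_oozie_client(cluster)
    status = client.get_job_status(job_execution, job_execution.oozie_job_id)
    return status['status']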
Exemplo n.º 20
0
    def run_job(self, job_execution):
        ctx = context.ctx()

        data_source_urls = {}

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(job_execution, job, data_source_urls)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = job_utils.resolve_data_source_references(
            job_execution.job_configs, job_execution.id, data_source_urls
        )

        job_execution = conductor.job_execution_update(ctx, job_execution, {"data_source_urls": data_source_urls})

        proxy_configs = updated_job_configs.get("proxy_configs")
        configs = updated_job_configs.get("configs", {})
        use_hbase_lib = configs.get("edp.hbase_common_lib", {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith("oozie."):
                oozie_params[k] = configs[k]

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == "hdfs":
                h.configure_cluster_for_hdfs(self.cluster, data_source_urls[data_source.id])
                break

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs, proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs, input_source, output_source, hdfs_user, data_source_urls
        )

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir, wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow, oozie_params, use_hbase_lib)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params), job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info["status"] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution, oozie_job_id)["status"]
        except Exception:
            status = None
        return (oozie_job_id, status, None)
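The JOB_STATUS_TOBEKILLED check above implies a cancellation counterpart. A hedged sketch of what that might look like; the client's kill_job() call is assumed here and does not appear in any snippet in this document:

    def cancel_job(self, job_execution):
        # engine_job_id is the Oozie id stored on the execution; kill_job()
        # is an assumed client call, after which the final status is read
        # back with the same get_job_status() call shown above.
        engine_id = job_execution.engine_job_id
        if engine_id is None:
            return None
        client = self._get_client()
        client.kill_job(job_execution)  # assumed API, not shown above
        return client.get_job_status(job_execution, engine_id)["status"]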