Example #1
    def _deserialize_output(self, task):
        """
        Deserialize the output from a task.

        Parameters
        ----------
        task
            Task definition of interest.

        Returns
        -------
        The output of the run-time task associated with the task definition.
        """
        filepath = self._task_output_paths[task]

        non_hdfs_file_path = filepath

        # The unpickler has no support for passing in an additional
        # HADOOP_CONF_DIR, so we download the HDFS folder first before
        # invoking the unpickler.
        if _file_util.is_hdfs_path(filepath):
            non_hdfs_file_path = _make_temp_directory("job_output_")
            _file_util.download_from_hdfs(filepath, non_hdfs_file_path,
                hadoop_conf_dir=self.environment.hadoop_conf_dir, is_dir=True)

        unpickler = gl_pickle.GLUnpickler(non_hdfs_file_path)

        # We cannot delete this temporary file path because SFrame lazily
        # loads its content from disk. The temporary folder will be removed
        # eventually when the Python session goes away.

        return unpickler.load()
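
The same download-then-unpickle pattern can be exercised on a single path. The sketch below is hypothetical: it only reuses the `_file_util`, `gl_pickle`, and `_make_temp_directory` helpers exactly as they appear above, and the `load_task_output` name and `hadoop_conf_dir` argument are illustrative.

    def load_task_output(filepath, hadoop_conf_dir=None):
        # Hypothetical helper mirroring _deserialize_output for one path:
        # stage HDFS output locally (the unpickler cannot read HDFS directly),
        # then unpickle it.
        local_path = filepath
        if _file_util.is_hdfs_path(filepath):
            local_path = _make_temp_directory("job_output_")
            _file_util.download_from_hdfs(filepath, local_path,
                hadoop_conf_dir=hadoop_conf_dir, is_dir=True)
        # Keep the temp folder around: SFrame loads its content lazily, and
        # the folder is only cleaned up when the Python session exits.
        return gl_pickle.GLUnpickler(local_path).load()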
    def prepare_job_files(environment, job):
        '''
        Prepare all files needed to run a job on an EC2 cluster
        '''

        # Update job working directory
        s3_job_path_root = Ec2ExecutionEnvironment.create_job_home_dir(
            environment, job.name)
        _job.Ec2Job._update_exec_dir(job, s3_job_path_root)

        __LOGGER__.info('Job working directory: %s' % s3_job_path_root)

        # Prepare all files locally and then upload to S3
        temp_local_folder = _make_temp_directory(prefix='ec2_job_')
        try:
            ExecutionEnvironment.prepare_job_exec_dir(job, temp_local_folder)

            _file_util.upload_to_s3(
                temp_local_folder,
                s3_job_path_root,
                is_dir=True,
                aws_credentials=environment.ec2_config.get_credentials(),
                silent=True)
        finally:
            _shutil.rmtree(temp_local_folder)
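
The stage-locally, upload, always-clean-up shape of `prepare_job_files` is a general pattern. A minimal sketch using only the standard library; the `write_files` and `upload` callables are made up for illustration:

    import shutil
    import tempfile

    def stage_and_upload(write_files, upload):
        # write_files(path) populates a scratch directory; upload(path) ships
        # it to its destination (S3 in the example above).
        temp_dir = tempfile.mkdtemp(prefix='ec2_job_')
        try:
            write_files(temp_dir)
            upload(temp_dir)
        finally:
            # Remove the scratch directory whether or not the upload succeeded.
            shutil.rmtree(temp_dir)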
Example #3
    def _get_dependent_file_path(cls):
        '''We put all dependent_files under a fixed path so that newer files
        may overwrite older ones, guaranteeing that the modules can be
        reloaded correctly.
        '''

        if PredictiveObject._depedent_file_path is None:
            PredictiveObject._depedent_file_path = _make_temp_directory(prefix='predictive_object_files')
            sys.path.insert(1, PredictiveObject._depedent_file_path)

        return PredictiveObject._depedent_file_path
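
A generic version of this lazily created, process-wide dependency directory can be written with the standard library alone; the `DependencyCache` name below is made up for illustration.

    import sys
    import tempfile

    class DependencyCache(object):
        _path = None  # created once per process

        @classmethod
        def get_path(cls):
            # Create the directory on first use and put it near the front of
            # sys.path so newly written modules shadow stale copies on reload.
            if cls._path is None:
                cls._path = tempfile.mkdtemp(prefix='predictive_object_files')
                sys.path.insert(1, cls._path)
            return cls._path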
    def submit_yarn_application(environment,
                                job_working_dir,
                                silent=False,
                                native_conda=False):
        '''
        Submit a YARN application
        '''
        # Create temporary directory to write tmp files in for the job
        temp_local_directory = _make_temp_directory(prefix='hadoop_job_')

        req_file_path = None
        if hasattr(environment, 'additional_packages'):
            req_file_path = ExecutionEnvironment.write_required_packages(
                temp_local_directory, environment.additional_packages)

        # build actual hadoop command
        hadoop_cmd = HadoopExecutionEnvironment._build_hadoop_cmd(
            environment,
            job_working_dir,
            req_file_path,
            native_conda=native_conda)

        # call hadoop command
        if not silent:
            __LOGGER__.info(
                "Submitting job to Hadoop cluster using command= \n%s" %
                hadoop_cmd)

        proc = _subprocess.Popen(hadoop_cmd,
                                 shell=True,
                                 stderr=_subprocess.STDOUT,
                                 stdout=_subprocess.PIPE)

        app_info = HadoopExecutionEnvironment._parse_hadoop_cmd_output(
            proc, silent)

        if 'app_id' not in app_info or not app_info['app_id']:
            __LOGGER__.info(
                "Submitting job to Hadoop cluster using command= \n%s" %
                hadoop_cmd)
            raise RuntimeError(
                "Error submitting application or determining application id. Please confirm that you"
                " can correctly access the Hadoop environment specified (try running the above"
                " command from a terminal to see more diagnostic output).")

        else:
            _shutil.rmtree(temp_local_directory)

        return app_info
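
The submit-and-parse flow can be imitated with the standard library; below is a rough sketch in which `run_and_find_app_id` and its regex-based parsing stand in for `_parse_hadoop_cmd_output`, which is not shown here.

    import re
    import subprocess

    def run_and_find_app_id(cmd):
        # Run the command, merging stderr into stdout as the code above does.
        proc = subprocess.Popen(cmd, shell=True,
                                stderr=subprocess.STDOUT,
                                stdout=subprocess.PIPE)
        output = proc.communicate()[0].decode('utf-8', 'replace')
        # YARN clients typically report ids of the form application_<ts>_<n>.
        match = re.search(r'application_\d+_\d+', output)
        if match is None:
            raise RuntimeError('Could not determine application id:\n' + output)
        return match.group(0)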
    def prepare_job_files(environment, job):
        '''
        Upload all job-related information to HDFS so that the job can be
        executed remotely
        '''
        exec_dir = HadoopExecutionEnvironment.create_job_home_dir(environment, job.name)
        _job.HadoopJob._update_exec_dir(job, exec_dir)

        logging.info("Job working directory: %s" % job._exec_dir)

        temp_job_folder = _make_temp_directory(prefix='hadoop_job_')
        try:
            ExecutionEnvironment.prepare_job_exec_dir(job, temp_job_folder)

            # Move everything to HDFS
            _file_util.upload_folder_to_hdfs(
                temp_job_folder,
                exec_dir,
                hadoop_conf_dir=environment.hadoop_conf_dir)

        finally:
            _shutil.rmtree(temp_job_folder)
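
Taken together, a hypothetical driver could stage the job files and then submit the application. The sketch below only chains the functions shown above; treating them as static methods of `HadoopExecutionEnvironment` and passing `job._exec_dir` as the working directory are assumptions.

    def run_hadoop_job(environment, job):
        # Stage the job directory on HDFS, then launch it through the YARN
        # client and return whatever application info was parsed.
        HadoopExecutionEnvironment.prepare_job_files(environment, job)
        return HadoopExecutionEnvironment.submit_yarn_application(
            environment, job._exec_dir)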