Python download_from_hdfs示例，graphlab.util.file_util.download_from_hdfs Python示例

示例#1

0

显示文件

文件： _job.py 项目： vandosant/flask-spike

    def _deserialize_output(self, task):
        """
        Load the pickled output that was stored for a task.

        Parameters
        ----------
        task
            Task definition of interest; used as the key into
            ``self._task_output_paths``.

        Returns
        -------
        The object produced by ``gl_pickle.GLUnpickler(...).load()`` for the
        task's output location.
        """
        output_location = self._task_output_paths[task]

        if _file_util.is_hdfs_path(output_location):
            # The unpickler cannot be handed an extra HADOOP_CONF_DIR, so the
            # HDFS folder is copied to a local temporary directory first.
            local_location = _make_temp_directory("job_output_")
            _file_util.download_from_hdfs(
                output_location,
                local_location,
                hadoop_conf_dir=self.environment.hadoop_conf_dir,
                is_dir=True)
        else:
            local_location = output_location

        # The temporary directory is deliberately not deleted here: SFrame
        # loads its content lazily from disk, so the files must outlive this
        # call. The folder is expected to be removed when the Python session
        # goes away.
        return gl_pickle.GLUnpickler(local_location).load()

示例#2

0

显示文件

文件： _job.py 项目： divya2661/food-recommendation-engine-

    def _deserialize_output(self, task):
        """
        Load the pickled output that was stored for a task.

        Parameters
        ----------
        task
            Task definition of interest; used as the key into
            ``self._task_output_paths``.

        Returns
        -------
        The object produced by ``gl_pickle.GLUnpickler(...).load()`` for the
        task's output location.
        """
        output_location = self._task_output_paths[task]

        if _file_util.is_hdfs_path(output_location):
            # The unpickler cannot be handed an extra HADOOP_CONF_DIR, so the
            # HDFS folder is copied to a local temporary directory first.
            local_location = _make_temp_directory("job_output_")
            _file_util.download_from_hdfs(
                output_location,
                local_location,
                hadoop_conf_dir=self.environment.hadoop_conf_dir,
                is_dir=True)
        else:
            local_location = output_location

        # The temporary directory is deliberately not deleted here: SFrame
        # loads its content lazily from disk, so the files must outlive this
        # call. The folder is expected to be removed when the Python session
        # goes away.
        return gl_pickle.GLUnpickler(local_location).load()

示例#3

0

显示文件

文件： _dml_cluster.py 项目： Mawul4j/Machine-Learning-Course

    def _read_commander_init_status_file(self):
        """
        Fetch the commander's init status file and derive the commander URL.

        The file located by ``self._get_commander_file_path()`` is downloaded
        to a temporary local file when it lives on HDFS or S3, then parsed as
        JSON; its ``port`` and ``host_name`` entries are read.

        Returns
        -------
        str or None
            ``'http://<host_name>:<port>'`` when the file could be read and
            ``port`` is greater than zero; ``None`` when the port is not
            positive or when any error occurs while downloading or parsing.
        """
        commander_file_path = self._get_commander_file_path()

        local_file_name = _tempfile.mktemp(prefix='dml_file_')
        try:
            if _file_util.is_hdfs_path(commander_file_path):
                _file_util.download_from_hdfs(
                    commander_file_path,
                    local_file_name,
                    hadoop_conf_dir = self.environment.hadoop_conf_dir)
            elif _file_util.is_s3_path(commander_file_path):
                _file_util.download_from_s3(
                    commander_file_path,
                    local_file_name,
                    aws_credentials = self.environment.get_credentials(),
                    silent = True)

            with open(local_file_name, 'r') as f:
                status_json = _json.load(f)
            port = status_json['port']
            host_name = status_json['host_name']

            if port > 0:
                return 'http://%s:%s' % (host_name, port)
            return None
        except Exception:
            # Best effort: the caller retries and fails after a few attempts.
            # Only Exception is caught so that KeyboardInterrupt / SystemExit
            # are no longer silently swallowed.
            return None
        finally:
            # Always discard the temporary copy, whether or not it was parsed.
            if _os.path.exists(local_file_name):
                _os.remove(local_file_name)

示例#4

0

显示文件

文件： _policy.py 项目： Mawul4j/Machine-Learning-Course

    def _load_remote(cls, path, schema_version, aws_credentials={}):
        """
        Download a remotely stored object to a temporary location and load it.

        Parameters
        ----------
        path
            S3 or HDFS location to download from (as a directory).
        schema_version
            Accepted for interface compatibility; not used in this method.
        aws_credentials
            Credentials forwarded to ``download_from_s3`` for S3 paths.

        Returns
        -------
        The result of ``cls._load_local`` on the downloaded copy.

        Raises
        ------
        RuntimeError
            If ``path`` is neither an S3 nor an HDFS path. The original code
            used ``assert '<message>'``, which is always true and therefore
            never rejected anything.
        """
        from_s3 = _file_util.is_s3_path(path)
        if not from_s3 and not _file_util.is_hdfs_path(path):
            raise RuntimeError(
                'Only support S3 and HDFS path for Predictive Object saving location!')

        temp_dir = _gl.util._make_temp_filename(prefix='predictive_policy_')

        if from_s3:
            _file_util.download_from_s3(path, temp_dir, is_dir=True,
                              aws_credentials=aws_credentials, silent=True)
        else:
            _file_util.download_from_hdfs(path, temp_dir, is_dir=True)

        return cls._load_local(temp_dir)

示例#5

0

显示文件

文件： _predictive_object.py 项目： Mawul4j/Machine-Learning-Course

    def _load_remote(cls, path, schema_version, aws_credentials={}):
        """
        Download a remotely stored object to a temporary directory and load it.

        Parameters
        ----------
        path
            S3 or HDFS location to download from.
        schema_version
            The remote object is downloaded as a directory when this is
            greater than 2, otherwise as a single file.
        aws_credentials
            Credentials forwarded to ``download_from_s3`` for S3 paths.

        Returns
        -------
        The result of ``cls._load_local`` on the temporary directory.

        Raises
        ------
        RuntimeError
            If ``path`` is neither an S3 nor an HDFS path. The original code
            used ``assert '<message>'``, which is always true and therefore
            never rejected anything.
        """
        from_s3 = fu.is_s3_path(path)
        # Validate before creating the temporary directory so that a rejected
        # path does not leave an empty directory behind.
        if not from_s3 and not fu.is_hdfs_path(path):
            raise RuntimeError(
                'Only support S3 and HDFS path for Predictive Object saving location!')

        # NOTE(review): `_tempfie` looks like a misspelling of `_tempfile`; it
        # is kept as-is because the module's import alias is not visible
        # here -- confirm against the file's imports.
        temp_dir = _tempfie.mkdtemp(prefix='predictive_object_')

        as_directory = schema_version > 2
        if from_s3:
            fu.download_from_s3(path, temp_dir, is_dir=as_directory,
                              aws_credentials=aws_credentials)
        else:
            fu.download_from_hdfs(path, temp_dir, is_dir=as_directory)

        return cls._load_local(temp_dir)

示例#6

0

显示文件

文件： _job.py 项目： vandosant/flask-spike

    def _load_file_and_parse(self, file_name, parser_func, silent=False, test_url=True):
        '''
        Fetch a file (local, S3 or HDFS) and hand it to ``parser_func``.

        Remote files are first copied to a temporary local file, which is
        removed again before returning. Local files are parsed in place and
        never deleted. Used for parsing state and progress files.

        Parameters
        ----------
        file_name
            Location of the file to read.
        parser_func
            Callable taking a local file path and returning the parsed result.
        silent
            When true, suppresses the "not available" and download-failure
            log messages (and is forwarded to the S3 download).
        test_url
            When true, ``self._test_url(file_name)`` is consulted first and
            the method gives up early if it reports the file as unavailable.

        Returns
        -------
        The value returned by ``parser_func``, or None if the file is not
        available or any exception occurs while downloading or parsing.
        '''
        is_remote = not _file_util.is_local_path(file_name)
        if is_remote:
            parse_target = _tempfile.mktemp(prefix='job-status-')
        else:
            parse_target = file_name

        try:
            # Step 1: make sure the content is available at `parse_target`.
            try:
                reachable = (not test_url) or self._test_url(file_name)
                if not reachable:
                    if not silent:
                        __LOGGER__.info("File %s is not available yet." % file_name)
                    return None

                if _file_util.is_hdfs_path(file_name):
                    _file_util.download_from_hdfs(
                        hdfs_path=file_name,
                        local_path=parse_target,
                        hadoop_conf_dir=self.environment.hadoop_conf_dir)
                elif _file_util.is_s3_path(file_name):
                    credentials = self.environment.ec2_config.get_credentials()
                    _file_util.download_from_s3(
                        s3_path=file_name,
                        local_path=parse_target,
                        is_dir=False,
                        aws_credentials=credentials,
                        silent=silent)
            except Exception as download_error:
                # A missing status file is expected while the job is still
                # being prepared, so this is reported but not raised.
                if not silent:
                    __LOGGER__.warning("Exception encountered when trying to download file from %s, error: %s" % (file_name, download_error))
                return None

            # Step 2: parse the local copy.
            try:
                parsed = parser_func(parse_target)
            except Exception as parse_error:
                __LOGGER__.info("Exception when parsing file %s. Error: %s" % (file_name, parse_error))
                parsed = None
            return parsed
        finally:
            # Only the temporary copy of a remote file is cleaned up.
            if is_remote and _os.path.exists(parse_target):
                _os.remove(parse_target)

示例#7

0

显示文件

文件： _job.py 项目： divya2661/food-recommendation-engine-

    def _load_file_and_parse(self, file_name, parser_func, silent=False, test_url=True):
        '''
        Fetch a file (local, S3 or HDFS) and hand it to ``parser_func``.

        Remote files are first copied to a temporary local file, which is
        removed again before returning. Local files are parsed in place and
        never deleted. Used for parsing state and progress files.

        Parameters
        ----------
        file_name
            Location of the file to read.
        parser_func
            Callable taking a local file path and returning the parsed result.
        silent
            When true, suppresses the "not available" and download-failure
            log messages (and is forwarded to the S3 download).
        test_url
            When true, ``self._test_url(file_name)`` is consulted first and
            the method gives up early if it reports the file as unavailable.

        Returns
        -------
        The value returned by ``parser_func``, or None if the file is not
        available or any exception occurs while downloading or parsing.
        '''
        is_remote = not _file_util.is_local_path(file_name)
        if is_remote:
            parse_target = _tempfile.mktemp(prefix='job-status-')
        else:
            parse_target = file_name

        try:
            # Step 1: make sure the content is available at `parse_target`.
            try:
                reachable = (not test_url) or self._test_url(file_name)
                if not reachable:
                    if not silent:
                        __LOGGER__.info("File %s is not available yet." % file_name)
                    return None

                if _file_util.is_hdfs_path(file_name):
                    _file_util.download_from_hdfs(
                        hdfs_path=file_name,
                        local_path=parse_target,
                        hadoop_conf_dir=self.environment.hadoop_conf_dir)
                elif _file_util.is_s3_path(file_name):
                    credentials = self.environment.ec2_config.get_credentials()
                    _file_util.download_from_s3(
                        s3_path=file_name,
                        local_path=parse_target,
                        is_dir=False,
                        aws_credentials=credentials,
                        silent=silent)
            except Exception as download_error:
                # A missing status file is expected while the job is still
                # being prepared, so this is reported but not raised.
                if not silent:
                    __LOGGER__.warning("Exception encountered when trying to download file from %s, error: %s" % (file_name, download_error))
                return None

            # Step 2: parse the local copy.
            try:
                parsed = parser_func(parse_target)
            except Exception as parse_error:
                __LOGGER__.info("Exception when parsing file %s. Error: %s" % (file_name, parse_error))
                parsed = None
            return parsed
        finally:
            # Only the temporary copy of a remote file is cleaned up.
            if is_remote and _os.path.exists(parse_target):
                _os.remove(parse_target)

示例#8

0

显示文件

文件： predictive_service.py 项目： Mawul4j/Machine-Learning-Course

def _copy_predictive_object_files(source_path, target_path, is_dir, src_credentials, tgt_credentials):
    '''
    Copy either a file or a folder from a source location to a target location.

    Strategy, in order:
      * S3 -> S3: credentials are checked, then an intra-S3 copy is done.
      * local source: copied directly with ``copy_from_local``.
      * anything else: the source is downloaded into a temporary directory
        and then uploaded (or copied) to the target; the temporary directory
        is always removed afterwards.

    Parameters
    ----------
    source_path
        Location to copy from (local, S3 or HDFS).
    target_path
        Location to copy to. An existing local target is removed first.
    is_dir
        Whether the object being copied is a directory.
    src_credentials
        AWS credentials used when downloading from S3.
    tgt_credentials
        AWS credentials used when writing to S3.

    Raises
    ------
    RuntimeError
        If a non-local source is neither an S3 nor an HDFS path.
    '''
    # Clean up an existing local target. rmtree only works on directories,
    # so a plain file (or a symlink) has to be removed with os.remove;
    # previously an existing file target made rmtree raise.
    if _file_util.is_local_path(target_path) and _os.path.exists(target_path):
        if _os.path.isdir(target_path) and not _os.path.islink(target_path):
            _shutil.rmtree(target_path)
        else:
            _os.remove(target_path)

    if _file_util.is_s3_path(source_path) and _file_util.is_s3_path(target_path):

        # compare credentials
        _check_aws_credentials(src_credentials, tgt_credentials, source_path)

        # intra s3 copy model
        _file_util.intra_s3_copy_model(source_path, target_path, is_dir, tgt_credentials)
    elif _file_util.is_local_path(source_path):

        _file_util.copy_from_local(source_path, target_path, is_dir = is_dir)

    else:
        tmp_dir = _tempfile.mkdtemp(prefix = 'copy_predictive_object')
        try:
            # download to local first
            local_path = _os.path.join(tmp_dir, 'temp_po_file')
            if _file_util.is_s3_path(source_path):
                _file_util.download_from_s3(
                    source_path,
                    local_path,
                    is_dir=is_dir,
                    aws_credentials=src_credentials,
                    silent=False)
            elif _file_util.is_hdfs_path(source_path):
                # NOTE(review): is_dir is hard-coded to False here while the
                # S3 branch forwards the caller's is_dir -- confirm this is
                # intentional for download_from_hdfs.
                _file_util.download_from_hdfs(source_path, local_path, is_dir = False)
            else:
                raise RuntimeError('Unsupported file system type: %s' % source_path)

            # upload from local to remote
            if _file_util.is_s3_path(target_path):
                _file_util.upload_to_s3(local_path, target_path, is_dir=is_dir,
                    aws_credentials=tgt_credentials, silent=False)
            elif _file_util.is_hdfs_path(target_path):
                _file_util.hdfs_mkdir(target_path)
                _file_util.upload_to_hdfs(local_path, target_path, force=True, silent=False)
            else:
                _file_util.upload_to_local(local_path, target_path, is_dir=is_dir, silent=False)

        finally:
            # The staging directory is removed on success and on failure.
            _shutil.rmtree(tmp_dir)

示例#9

0

显示文件

文件： hadoop_cluster.py 项目： Mawul4j/Machine-Learning-Course

    def _read_cluster_state(self):
        """
        Download the cluster's INI file from HDFS and parse it.

        The remote file is ``self.dato_dist_path`` concatenated with
        ``HadoopCluster._DIST_INI``. It is copied to a temporary local file,
        read with ``ConfigParser`` and the temporary file is removed again.

        Returns
        -------
        The ``ConfigParser`` instance after reading the downloaded file.

        Raises
        ------
        ValueError
            If ``hdfs_test_url`` reports that the remote INI file is not
            reachable.
        """
        scratch_file = _tempfile.mktemp(prefix='hadoop-conf-')
        try:
            remote_ini = "%s%s" % (self.dato_dist_path, HadoopCluster._DIST_INI)

            ini_reachable = _file_util.hdfs_test_url(
                remote_ini, hadoop_conf_dir=self.hadoop_conf_dir)
            if not ini_reachable:
                raise ValueError('Path "%s" does not seem like a valid Dato Distributed '
                                 'installation.' % self.dato_dist_path)

            _file_util.download_from_hdfs(
                hdfs_path=remote_ini,
                local_path=scratch_file,
                hadoop_conf_dir=self.hadoop_conf_dir)

            parser = _ConfigParser.ConfigParser()
            parser.read(scratch_file)
            return parser
        finally:
            # Remove the local copy whether or not parsing succeeded.
            if _os.path.exists(scratch_file):
                _os.remove(scratch_file)

示例#10

0

显示文件

文件： _job.py 项目： vandosant/flask-spike

    def _download_remote_folder_to_local(self, remote_path, silent=False):
        '''
        Copy every file under a remote (HDFS or S3) folder into a fresh local
        temporary folder.

        The caller owns the returned folder and is responsible for deleting
        it once done. If the download fails, the temporary folder is removed
        here before the error is re-raised.

        Parameters
        ----------
        remote_path
            HDFS or S3 folder to download.
        silent
            Forwarded to the S3 download.

        Returns
        -------
        Path of the local temporary folder holding the downloaded files.

        Raises
        ------
        RuntimeError
            If ``remote_path`` is neither an HDFS nor an S3 path.
        '''
        staging_dir = _tempfile.mkdtemp(prefix='job-results')

        try:
            if _file_util.is_hdfs_path(remote_path):
                _file_util.download_from_hdfs(
                    hdfs_path=remote_path,
                    local_path=staging_dir,
                    is_dir=True,
                    hadoop_conf_dir=self.environment.hadoop_conf_dir)
            elif _file_util.is_s3_path(remote_path):
                credentials = self.environment.ec2_config.get_credentials()
                _file_util.download_from_s3(
                    s3_path=remote_path,
                    local_path=staging_dir,
                    is_dir=True,
                    aws_credentials=credentials,
                    silent=silent)
            else:
                raise RuntimeError("'%s' is not a supported remote path. Only S3 and HDFS"
                                    " remote path are supported" % remote_path)
        except:
            # Any failure (including interrupts) must not leave a half-filled
            # folder behind; clean up, then let the original error propagate.
            if _os.path.isdir(staging_dir):
                _shutil.rmtree(staging_dir)
            raise

        return staging_dir

示例#11

0

显示文件

文件： _job.py 项目： divya2661/food-recommendation-engine-

    def _download_remote_folder_to_local(self, remote_path, silent=False):
        '''
        Copy every file under a remote (HDFS or S3) folder into a fresh local
        temporary folder.

        The caller owns the returned folder and is responsible for deleting
        it once done. If the download fails, the temporary folder is removed
        here before the error is re-raised.

        Parameters
        ----------
        remote_path
            HDFS or S3 folder to download.
        silent
            Forwarded to the S3 download.

        Returns
        -------
        Path of the local temporary folder holding the downloaded files.

        Raises
        ------
        RuntimeError
            If ``remote_path`` is neither an HDFS nor an S3 path.
        '''
        staging_dir = _tempfile.mkdtemp(prefix='job-results')

        try:
            if _file_util.is_hdfs_path(remote_path):
                _file_util.download_from_hdfs(
                    hdfs_path=remote_path,
                    local_path=staging_dir,
                    is_dir=True,
                    hadoop_conf_dir=self.environment.hadoop_conf_dir)
            elif _file_util.is_s3_path(remote_path):
                credentials = self.environment.ec2_config.get_credentials()
                _file_util.download_from_s3(
                    s3_path=remote_path,
                    local_path=staging_dir,
                    is_dir=True,
                    aws_credentials=credentials,
                    silent=silent)
            else:
                raise RuntimeError("'%s' is not a supported remote path. Only S3 and HDFS"
                                    " remote path are supported" % remote_path)
        except:
            # Any failure (including interrupts) must not leave a half-filled
            # folder behind; clean up, then let the original error propagate.
            if _os.path.isdir(staging_dir):
                _shutil.rmtree(staging_dir)
            raise

        return staging_dir

示例#12

0

显示文件

    def _read_cluster_state(self):
        """
        Download the cluster's INI file from HDFS and parse it.

        The remote file is ``self.turi_dist_path`` concatenated with
        ``HadoopCluster._DIST_INI``. It is copied to a temporary local file,
        read with ``ConfigParser`` and the temporary file is removed again.

        Returns
        -------
        The ``ConfigParser`` instance after reading the downloaded file.

        Raises
        ------
        ValueError
            If ``hdfs_test_url`` reports that the remote INI file is not
            reachable.
        """
        scratch_file = _tempfile.mktemp(prefix='hadoop-conf-')
        try:
            remote_ini = "%s%s" % (self.turi_dist_path, HadoopCluster._DIST_INI)

            ini_reachable = _file_util.hdfs_test_url(
                remote_ini, hadoop_conf_dir=self.hadoop_conf_dir)
            if not ini_reachable:
                raise ValueError(
                    'Path "%s" does not seem like a valid Turi Distributed '
                    'installation.' % self.turi_dist_path)

            _file_util.download_from_hdfs(
                hdfs_path=remote_ini,
                local_path=scratch_file,
                hadoop_conf_dir=self.hadoop_conf_dir)

            parser = _ConfigParser.ConfigParser()
            parser.read(scratch_file)
            return parser
        finally:
            # Remove the local copy whether or not parsing succeeded.
            if _os.path.exists(scratch_file):
                _os.remove(scratch_file)