Example #1
def frombinary(path, ext='bin', conf='conf.json', dtype=None, shape=None, skip=0, index=None, labels=None, engine=None, credentials=None):
    """
    Load series data from flat binary files.

    Parameters
    ----------
    path : string URI or local filesystem path
        Directory to load from, can be a URI string with scheme
        (e.g. 'file://', 's3n://', or 'gs://'), or a single file,
        or a directory, or a directory with a single wildcard character.

    ext : str, optional, default = 'bin'
        File extension specifier.

    conf : str, optional, default = 'conf.json'
        Name of conf file with type and size information.

    dtype : dtype or dtype specifier, optional, default = None
        Numerical type of the binary data; if not provided, will be
        read from the conf file.

    shape : tuple or list, optional, default = None
        Shape of the data if known; otherwise will be read from the conf file.

    skip : int, optional, default = 0
        Number of items in each record to skip.

    index : array, optional, default = None
        Index for records; if not provided, will use (0, 1, ...).

    labels : array, optional, default = None
        Labels for records. If provided, should have shape of shape[:-1].

    engine : object, default = None
        Computational engine (e.g. a SparkContext for Spark)

    credentials : dict, default = None
        Credentials for remote storage (e.g. S3) in the form {access: ***, secret: ***}
    """
    shape, dtype = _binaryconfig(path, conf, dtype, shape, credentials)

    from thunder.readers import normalize_scheme, get_parallel_reader
    path = normalize_scheme(path, ext)

    from numpy import dtype as dtype_func
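    # each record holds shape[-1] values plus any leading items to skip;
    # recordsize is that record's length in bytes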
    nelements = shape[-1] + skip
    recordsize = dtype_func(dtype).itemsize * nelements

    if spark and isinstance(engine, spark):
        lines = engine.binaryRecords(path, recordsize)
        raw = lines.map(lambda x: frombuffer(buffer(x), offset=0, count=nelements, dtype=dtype)[skip:])

        def switch(record):
            ary, idx = record
            return (idx,), ary

        rdd = raw.zipWithIndex().map(switch)

        if shape and len(shape) > 2:
            expand = lambda k: unravel_index(k[0], shape[0:-1])
            rdd = rdd.map(lambda kv: (expand(kv[0]), kv[1]))

        if not index:
            index = arange(shape[-1])

        return fromrdd(rdd, dtype=dtype, shape=shape, index=index)

    else:
        reader = get_parallel_reader(path)(engine, credentials=credentials)
        data = reader.read(path, ext=ext)

        values = []
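        # step through each file's buffer in recordsize chunks, dropping the
        # skipped leading items from every record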
        for record in data:
            buf = record[1]
            offset = 0
            while offset < len(buf):
                v = frombuffer(buffer(buf), offset=offset, count=nelements, dtype=dtype)
                values.append(v[skip:])
                offset += recordsize

        if not len(values) == prod(shape[0:-1]):
            raise ValueError('Unexpected shape, got %g records but expected %g'
                             % (len(values), prod(shape[0:-1])))

        values = asarray(values, dtype=dtype)

        if shape:
            values = values.reshape(shape)

        return fromarray(values, index=index, labels=labels)
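
A minimal local usage sketch for frombinary (no Spark engine). The /tmp paths, the array contents, and the entry point thunder.series.frombinary are illustrative assumptions, not taken from the example above:

import os
import numpy as np
import thunder as td

# write a tiny binary dataset: 4 records of 3 float64 values each
os.makedirs('/tmp/series-demo', exist_ok=True)
np.arange(12, dtype='float64').reshape(4, 3).tofile('/tmp/series-demo/records.bin')

# passing dtype and shape explicitly means no conf.json is needed
series = td.series.frombinary('/tmp/series-demo', ext='bin',
                              dtype='float64', shape=(4, 3))
print(series.shape)  # expected: (4, 3)
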
Example #2
def fromtext(path, ext='txt', dtype='float64', skip=0, shape=None, index=None, labels=None, npartitions=None, engine=None, credentials=None):
    """
    Load series data from text files.

    Assumes data are formatted as rows, where each record is a row
    of numbers separated by spaces, e.g. 'v v v v v'. A fixed number
    of initial items in each row can optionally be skipped.

    Parameters
    ----------
    path : string
        Directory to load from, can be a URI string with scheme
        (e.g. 'file://', 's3n://', or 'gs://'), or a single file,
        or a directory, or a directory with a single wildcard character.

    ext : str, optional, default = 'txt'
        File extension.

    dtype : dtype or dtype specifier, default 'float64'
        Numerical type to use for data after converting from text.

    skip : int, optional, default = 0
        Number of items in each record to skip.

    shape : tuple or list, optional, default = None
        Shape of data if known, will be inferred otherwise.

    index : array, optional, default = None
        Index for records; if not provided, will use (0, 1, ...).

    labels : array, optional, default = None
        Labels for records. If provided, should have length equal to number of rows.

    npartitions : int, default = None
        Number of partitions for parallelization (Spark only)

    engine : object, default = None
        Computational engine (e.g. a SparkContext for Spark)

    credentials : dict, default = None
        Credentials for remote storage (e.g. S3) in the form {access: ***, secret: ***}
    """
    from thunder.readers import normalize_scheme, get_parallel_reader
    path = normalize_scheme(path, ext)

    if spark and isinstance(engine, spark):

        def parse(line, skip):
            vec = [float(x) for x in line.split(' ')]
            return array(vec[skip:], dtype=dtype)

        lines = engine.textFile(path, npartitions)
        data = lines.map(lambda x: parse(x, skip))

        def switch(record):
            ary, idx = record
            return (idx,), ary

        rdd = data.zipWithIndex().map(switch)
        return fromrdd(rdd, dtype=str(dtype), shape=shape, index=index)

    else:
        reader = get_parallel_reader(path)(engine, credentials=credentials)
        data = reader.read(path, ext=ext)

        values = []
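        # decode each file and parse one space-separated record per line,
        # dropping the last split chunk (empty when the file ends with a newline)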
        for kv in data:
            for line in str(kv[1].decode('utf-8')).split('\n')[:-1]:
                values.append(fromstring(line, sep=' '))
        values = asarray(values)

        if skip > 0:
            values = values[:, skip:]

        if shape:
            values = values.reshape(shape)

        return fromarray(values, index=index, labels=labels)
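
A matching local sketch for fromtext (hypothetical /tmp path and file contents, assuming the loader is reachable as thunder.series.fromtext):

import os
import thunder as td

os.makedirs('/tmp/text-demo', exist_ok=True)
with open('/tmp/text-demo/records.txt', 'w') as f:
    f.write('1 2 3\n4 5 6\n')  # two records of three values each

series = td.series.fromtext('/tmp/text-demo', ext='txt')
print(series.shape)  # expected: (2, 3)
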
Example #3
def frompath(path,
             accessor=None,
             ext=None,
             start=None,
             stop=None,
             recursive=False,
             npartitions=None,
             dims=None,
             dtype=None,
             labels=None,
             recount=False,
             engine=None,
             credentials=None):
    """
    Load images from a path using the given accessor.

    Supports both local and remote filesystems.

    Parameters
    ----------
    accessor : function
        Apply to each item after loading to yield an image.

    ext : str, optional, default=None
        File extension.

    npartitions : int, optional, default=None
        Number of partitions for computational engine,
        if None will use default for engine.

    dims : tuple, optional, default=None
        Dimensions of images.

    dtype : str, optional, default=None
        Numerical type of images.

    labels : array, optional, default = None
        Labels for records. If provided, should be one-dimensional.

    start, stop : nonnegative int, optional, default=None
        Indices of files to load, interpreted using Python slicing conventions.

    recursive : boolean, optional, default=False
        If true, will recursively descend directories from path, loading all files
        with an extension matching 'ext'.

    recount : boolean, optional, default=False
        Force subsequent record counting.
    """
    from thunder.readers import get_parallel_reader
    reader = get_parallel_reader(path)(engine, credentials=credentials)
    data = reader.read(path,
                       ext=ext,
                       start=start,
                       stop=stop,
                       recursive=recursive,
                       npartitions=npartitions)

    if spark and isinstance(engine, spark):
        if accessor:
            data = data.flatMap(accessor)
        if recount:
            nrecords = None

            def switch(record):
                ary, idx = record
                return (idx, ), ary

            data = data.values().zipWithIndex().map(switch)
        else:
            nrecords = reader.nfiles
        return fromrdd(data,
                       nrecords=nrecords,
                       dims=dims,
                       dtype=dtype,
                       labels=labels,
                       ordered=True)

    else:
        if accessor:
            data = [accessor(d) for d in data]
        flattened = list(itertools.chain(*data))
        values = [kv[1] for kv in flattened]
        return fromarray(values, labels=labels)
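
frompath itself is usually reached through thunder's public image loaders (frompng, fromtif, ...), which supply a format-specific accessor. A hedged sketch with a hypothetical directory of PNG files, assuming thunder and its image-reading dependencies are installed:

import thunder as td

# engine=None exercises the local (non-Spark) branch above; passing a
# SparkContext as engine would take the RDD branch instead
images = td.images.frompng('/tmp/png-demo', recursive=True)
print(images.shape)
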
Example #4
File: readers.py  Project: d-v-b/thunder
def frompath(path, accessor=None, ext=None, start=None, stop=None, recursive=False, npartitions=None, dims=None, dtype=None, labels=None, recount=False, engine=None, credentials=None):
    """
    Load images from a path using the given accessor.

    Supports both local and remote filesystems.

    Parameters
    ----------
    accessor : function
        Apply to each item after loading to yield an image.

    ext : str, optional, default=None
        File extension.

    npartitions : int, optional, default=None
        Number of partitions for computational engine,
        if None will use default for engine.

    dims : tuple, optional, default=None
        Dimensions of images.

    dtype : str, optional, default=None
        Numerical type of images.

    labels : array, optional, default = None
        Labels for records. If provided, should be one-dimensional.

    start, stop : nonnegative int, optional, default=None
        Indices of files to load, interpreted using Python slicing conventions.

    recursive : boolean, optional, default=False
        If true, will recursively descend directories from path, loading all files
        with an extension matching 'ext'.

    recount : boolean, optional, default=False
        Force subsequent record counting.
    """
    from thunder.readers import get_parallel_reader
    reader = get_parallel_reader(path)(engine, credentials=credentials)
    data = reader.read(path, ext=ext, start=start, stop=stop,
                       recursive=recursive, npartitions=npartitions)

    if spark and isinstance(engine, spark):
        if accessor:
            data = data.flatMap(accessor)
        if recount:
            nrecords = None

            def switch(record):
                ary, idx = record
                return (idx,), ary

            data = data.values().zipWithIndex().map(switch)
        else:
            nrecords = reader.nfiles
        return fromrdd(data, nrecords=nrecords, dims=dims, dtype=dtype, labels=labels, ordered=True)

    else:
        if accessor:
            data = [accessor(d) for d in data]
        flattened = list(itertools.chain(*data))
        values = [kv[1] for kv in flattened]
        return fromarray(values, labels=labels)
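
The Spark branch of frompath is selected by passing a SparkContext as the engine, which is what the commented-out frompng call further below does. A sketch, assuming pyspark is installed and using a hypothetical image path:

from pyspark.sql import SparkSession
import thunder as td

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# same loader, now producing an RDD-backed images object with 8 partitions
images = td.images.frompng('/tmp/png-demo', npartitions=8, engine=sc)
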
# imports needed by this snippet; `spark` is assumed to be an existing
# SparkSession (e.g. from SparkSession.builder.getOrCreate())
import paramiko
from pyspark.sql import SQLContext

IMG_PATH = '/images/b2'
MODEL_PATH = '/k_model'
FEATURE_PATH = '/xtract_feature'
SERVER = "127.0.0.1"
U_NAME = "***********"
PASSWORD = "******"

LOCAL_PATH = "/home/amit/A1/b2"
sc = spark.sparkContext
sqlContext = SQLContext(sc)
#data=images.frompng('/home/amit/A1',npartitions=8, engine=sc)


from thunder.readers import get_parallel_reader, FileNotFoundError
reader = get_parallel_reader(IMG_PATH)(sc)
#data = reader.read(IMG_PATH, recursive=True, npartitions=8)
from scipy.misc import imread

ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

ssh.connect(SERVER, username=U_NAME, password=PASSWORD)
ftp = ssh.open_sftp()

def readlocal(path, offset=None, size=-1):
    """
    Wrapper around open(path, 'rb') that returns the contents of the file as a string.
    Will rethrow FileNotFoundError if it receives an IOError.
    """
    #print(path)