Example #1
    def _make_files(self, urlpath, **kwargs):
        from dask.bytes import open_files

        self._ensure_cache_dir()
        subdir = self._hash(urlpath)
        depth = self._spec['depth']
        files_in = []
        for i in range(1, depth + 1):
            files_in.extend(open_files('/'.join([urlpath] + ['*'] * i)))
        files_out = [
            open_files([self._path(f.path, subdir)], 'wb',
                       **self._storage_options)[0] for f in files_in
        ]
        files_in2, files_out2 = [], []
        paths = set(os.path.dirname(f.path) for f in files_in)
        for fin, fout in zip(files_in, files_out):
            if fin.path in paths:
                try:
                    os.makedirs(fout.path)
                except Exception:
                    pass
            else:
                files_in2.append(fin)
                files_out2.append(fout)
        return files_in2, files_out2
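
The depth loop above builds glob patterns of increasing length (urlpath/*, urlpath/*/*, ...) to enumerate everything up to a fixed nesting depth. A minimal standalone sketch of the same idea, assuming fsspec.open_files (the same interface as the dask.bytes.open_files used here) and a hypothetical local directory:

    import fsspec

    def files_to_depth(urlpath, depth):
        # Expand globs of increasing length to collect OpenFile objects for
        # every entry up to `depth` levels below urlpath.
        found = []
        for i in range(1, depth + 1):
            found.extend(fsspec.open_files('/'.join([urlpath] + ['*'] * i)))
        return found

    # hypothetical usage:
    # for of in files_to_depth('./data', depth=2):
    #     print(of.path)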
Example #2
    def changed(self):
        fns = (open_files(self.observable + '/*.yaml') +
               open_files(self.observable + '/*.yml'))

        modified = set(fn.path for fn in fns) != set(self._last_files)
        if modified:
            self.refresh()
        return any([modified] + [catalog.changed for catalog in self.catalogs])
Example #3
    def _make_files(self, urlpath, **kwargs):
        from dask.bytes import open_files

        self._ensure_cache_dir()
        subdir = self._hash(urlpath)
        files_in = open_files(urlpath, 'rb', **self._storage_options)
        files_out = [open_files([self._path(f.path, subdir)], 'wb',
                                **self._storage_options)[0]
                     for f in files_in]
        return files_in, files_out
Example #4
    def add(self, source, name=None, path=None, storage_options=None):
        """Add sources to the catalog and save into the original file

        This adds the source into the catalog dictionary, and saves the
        resulting catalog as YAML. Typically, this would be used to update a
        catalog file in-place. Optionally, the new catalog can be saved to a
        new location, in which case the new catalog is returned.

        Note that if a source of the given name exists, it will be clobbered.

        Parameters
        ----------
        source : DataSource instance
            The source whose spec we want to save
        name : str or None
            The name the source is to have in the catalog; use the source's
            name attribute, if not given.
        path : str or None
            Location to save the new catalog; if None, the original location
            from which it was loaded
        storage_options : dict or None
            If saving to a new location, use these arguments for the filesystem
            backend

        Returns
        -------
        YAMLFileCatalog instance, containing the new entry
        """
        import yaml
        entries = self._entries.copy()
        name = name or source.name or "source"
        entries[name] = source

        if path is None:
            options = self.storage_options or {}
            file_open = open_files([self.path], mode='wt', **options)
        else:
            options = storage_options or {}
            file_open = open_files([path], mode='wt', **options)
        assert len(file_open) == 1
        file_open = file_open[0]

        data = {'metadata': self.metadata, 'sources': {}}
        for e in entries:
            data['sources'][e] = list(
                entries[e]._yaml()['sources'].values())[0]
        with file_open as f:
            yaml.dump(data, f, default_flow_style=False)

        if path:
            return YAMLFileCatalog(path,
                                   storage_options=storage_options,
                                   autoreload=self.autoreload)
        else:
            return self
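
The write path in add() boils down to dumping a plain dict through a single text-mode OpenFile. A minimal sketch of that step, assuming fsspec.open_files and a hypothetical local catalog.yaml:

    import fsspec
    import yaml

    data = {'metadata': {}, 'sources': {}}   # stand-in for the assembled catalog dict

    # open_files always returns a list; a single non-glob path yields one OpenFile
    file_open = fsspec.open_files(['catalog.yaml'], mode='wt')[0]
    with file_open as f:
        yaml.dump(data, f, default_flow_style=False)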
Example #5
File: local.py  Project: vt100/intake
 def _load(self):
     # initial: find cat files
     # if flattening, need to get all entries from each.
     self._entries.clear()
     options = self.storage_options or {}
     if isinstance(self.path, (list, tuple)):
         files = sum(
             [open_files(p, mode='rb', **options) for p in self.path], [])
         self.name = self.name or "%i files" % len(files)
         self.description = self.description or f'Catalog generated from {len(files)} files'
         self.path = [make_path_posix(p) for p in self.path]
     else:
         if isinstance(self.path, str) and '*' not in self.path:
             self.path = self.path + '/*'
         files = open_files(self.path, mode='rb', **options)
         self.path = make_path_posix(self.path)
         self.name = self.name or self.path
         self.description = self.description or f'Catalog generated from all files found in {self.path}'
     if not set(f.path for f in files) == set(f.path
                                              for f in self._cat_files):
         # glob changed, reload all
         self._cat_files = files
         self._cats.clear()
     for f in files:
         name = os.path.split(f.path)[-1].replace('.yaml',
                                                  '').replace('.yml', '')
         kwargs = self.kwargs.copy()
         kwargs['path'] = f.path
         d = make_path_posix(os.path.dirname(f.path))
         if f.path not in self._cats:
             entry = LocalCatalogEntry(name, "YAML file: %s" % name,
                                       'yaml_file_cat', True, kwargs, [],
                                       {}, self.metadata, d)
             if self._flatten:
                 # store a concrete Catalog
                 try:
                     self._cats[f.path] = entry()
                 except IOError as e:
                     logger.info('Loading "%s" as a catalog failed: %s'
                                 '' % (entry, e))
             else:
                 # store a catalog entry
                 self._cats[f.path] = entry
     for name, entry in list(self._cats.items()):
         if self._flatten:
             entry.reload()
             inter = set(entry._entries).intersection(self._entries)
             if inter:
                 raise ValueError(
                     'Conflicting names when flattening multiple'
                     ' catalogs. Sources %s exist in more than'
                     ' one' % inter)
             self._entries.update(entry._entries)
         else:
             self._entries[entry._name] = entry
Example #6
    def _make_files(self, urlpath, **kwargs):
        import tempfile
        d = tempfile.mkdtemp()
        from dask.bytes import open_files

        self._ensure_cache_dir()
        self._urlpath = urlpath
        files_in = open_files(urlpath, 'rb')
        files_out = [
            open_files([os.path.join(d, os.path.basename(f.path))], 'wb',
                       **self._storage_options)[0]
            for f in files_in
        ]
        super(CompressedCache, self)._load(files_in, files_out, urlpath,
                                           meta=False)
        return files_in, files_out
Example #7
def read_orc(path, **kwargs):
    """ Read ORC files into a Dask DataFrame

    This calls the ``cudf.read_orc`` function on many ORC files.
    See that function for additional details.

    Examples
    --------
    >>> import dask_cudf
    >>> df = dask_cudf.read_orc("/path/to/*.orc")  # doctest: +SKIP

    See Also
    --------
    cudf.read_orc
    """

    name = "read-orc-" + tokenize(path, **kwargs)
    dsk = {}
    if "://" in str(path):
        files = open_files(path)

        # An `OpenFile` should be used in a Context
        with files[0] as f:
            meta = cudf.read_orc(f, **kwargs)

        dsk = {(name, i): (apply, _read_orc, [f], kwargs)
               for i, f in enumerate(files)}
    else:
        filenames = sorted(glob(str(path)))
        meta = cudf.read_orc(filenames[0], **kwargs)
        dsk = {(name, i): (apply, cudf.read_orc, [fn], kwargs)
               for i, fn in enumerate(filenames)}

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
Example #8
    def _get_schema(self):
        from dask.bytes import open_files
        import dask.array as da
        if self._arr is None:
            path = self._get_cache(self.path)[0]

            files = open_files(path, 'rb', compression=None,
                               **self.storage)
            if self.shape is None:
                arr = NumpyAccess(files[0])
                self.shape = arr.shape
                self.dtype = arr.dtype
                arrs = [arr] + [NumpyAccess(f, self.shape, self.dtype)
                                for f in files[1:]]
            else:
                arrs = [NumpyAccess(f, self.shape, self.dtype)
                        for f in files]
            self.chunks = (self._chunks, ) + (-1, ) * (len(self.shape) - 1)
            self._arrs = [da.from_array(arr, self.chunks) for arr in arrs]

            if len(self._arrs) > 1:
                self._arr = da.stack(self._arrs)
            else:
                self._arr = self._arrs[0]
            self.chunks = self._arr.chunks
        return Schema(dtype=str(self.dtype), shape=self.shape,
                      extra_metadata=self.metadata,
                      npartitions=self._arr.npartitions,
                      chunks=self.chunks)
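
NumpyAccess is an intake-specific helper not shown here; the dask side of the pattern (wrap each per-file array, then stack) can be sketched with in-memory arrays standing in for the file-backed accessors:

    import dask.array as da
    import numpy as np

    # stand-ins for one NumpyAccess object per file
    arrs = [np.arange(12).reshape(4, 3) for _ in range(2)]

    chunks = (2, -1)                          # chunk the first axis only
    darrs = [da.from_array(a, chunks=chunks) for a in arrs]
    stacked = da.stack(darrs) if len(darrs) > 1 else darrs[0]
    # stacked.shape == (2, 4, 3); stacked.npartitions == 4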
Example #9
File: local.py  Project: stonebig/intake
    def __init__(self, path, getenv=True, getshell=True, storage_options=None):
        self._path = path

        # First, we load from YAML, failing if syntax errors are found
        options = storage_options or {}
        if hasattr(path, 'path') or hasattr(path, 'read'):
            file_open = path
            self._path = getattr(path, 'path', getattr(path, 'name', 'file'))
        else:
            file_open = open_files(self._path, mode='rb', **options)
            assert len(file_open) == 1
            file_open = file_open[0]
        if file_open.path.startswith('http'):
            # do not reload from HTTP
            self.token = file_open.path
        else:
            self.token = file_open.fs.ukey(file_open.path)
        self._name = os.path.splitext(os.path.basename(
            self._path))[0].replace('.', '_')
        self._dir = os.path.dirname(self._path)
        with file_open as f:
            text = f.read().decode()
        if "!template " in text:
            logger.warning("Use of '!template' deprecated - fixing")
            text = text.replace('!template ', '')
        try:
            data = yaml.load(text)
        except DuplicateKeyError as e:
            # Wrap internal exception with our own exception
            raise exceptions.DuplicateKeyError(e)

        if data is None:
            raise exceptions.CatalogException('No YAML data in file')
        # Second, we validate the schema and semantics
        context = dict(root=self._dir)
        result = CatalogParser(data, context=context, getenv=getenv,
                               getshell=getshell)
        if result.errors:
            errors = ["line {}, column {}: {}".format(*error)
                      for error in result.errors]
            raise exceptions.ValidationError(
                "Catalog '{}' has validation errors:\n\n{}"
                "".format(path, "\n".join(errors)), result.errors)

        cfg = result.data

        # Finally, we create the plugins and entries. Failure is still possible.
        params = dict(CATALOG_DIR=self._dir)

        self._plugins = {}
        for ps in cfg['plugin_sources']:
            ps.source = Template(ps.source).render(params)
            self._plugins.update(ps.load())

        self._entries = {}
        for entry in cfg['data_sources']:
            entry.find_plugin(self._plugins)
            self._entries[entry.name] = entry

        self.metadata = cfg.get('metadata', {})
Example #10
    def _load_metadata(self):
        import dask.dataframe as dd
        import dask.delayed
        from dask.bytes import open_files
        self.files = open_files(self.url, **self.storage_options)

        def read_a_file(open_file, reader, kwargs):
            with open_file as of:
                df = reader(of, **kwargs)
                df['path'] = open_file.path
                return df

        if self.dataframe is None:
            self.parts = [
                dask.delayed(read_a_file)(open_file, self.reader, self.kwargs)
                for open_file in self.files
            ]
            self.dataframe = dd.from_delayed(self.parts)
            self.npartitions = self.dataframe.npartitions
            self.shape = (None, len(self.dataframe.columns))
            self.dtype = self.dataframe.dtypes.to_dict()
            self._schema = Schema(npartitions=self.npartitions,
                                  extra_metadata=self.metadata,
                                  dtype=self.dtype,
                                  shape=self.shape,
                                  datashape=None)
        return self._schema
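
The read_a_file/from_delayed pattern above is the generic way to turn a list of OpenFile objects into a Dask dataframe when no dedicated reader exists. A self-contained sketch with pandas.read_csv standing in as the reader, assuming fsspec.open_files and a hypothetical glob:

    import dask
    import dask.dataframe as dd
    import fsspec
    import pandas as pd

    def read_a_file(open_file, reader, kwargs):
        # One partition per file: open it, parse it, tag rows with the source path
        with open_file as of:
            df = reader(of, **kwargs)
            df['path'] = open_file.path
            return df

    files = fsspec.open_files('data/part-*.csv', mode='rt')   # hypothetical glob
    parts = [dask.delayed(read_a_file)(of, pd.read_csv, {}) for of in files]
    # ddf = dd.from_delayed(parts)   # one partition per matched file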
Example #11
File: local.py  Project: vt100/intake
    def _load(self, reload=False):
        """Load text of fcatalog file and pass to parse

        Will do nothing if autoreload is off and reload is not explicitly
        requested
        """
        if self.autoreload or reload:
            # First, we load from YAML, failing if syntax errors are found
            options = self.storage_options or {}
            if hasattr(self.path, 'path') or hasattr(self.path, 'read'):
                file_open = self.path
                self.path = make_path_posix(
                    getattr(self.path, 'path',
                            getattr(self.path, 'name', 'file')))
            else:
                file_open = open_files(self.path, mode='rb', **options)
                assert len(file_open) == 1
                file_open = file_open[0]
            self._dir = get_dir(self.path)

            with file_open as f:
                text = f.read().decode()
            if "!template " in text:
                logger.warning("Use of '!template' deprecated - fixing")
                text = text.replace('!template ', '')
            self.parse(text)
Example #12
def test_complex_bytes(tempdir, comp, pars):
    dump, load, read = pars
    dump = import_name(dump)
    # using bytestrings means not needing extra en/decode argument to msgpack
    data = [{b'something': b'simple', b'and': 0}] * 2
    for f in ['1.out', '2.out']:
        fn = os.path.join(tempdir, f)
        with open_files([fn], mode='wb', compression=comp)[0] as fo:
            if read:
                fo.write(dump(data))
            else:
                dump(data, fo)
    # that was all setup

    path = os.path.join(tempdir, '*.out')
    t = TextFilesSource(path,
                        text_mode=False,
                        compression=comp,
                        decoder=load,
                        read=read)
    t.discover()
    assert t.npartitions == 2
    assert t._get_partition(0) == t.to_dask().to_delayed()[0].compute()
    out = t.read()
    assert isinstance(out, list)
    assert out[0] == data[0]
Example #13
    def _persist(source, path, encoder=None):
        """Save list to files using encoding

        encoder : None or one of str|json|pickle
            None is equivalent to str
        """
        import posixpath
        from dask.bytes import open_files
        import dask
        import pickle
        import json
        from intake.source.textfiles import TextFilesSource
        encoder = {
            None: str,
            'str': str,
            'json': json.dumps,
            'pickle': pickle.dumps
        }[encoder]
        try:
            b = source.to_dask()
        except NotImplementedError:
            import dask.bag as db
            b = db.from_sequence(source.read(), npartitions=1)
        files = open_files(posixpath.join(path, 'part.*'),
                           mode='wt',
                           num=b.npartitions)
        dwrite = dask.delayed(write_file)
        out = [
            dwrite(part, f, encoder) for part, f in zip(b.to_delayed(), files)
        ]
        dask.compute(out)
        s = TextFilesSource(posixpath.join(path, 'part.*'))
        return s
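
write_file itself is not shown in this excerpt; a plausible stand-in, combined with open_files(..., num=...) to create one output file per partition, might look like the sketch below (the helper body, paths and encoder choice are assumptions, not the intake implementation):

    import dask
    import dask.bag as db
    import fsspec

    def write_file(part, open_file, encoder):
        # Write one encoded record per line into the OpenFile for this partition
        with open_file as f:
            f.write('\n'.join(encoder(item) for item in part))

    b = db.from_sequence(['a', 'b', 'c'], npartitions=2)
    files = fsspec.open_files('out/part.*', mode='wt', num=b.npartitions)
    tasks = [dask.delayed(write_file)(part, f, str)
             for part, f in zip(b.to_delayed(), files)]
    # dask.compute(tasks)   # writes out/part.0 and out/part.1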
Example #14
def to_textfiles_binned(b,
                        path,
                        bin_size=64,
                        nbins=8,
                        compression="infer",
                        encoding=system_encoding,
                        compute=True,
                        storage_options=None,
                        last_endline=False,
                        **kwargs):

    mode = "wb" if encoding is None else "wt"
    files = open_files(path,
                       compression=compression,
                       mode=mode,
                       encoding=encoding,
                       name_function=file_namer(bin_size, nbins).name_function,
                       num=b.npartitions * nbins,
                       **(storage_options or {}))

    name = "to-textfiles-binned-" + uuid.uuid4().hex
    dsk = {(name, i): (_to_textfiles_chunk_binned, (b.name, i),
                       files[k:k + nbins], last_endline, bin_size)
           for i, k in enumerate(range(0, len(files), nbins))}
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[b])
    out = type(b)(graph, name, b.npartitions)

    if compute:
        out.compute(**kwargs)
        return [f.path for f in files]
    else:
        return out.to_delayed()
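
file_namer and _to_textfiles_chunk_binned are project helpers not shown here, but the open_files call itself illustrates the name_function/num combination, which can be sketched in isolation (assuming fsspec.open_files and a hypothetical naming scheme):

    import fsspec

    def name_function(i):
        # hypothetical scheme: partition index zero-padded to three digits
        return f"part-{i:03d}.txt"

    files = fsspec.open_files('out/*', mode='wt', num=4,
                              name_function=name_function)
    # [f.path for f in files] -> ['out/part-000.txt', ..., 'out/part-003.txt']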
Example #15
def to_json(df, url_path, orient='records', lines=None, storage_options=None,
            compute=True, encoding='utf-8', errors='strict',
            compression=None, **kwargs):
    """Write dataframe into JSON text files

    This utilises ``pandas.DataFrame.to_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    produces the kind of JSON output that is most common in big-data
    applications, and which can be chunked when reading (see ``read_json()``).

    Parameters
    ----------
    df: dask.DataFrame
        Data to save
    url_path: str, list of str
        Location to write to. If a string, and there are more than one
        partitions in df, should include a glob character to expand into a
        set of file names, or provide a ``name_function=`` parameter.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    compute: bool
        If true, immediately executes. If False, returns a set of delayed
        objects, which can be computed at a later time.
    encoding, errors:
        Text conversion, ``see str.encode()``
    compression : string or None
        String like 'gzip' or 'xz'.
    """
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with '
                         'orient="records".')
    kwargs['orient'] = orient
    kwargs['lines'] = lines and orient == 'records'
    outfiles = open_files(
        url_path, 'wt', encoding=encoding,
        errors=errors,
        name_function=kwargs.pop('name_function', None),
        num=df.npartitions,
        compression=compression,
        **(storage_options or {})
    )
    parts = [dask.delayed(write_json_partition)(d, outfile, kwargs)
             for outfile, d in zip(outfiles, df.to_delayed())]
    if compute:
        dask.compute(parts)
        return [f.path for f in outfiles]
    else:
        return parts
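
In dask itself this writer is exposed as dask.dataframe.to_json, also reachable as the .to_json method of a dask dataframe. A minimal usage sketch with a hypothetical output glob:

    import dask.dataframe as dd
    import pandas as pd

    df = dd.from_pandas(pd.DataFrame({'a': range(6), 'b': list('xyzxyz')}),
                        npartitions=2)

    # one line-delimited JSON file per partition, e.g. out/part-0.json, out/part-1.json
    # paths = df.to_json('out/part-*.json')
    # ddf = dd.read_json('out/part-*.json')   # round-trip read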
Example #16
 def _get_schema(self):
     self._streams = open_files(self._urlpath, mode='rb')
     self.npartitions = len(self._streams)
     return base.Schema(datashape=None,
                        dtype=None,
                        shape=None,
                        npartitions=len(self._streams),
                        extra_metadata={})
Example #17
    def refresh(self):
        catalogs = []
        self.metadata.clear()
        self._last_files = []

        fns = (open_files(self.observable + '/*.yaml') +
               open_files(self.observable + '/*.yml'))
        for f in fns:
            try:
                self._last_files.append(f.path)
                catalogs.append(Catalog(f))
                self.metadata[f.path] = catalogs[-1].metadata
            except Exception as e:
                logger.warning("%s: %s" % (str(e), f))

        self.catalogs = catalogs
        children = {catalog.name: catalog for catalog in self.catalogs}

        return self.name, children, {}, []
Example #18
 def _load(self):
     # initial: find cat files
     # if flattening, need to get all entries from each.
     self._entries.clear()
     options = self.storage_options or {}
     if isinstance(self.path, (list, tuple)):
         files = sum(
             [open_files(p, mode='rb', **options) for p in self.path], [])
     else:
         if isinstance(self.path, str) and '*' not in self.path:
             self.path = self.path + '/*'
         files = open_files(self.path, mode='rb', **options)
     if not set(f.path for f in files) == set(f.path
                                              for f in self._cat_files):
         # glob changed, reload all
         self._cat_files = files
         self._cats.clear()
     for f in files:
         if os.path.isdir(f.path):
             # don't attempt to descend into directories
             continue
         name = os.path.split(f.path)[-1].replace('.yaml',
                                                  '').replace('.yml', '')
         kwargs = self.kwargs.copy()
         kwargs['path'] = f.path
         d = os.path.dirname(f.path)
         if f.path not in self._cats:
             entry = LocalCatalogEntry(name, "YAML file: %s" % name,
                                       'yaml_file_cat', True, kwargs, [],
                                       {}, self.metadata, d)
             if self._flatten:
                 # store a concrete Catalog
                 self._cats[f.path] = entry()
             else:
                 # store a catalog entry
                 self._cats[f.path] = entry
     for entry in self._cats.values():
         if self._flatten:
             entry.reload()
             self._entries.update(entry._entries)
         else:
             self._entries[entry._name] = entry
Example #19
    def _load(self):
        # First, we load from YAML, failing if syntax errors are found
        options = self.storage_options or {}
        if hasattr(self.path, 'path') or hasattr(self.path, 'read'):
            file_open = self.path
            self.path = getattr(self.path, 'path',
                                getattr(self.path, 'name', 'file'))
        else:
            file_open = open_files(self.path, mode='rb', **options)
            assert len(file_open) == 1
            file_open = file_open[0]
        self.name = os.path.splitext(os.path.basename(self.path))[0].replace(
            '.', '_')
        self._dir = get_dir(self.path)

        try:
            with file_open as f:
                text = f.read().decode()
        except (IOError, OSError):
            return
        if "!template " in text:
            logger.warning("Use of '!template' deprecated - fixing")
            text = text.replace('!template ', '')
        try:
            data = yaml.load(text)
        except DuplicateKeyError as e:
            # Wrap internal exception with our own exception
            raise exceptions.DuplicateKeyError(e)

        if data is None:
            raise exceptions.CatalogException('No YAML data in file')

        # Second, we validate the schema and semantics
        context = dict(root=self._dir)
        result = CatalogParser(data,
                               context=context,
                               getenv=self.getenv,
                               getshell=self.getshell)
        if result.errors:
            errors = [
                "line {}, column {}: {}".format(*error)
                for error in result.errors
            ]
            raise exceptions.ValidationError(
                "Catalog '{}' has validation errors:\n\n{}"
                "".format(self.path, "\n".join(errors)), result.errors)

        cfg = result.data

        self._entries = {}
        for entry in cfg['data_sources']:
            self._entries[entry.name] = entry

        self.metadata = cfg.get('metadata', {})
Example #20
 def _get_schema(self):
     from dask.bytes import open_files
     if self._files is None:
         self._files = open_files(self._urlpath,
                                  mode='rt',
                                  **self._storage_options)
         self.npartitions = len(self._files)
     return base.Schema(datashape=None,
                        dtype=None,
                        shape=(None, ),
                        npartitions=self.npartitions,
                        extra_metadata=self.metadata)
Example #21
    def __init__(self, urlpath, metadata=None):
        """Source to load Cisco Netflow packets as sequence of Python dicts.

        Parameters:
            urlpath : str
                Location of the data files; can include protocol and glob characters.
        """
        self._urlpath = urlpath
        self._streams = open_files(urlpath, mode='rb')

        super(NetflowSource, self).__init__(container='python',
                                            metadata=metadata)
Example #22
File: base.py  Project: jhamman/intake
    def save(self, url, storage_options=None):
        """
        Output this catalog to a file as YAML

        Parameters
        ----------
        url : str
            Location to save to, perhaps remote
        storage_options : dict
            Extra arguments for the file-system
        """
        from dask.bytes import open_files
        with open_files([url], **(storage_options or {}), mode='wt')[0] as f:
            f.write(self.serialize())
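
Usage is then a single call on an existing catalog object; a hedged sketch, where cat, the target URL and the storage options are all placeholders:

    # assuming `cat` is an intake Catalog instance already in hand:
    # cat.save('my_catalog.yaml')                              # write locally
    # cat.save('s3://bucket/cat.yaml',
    #          storage_options={'anon': False})                # or to remote storage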
Example #23
 def _get_schema(self):
     from dask.bytes import open_files
     import dask.array as da
     from dask.base import tokenize
     url = self._get_cache(self.url)[0]
     if self.arr is None:
         self.files = open_files(url, **self.storage_options)
         self.header, self.dtype, self.shape, self.wcs = _get_header(
             self.files[0], self.ext)
         name = 'fits-array-' + tokenize(url, self.chunks, self.ext)
         ch = self.chunks if self.chunks is not None else self.shape
         chunks = []
         for c, s in zip(ch, self.shape):
             num = s // c
             part = [c] * num
             if s % c:
                 part.append(s % c)
             chunks.append(tuple(part))
         cums = tuple((0, ) + tuple(accumulate(ch)) for ch in chunks)
         dask = {}
         if len(self.files) > 1:
             # multi-file set
             self.shape = (len(self.files), ) + self.shape
             chunks.insert(0, (1, ) * len(self.files))
             inds = tuple(range(len(ch)) for ch in chunks)
             for (fi, *bits) in product(*inds):
                 slices = tuple(slice(i[bit], i[bit + 1])
                                for (i, bit) in zip(cums, bits))
                 dask[(name, fi) + tuple(bits)] = (
                     _get_section, self.files[fi], self.ext, slices, False
                 )
         else:
             # single-file set
             inds = tuple(range(len(ch)) for ch in chunks)
             for bits in product(*inds):
                 slices = tuple(slice(i[bit], i[bit+1])
                                for (i, bit) in zip(cums, bits))
                 dask[(name,) + bits] = (
                     _get_section, self.files[0], self.ext, slices, True
                 )
         self.arr = da.Array(dask, name, chunks, dtype=self.dtype,
                             shape=self.shape)
         self._schema = Schema(
             dtype=self.dtype,
             shape=self.shape,
             extra_metadata=dict(self.header.items()),
             npartitions=self.arr.npartitions,
             chunks=self.arr.chunks
         )
     return self._schema
Example #24
    def _open_dataset(self):
        """
        Main entry function that finds a set of files and passes them to the
        reader.
        """
        from dask.bytes import open_files

        files = open_files(self.urlpath, **self.storage_options)
        if len(files) == 0:
            raise Exception("No files found at {}".format(self.urlpath))
        if len(files) == 1:
            self._ds = reader(files[0], self.chunks, **self._kwargs)
        else:
            self._ds = self._open_files(files)
Example #25
File: base.py  Project: jhamman/intake
 def _persist(source, path, **kwargs):
     from intake.catalog.local import YAMLFileCatalog
     from dask.bytes.core import open_files
     import yaml
     out = {}
     for name in source:
         entry = source[name]
         out[name] = entry.__getstate__()
         out[name]['parameters'] = [
             up._captured_init_kwargs for up in entry._user_parameters
         ]
         out[name]['kwargs'].pop('parameters')
     fn = posixpath.join(path, 'cat.yaml')
     with open_files([fn], 'wt')[0] as f:
         yaml.dump({'sources': out}, f)
     return YAMLFileCatalog(fn)
Example #26
 def _data_to_source(cat, path, **kwargs):
     from intake.catalog.local import YAMLFileCatalog
     from dask.bytes.core import open_files
     import yaml
     if not isinstance(cat, Catalog):
         raise NotImplementedError
     out = {}
     for name in cat:
         entry = cat[name]
         out[name] = entry.__getstate__()
         out[name]['parameters'] = [up._captured_init_kwargs for up
                                    in entry._user_parameters]
         out[name]['kwargs'].pop('parameters')
     fn = posixpath.join(path, 'cat.yaml')
     with open_files([fn], 'wt')[0] as f:
         yaml.dump({'sources': out}, f)
     return YAMLFileCatalog(fn)
Example #27
    def _get_schema(self):
        from dask.bytes import open_files
        if self._files is None:

            urlpath = self._get_cache(self._urlpath)[0]

            self._files = open_files(urlpath,
                                     mode=self.mode,
                                     encoding=self.encoding,
                                     compression=self.compression,
                                     **self._storage_options)
            self.npartitions = len(self._files)
        return base.Schema(datashape=None,
                           dtype=None,
                           shape=(None, ),
                           npartitions=self.npartitions,
                           extra_metadata=self.metadata)
Example #28
    def _load(self, _, __, urlpath, meta=True):
        import subprocess
        from dask.bytes import open_files

        path = os.path.join(self._cache_dir, self._hash(urlpath))
        dat, part = os.path.split(urlpath)
        cmd = ['dat', 'clone', dat, path, '--no-watch']
        try:
            subprocess.call(cmd, stdout=subprocess.PIPE)
        except (IOError, OSError):  # pragma: no cover
            logger.info('Calling DAT failed')
            raise
        newpath = os.path.join(path, part)

        if meta:
            for of in open_files(newpath):
                self._log_metadata(urlpath, urlpath, of.path)
Example #29
    def _data_to_source(b, path, encoder=None, **kwargs):
        import dask.bag as db
        import posixpath
        from dask.bytes import open_files
        import dask
        from intake.source.textfiles import TextFilesSource
        if not hasattr(b, 'to_textfiles'):
            try:
                b = db.from_sequence(b, npartitions=1)
            except TypeError:
                raise NotImplementedError

        files = open_files(posixpath.join(path, 'part.*'), mode='wt',
                           num=b.npartitions)
        dwrite = dask.delayed(write_file)
        out = [dwrite(part, f, encoder)
               for part, f in zip(b.to_delayed(), files)]
        dask.compute(out)
        s = TextFilesSource(posixpath.join(path, 'part.*'))
        return s
Example #30
    def _get_schema(self):
        if self._df is None:
            from uavro import dask_read_avro
            from uavro.core import read_header
            from dask.bytes import open_files
            self._df = dask_read_avro(self._urlpath,
                                      blocksize=self._bs,
                                      storage_options=self._storage_options)

            files = open_files(self._urlpath, **self._storage_options)
            with copy.copy(files[0]) as f:
                # we assume the same header for all files
                self.metadata.update(read_header(f))
            self.npartitions = self._df.npartitions
        dtypes = {k: str(v) for k, v in self._df.dtypes.items()}
        return base.Schema(datashape=None,
                           dtype=dtypes,
                           shape=(None, len(dtypes)),
                           npartitions=self.npartitions,
                           extra_metadata={})
Example #31
def test_complex_text(tempdir, comp):
    dump, load, read = 'json.dumps', 'json.loads', True
    dump = import_name(dump)
    data = [{'something': 'simple', 'and': 0}] * 2
    for f in ['1.out', '2.out']:
        fn = os.path.join(tempdir, f)
        with open_files([fn], mode='wt', compression=comp)[0] as fo:
            if read:
                fo.write(dump(data))
            else:
                dump(data, fo)
    # that was all setup

    path = os.path.join(tempdir, '*.out')
    t = TextFilesSource(path, text_mode=True, compression=comp, decoder=load)
    t.discover()
    assert t.npartitions == 2
    assert t._get_partition(0) == t.to_dask().to_delayed()[0].compute()
    out = t.read()
    assert isinstance(out, list)
    assert out[0] == data[0]
Example #32
File: json.py  Project: floriango/dask
def read_json(url_path, orient='records', lines=None, storage_options=None,
              blocksize=None, sample=2**20, encoding='utf-8', errors='strict',
              **kwargs):
    """Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON output
    that is most common in big-data scenarios, and which can be chunked when
    reading (see ``read_json()``). All other options require blocksize=None,
    i.e., one partition per input file.


    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the nearest
        newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant if using blocksize.
    encoding, errors:
        Text conversion, ``see bytes.decode()``

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >>> dd.read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
    """
    import dask.dataframe as dd
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with '
                         'orient="records".')
    if blocksize and (orient != 'records' or not lines):
        raise ValueError("JSON file chunking only allowed for JSON-lines"
                         "input (orient='records', lines=True).")
    storage_options = storage_options or {}
    if blocksize:
        first, chunks = read_bytes(url_path, b'\n', blocksize=blocksize,
                                   sample=sample, **storage_options)
        chunks = list(dask.core.flatten(chunks))
        first = read_json_chunk(first, encoding, errors, kwargs)
        parts = [dask.delayed(read_json_chunk)(
            chunk, encoding, errors, kwargs, meta=first[:0]
        ) for chunk in chunks]

    else:
        files = open_files(url_path, 'rt', encoding=encoding, errors=errors,
                           **storage_options)
        parts = [dask.delayed(read_json_file)(f, orient, lines, kwargs)
                 for f in files]
    return dd.from_delayed(parts)
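
When blocksize is unset, the fallback branch builds one delayed read_json_file call per OpenFile. A sketch of what that helper and wiring amount to, assuming fsspec.open_files, pandas.read_json and a hypothetical glob (an illustration, not dask's exact helper):

    import dask
    import dask.dataframe as dd
    import fsspec
    import pandas as pd

    def read_json_file(open_file, orient, lines, kwargs):
        # One partition per file: hand the whole file object to pandas
        with open_file as f:
            return pd.read_json(f, orient=orient, lines=lines, **kwargs)

    files = fsspec.open_files('data/records-*.json', mode='rt')   # hypothetical glob
    parts = [dask.delayed(read_json_file)(f, 'records', True, {}) for f in files]
    # ddf = dd.from_delayed(parts)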