def __call__(self, task):
    # Determine the path etc here
    updated_kwargs = {
        i: self.target_kwargs[i] for i in self.target_kwargs if i != 'glob'
    }
    if 'glob' in self.target_kwargs:
        target_path = self.file_pattern.format(task=task)
        updated_glob = self.target_kwargs['glob'].format(
            task=task) + self.ext.format(task=task)
        updated_kwargs['glob'] = updated_glob
    else:
        target_path = (self.file_pattern.format(task=task)
                       + self.ext.format(task=task))

    # Make sure the directory path ends with the filesystem's separator.
    path_sep = get_fs_token_paths(target_path)[0].sep
    if target_path[-1] != path_sep:
        if target_path[-1] == "/":
            target_path = target_path[:-1]
        target_path = target_path + path_sep

    fs, _, _ = get_fs_token_paths(target_path)
    return self.target_class(target_path, **updated_kwargs)

def __call__(self, task):
    # Use either ext to specify extensions, or no ext, in which case custom
    # params go to the target constructor via target_kwargs.
    revised_kwargs = {
        i: self.target_kwargs[i] for i in self.target_kwargs if i != "ext"
    }
    # If glob is specified, attach the extension to the glob rather than to
    # the file pattern.
    if "glob" in self.target_kwargs:
        target_path = self.file_pattern.format(task=task)
        revised_glob = self.target_kwargs["glob"].format(
            task=task) + self.ext.format(task=task)
        revised_kwargs["glob"] = revised_glob
    else:
        target_path = self.file_pattern.format(
            task=task) + self.ext.format(task=task)

    # Note that these targets force you to specify directory datasets with an
    # ending /; Dask (annoyingly) is inconsistent on this, so you may find
    # yourself manipulating paths inside ParquetTarget and CSVTarget
    # differently. The user of these targets should not need to worry about
    # these details!
    path_sep = get_fs_token_paths(target_path)[0].sep
    if target_path[-1] != path_sep:
        if target_path[-1] == "/":
            target_path = target_path[:-1]
        target_path = target_path + path_sep

    fs, _, _ = get_fs_token_paths(target_path)
    if "{ext}" not in self.file_pattern and self.ext != "":
        target_path = target_path + self.ext
    return self.target_class(target_path, **revised_kwargs)

def test_urlpath_expand_read():
    """Make sure * is expanded in file paths when reading."""
    # when reading, globs should be expanded to read files by mask
    with filetexts(csv_files, mode='b'):
        _, _, paths = get_fs_token_paths('.*.csv')
        assert len(paths) == 2

        _, _, paths = get_fs_token_paths(['.*.csv'])
        assert len(paths) == 2

def test_urlpath_expand_write():
    """Make sure * is expanded in file paths when writing."""
    _, _, paths = get_fs_token_paths('prefix-*.csv', mode='wb', num=2)
    assert paths == ['prefix-0.csv', 'prefix-1.csv']

    _, _, paths = get_fs_token_paths(['prefix-*.csv'], mode='wb', num=2)
    assert paths == ['prefix-0.csv', 'prefix-1.csv']

    # we can read with multiple masks, but not write
    with pytest.raises(ValueError):
        _, _, paths = get_fs_token_paths(['prefix1-*.csv', 'prefix2-*.csv'],
                                         mode='wb', num=2)

def test_urlpath_expand_write():
    """Make sure * is expanded in file paths when writing."""
    _, _, paths = get_fs_token_paths("prefix-*.csv", mode="wb", num=2)
    assert all(
        p.endswith(pa) for p, pa in zip(paths, ["prefix-0.csv", "prefix-1.csv"])
    )

    _, _, paths = get_fs_token_paths(["prefix-*.csv"], mode="wb", num=2)
    assert all(
        p.endswith(pa) for p, pa in zip(paths, ["prefix-0.csv", "prefix-1.csv"])
    )

    # we can read with multiple masks, but not write
    with pytest.raises(ValueError):
        _, _, paths = get_fs_token_paths(
            ["prefix1-*.csv", "prefix2-*.csv"], mode="wb", num=2
        )

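# Usage sketch (not from the snippets above): a minimal illustration of the
# write-mode "*" expansion the test exercises. Assumes the fsspec.core
# implementation of get_fs_token_paths (older code imported it from
# dask.bytes.core) and that the installed version accepts a name_function for
# rendering the "*" placeholder; the output file names are hypothetical.
from fsspec.core import get_fs_token_paths

_, _, paths = get_fs_token_paths(
    "out-*.csv", mode="wb", num=3, name_function=lambda i: "%04d" % i
)
# paths is a list of 3 concrete names, e.g. ending in out-0000.csv ... out-0002.csv
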
def test_urlpath_inference_errors():
    # Empty list
    with pytest.raises(ValueError) as err:
        get_fs_token_paths([])
    assert 'empty' in str(err)

    # Protocols differ
    with pytest.raises(ValueError) as err:
        get_fs_token_paths(['s3://test/path.csv', '/other/path.csv'])
    assert 'same protocol and options' in str(err)

    # Options differ
    with pytest.raises(ValueError) as err:
        get_fs_token_paths(['hdfs://user@node.com/test/path.csv',
                            'hdfs://user2@node.com/other/path.csv'])
    assert 'same protocol and options' in str(err)

    # Unknown type
    with pytest.raises(TypeError):
        get_fs_token_paths({'sets/are.csv', 'unordered/so/they.csv',
                            'should/not/be.csv' 'allowed.csv'})

def test_urlpath_inference_errors():
    # Empty list
    with pytest.raises(ValueError) as err:
        get_fs_token_paths([])
    assert "empty" in str(err)

    # Protocols differ
    with pytest.raises(ValueError) as err:
        get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"])
    assert "same protocol and options" in str(err)

    # Options differ
    with pytest.raises(ValueError) as err:
        get_fs_token_paths(
            [
                "hdfs://user@node.com/test/path.csv",
                "hdfs://user2@node.com/other/path.csv",
            ]
        )
    assert "same protocol and options" in str(err)

    # Unknown type
    with pytest.raises(TypeError):
        get_fs_token_paths(
            {"sets/are.csv", "unordered/so/they.csv", "should/not/be.csv" "allowed.csv"}
        )

def test_urlpath_inference_errors():
    # Empty list
    with pytest.raises(ValueError, match="empty"):
        get_fs_token_paths([])

    # Protocols differ
    with pytest.raises(ValueError, match="the same protocol"):
        get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"])

    # Options differ
    with pytest.raises(ValueError, match="the same file-system options"):
        get_fs_token_paths(
            [
                "ftp://myuser@node.com/test/path.csv",
                "ftp://otheruser@node.com/other/path.csv",
            ]
        )

    # Unknown type
    with pytest.raises(TypeError):
        get_fs_token_paths(
            {
                "sets/are.csv",
                "unordered/so/they.csv",
                "should/not/be.csv",
                "allowed.csv",
            }
        )

def fs(self):
    """Filesystem from a urlpath and options."""
    fs, token, paths = get_fs_token_paths(
        self.path, storage_options=self.storage_options)
    return fs

def test_urlpath_inference_strips_protocol(tmpdir):
    tmpdir = str(tmpdir)
    paths = [os.path.join(tmpdir, 'test.%02d.csv' % i) for i in range(20)]
    for path in paths:
        with open(path, 'wb') as f:
            f.write(b'1,2,3\n' * 10)

    # globstring
    protocol = 'file:///' if sys.platform == 'win32' else 'file://'
    urlpath = protocol + os.path.join(tmpdir, 'test.*.csv')
    _, _, paths2 = get_fs_token_paths(urlpath)
    assert paths2 == paths

    # list of paths
    _, _, paths2 = get_fs_token_paths([protocol + p for p in paths])
    assert paths2 == paths

def test_urlpath_inference_strips_protocol(tmpdir):
    tmpdir = str(tmpdir)
    paths = [os.path.join(tmpdir, "test.%02d.csv" % i) for i in range(20)]
    for path in paths:
        with open(path, "wb") as f:
            f.write(b"1,2,3\n" * 10)

    # globstring
    protocol = "file:///" if sys.platform == "win32" else "file://"
    urlpath = protocol + os.path.join(tmpdir, "test.*.csv")
    _, _, paths2 = get_fs_token_paths(urlpath)
    assert paths2 == paths

    # list of paths
    _, _, paths2 = get_fs_token_paths([protocol + p for p in paths])
    assert paths2 == paths

def test_recursive_glob_expand():
    """Make sure * is expanded in file paths when reading."""
    with filetexts(
        {"sub1/afile.csv": b"", "sub1/sub2/another.csv": b"", "sub1/twofile.csv": b""},
        mode="b",
    ):
        _, _, paths = get_fs_token_paths(os.path.abspath("**/*.csv"))
        assert len(paths) == 3

def __call__(self, task):
    """Implements the "output()" method of a Luigi Task.

    This method allows the descriptor to be used as the "output" composition
    of a Luigi Task. A Target (or subclass) is instantiated and returned.
    The target file path as well as the "file_pattern" template is evaluated
    here.

    Args:
        task: host class instance

    Returns:
        A Luigi Target (or subclass) instance.
    """
    # If there is a "glob" in target_kwargs, the extension is attached to the
    # end of glob. Otherwise, the extension is attached to the end of
    # file_pattern.
    new_kwargs = {
        i: self.target_kwargs[i] for i in self.target_kwargs if i != 'glob'
    }
    if 'glob' in self.target_kwargs:
        target_path = self.file_pattern.format(task=task)
        new_glob = self.target_kwargs['glob'].format(
            task=task) + self.ext.format(task=task)
        new_kwargs['glob'] = new_glob
    else:
        target_path = (self.file_pattern.format(task=task)
                       + self.ext.format(task=task))

    # Make sure that the directory path ends with a system dependent
    # separator.
    path_sep = get_fs_token_paths(target_path)[0].sep
    if target_path[-1] != path_sep:
        if target_path[-1] == "/":
            target_path = target_path[:-1]
        target_path = target_path + path_sep

    fs, _, _ = get_fs_token_paths(target_path)
    return self.target_class(target_path, **new_kwargs)

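# Usage sketch (not from the snippets above): the core path-normalization
# trick used by __call__, shown in isolation. get_fs_token_paths returns
# (filesystem, token, paths), and the filesystem's .sep attribute tells us
# which separator a directory dataset must end with. "data/mytask" is a
# made-up local path; get_fs_token_paths is imported from fsspec.core here,
# while older code imported it from dask.bytes.core.
from fsspec.core import get_fs_token_paths

target_path = "data/mytask"
fs, _, _ = get_fs_token_paths(target_path)
if not target_path.endswith(fs.sep):
    target_path = target_path.rstrip("/") + fs.sep
print(target_path)  # "data/mytask/" on a local filesystem
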
def _get_schema(self):
    if self._pf is None:
        # copied from dask to allow remote
        soptions = self._kwargs.pop('storage_options', {})
        fs, fs_token, paths = get_fs_token_paths(
            self._urlpath, mode='rb', storage_options=soptions)

        if len(paths) > 1:
            pf = fp.ParquetFile(paths, open_with=fs.open, sep=fs.sep)
        else:
            try:
                pf = fp.ParquetFile(paths[0] + fs.sep + '_metadata',
                                    open_with=fs.open, sep=fs.sep)
            except Exception:
                pf = fp.ParquetFile(paths[0], open_with=fs.open, sep=fs.sep)
        self._pf = pf

    pf = self._pf
    if self._df is not None:
        return base.Schema(datashape=None,
                           dtype=self._df._meta,
                           shape=(pf.count, len(self._df.columns)),
                           npartitions=self._df.npartitions,
                           extra_metadata=pf.key_value_metadata)

    columns = self._kwargs.get('columns', None)
    if columns:
        dtypes = {k: v for k, v in pf.dtypes.items() if k in columns}
    else:
        dtypes = pf.dtypes

    if 'filters' in self._kwargs:
        rgs = pf.filter_row_groups(self._kwargs['filters'])
        parts = len(rgs)
        count = sum(rg.num_rows for rg in rgs)
    else:
        parts = len(pf.row_groups)
        count = pf.count

    return base.Schema(datashape=None,
                       dtype=dtypes,  # one of these is the index
                       shape=(count, len(dtypes)),
                       npartitions=parts,
                       extra_metadata=pf.key_value_metadata)

def read_parquet(path, storage_options=None):
    """
    Construct a SpatialPointsFrame from a spatially partitioned parquet file

    If the input parquet file does not contain compatible spatial metadata,
    then the resulting SpatialPointsFrame will have a .spatial property of
    None, and the spatial_query operation will be unavailable.

    Parameters
    ----------
    path: str
        Path to a spatially partitioned parquet file that was created
        using datashader.spatial.points.to_parquet
    storage_options : dict or None (default None)
        Key/value pairs to be passed on to the file-system backend, if any.

    Returns
    -------
    SpatialPointsFrame
        A spatially sorted Dask dataframe reconstructed from disk
    """
    _validate_fastparquet()

    # Read parquet file
    frame = dd.read_parquet(path, storage_options=storage_options)

    # Open parquet file
    fs, _, paths = get_fs_token_paths(
        path, mode="rb", storage_options=storage_options)

    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    pf = fp.ParquetFile(path, open_with=fs.open)

    # Check for spatial points metadata
    if 'SpatialPointsFrame' in pf.key_value_metadata:
        # Load metadata
        props = json.loads(pf.key_value_metadata['SpatialPointsFrame'])
    else:
        props = None

    # Call DataFrame constructor with the internals of frame
    return SpatialPointsFrame(frame.dask, frame._name, frame._meta,
                              frame.divisions, props)

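# Usage sketch (not from the snippets above): a hedged example of calling the
# read_parquet above. The file name is a placeholder, and the positional
# (x_range, y_range) signature of spatial_query is assumed from the behavior
# described in the docstring.
frame = read_parquet('spatial_points.parquet')
if frame.spatial is not None:
    subset = frame.spatial_query((-80.0, -70.0), (30.0, 40.0))
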
def test_glob(hdfs):
    tree = {
        basedir: (["c", "c2"], ["a", "a1", "a2", "a3", "b1"]),
        basedir + "/c": (["d"], ["x1", "x2"]),
        basedir + "/c2": (["d"], ["x1", "x2"]),
        basedir + "/c/d": ([], ["x3"]),
    }
    hdfs, _, _ = get_fs_token_paths("hdfs:///")
    hdfs.makedirs(basedir + "/c/d")
    hdfs.makedirs(basedir + "/c2/d/")
    for fn in (posixpath.join(dirname, f)
               for (dirname, (_, fils)) in tree.items()
               for f in fils):
        with hdfs.open(fn, mode="wb") as f2:
            f2.write(b"000")

    assert set(hdfs.glob(basedir + "/a*")) == {
        basedir + p for p in ["/a", "/a1", "/a2", "/a3"]
    }
    assert set(hdfs.glob(basedir + "/c/*")) == {
        basedir + p for p in ["/c/x1", "/c/x2", "/c/d"]
    }
    assert set(hdfs.glob(basedir + "/*/x*")) == {
        basedir + p for p in ["/c/x1", "/c/x2", "/c2/x1", "/c2/x2"]
    }
    assert set(hdfs.glob(basedir + "/*/x1")) == {
        basedir + p for p in ["/c/x1", "/c2/x1"]
    }
    assert hdfs.find("/this-path-doesnt-exist") == []
    assert hdfs.find(basedir + "/missing/") == []
    assert hdfs.find(basedir + "/missing/x1") == []
    assert hdfs.glob(basedir + "/missing/*") == []
    assert hdfs.glob(basedir + "/*/missing") == []
    assert set(hdfs.glob(basedir + "/*")) == {
        basedir + p for p in ["/a", "/a1", "/a2", "/a3", "/b1", "/c", "/c2"]
    }

def read_avro(urlpath, blocksize=100000000, storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths,
                                 OpenFile, tokenize)
    from dask.bag import from_delayed

    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [
                dread(f, o, l, head, dask_key_name=key)
                for o, key, l in zip(offset, keys, length)
            ]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)

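# Usage sketch (not from the snippets above): a hedged example of calling the
# read_avro above. The S3 glob and the anonymous-access option are
# placeholders. With blocksize=None each file becomes one partition, which is
# the only mode in which compression should be passed (per the docstring).
bag = read_avro(
    "s3://example-bucket/events/*.avro",
    blocksize=None,
    storage_options={"anon": True},
    compression="gzip",
)
first_records = bag.take(5)  # each element is a dict decoded by fastavro
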
def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
    """Read cudf dataframe from ORC file(s).

    Note that this function is mostly borrowed from upstream Dask.

    Parameters
    ----------
    path: str or list(str)
        Location of file(s), which can be a full URL with protocol specifier,
        and may include glob character if a single string.
    columns: None or list(str)
        Columns to load. If None, loads all.
    filters : None or list of tuple or list of lists of tuples
        If not None, specifies a filter predicate used to filter out row
        groups using statistics stored for each row group as Parquet metadata.
        Row groups that do not match the given filter predicate are not read.
        The predicate is expressed in disjunctive normal form (DNF) like
        `[[('x', '=', 0), ...], ...]`. DNF allows arbitrary boolean logical
        combinations of single column predicates. The innermost tuples each
        describe a single column predicate. The list of inner predicates is
        interpreted as a conjunction (AND), forming a more selective and
        multiple column predicate. Finally, the outermost list combines these
        filters as a disjunction (OR).

        Predicates may also be passed as a list of tuples. This form is
        interpreted as a single conjunction. To express OR in predicates, one
        must use the (preferred) notation of list of lists of tuples.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    cudf.DataFrame
    """
    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(
        path, mode="rb", storage_options=storage_options
    )
    schema = None
    nstripes_per_file = []
    for path in paths:
        with fs.open(path, "rb") as f:
            o = orc.ORCFile(f)
            if schema is None:
                schema = o.schema
            elif schema != o.schema:
                raise ValueError(
                    "Incompatible schemas while parsing ORC files"
                )
            nstripes_per_file.append(o.nstripes)
    schema = _get_pyarrow_dtypes(schema, categories=None)
    if columns is not None:
        ex = set(columns) - set(schema)
        if ex:
            raise ValueError(
                "Requested columns (%s) not in schema (%s)" % (ex, set(schema))
            )
    else:
        columns = list(schema)
    with fs.open(paths[0], "rb") as f:
        meta = cudf.read_orc(f, stripes=[0], columns=columns, **kwargs)

    name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
    dsk = {}
    N = 0
    for path, n in zip(paths, nstripes_per_file):
        for stripe in (
            range(n)
            if filters is None
            else cudf.io.orc._filter_stripes(filters, path)
        ):
            dsk[(name, N)] = (
                _read_orc_stripe,
                fs,
                path,
                stripe,
                columns,
                kwargs,
            )
            N += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)

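# Usage sketch (not from the snippets above): a hedged example of calling the
# read_orc above with a DNF filter. The path and column names are
# placeholders. The outer list is a disjunction (OR), each inner list a
# conjunction (AND) of (column, op, value) predicates.
ddf = read_orc(
    "s3://example-bucket/orc-data/*.orc",
    columns=["ts", "value"],
    filters=[[("value", ">", 0)]],
    storage_options={"anon": True},
)
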
def to_parquet(df, path, x, y, p=10, npartitions=None, shuffle=None,
               compression='default', storage_options=None):
    """
    Perform spatial partitioning on an input dataframe and write the
    result to a parquet file.

    The resulting parquet file will contain the same columns as the input
    dataframe, but the dataframe's original index will be dropped.

    The resulting parquet file will contain all of the rows from the input
    dataframe, but they will be spatially sorted and partitioned along a
    2D Hilbert curve (https://en.wikipedia.org/wiki/Hilbert_curve).

    The parquet file will also contain custom metadata that is needed to
    reconstruct the Hilbert curve distances on load. This parquet file may
    then be used to construct SpatialPointsFrame instances using
    datashader.spatial.points.read_parquet.

    Parameters
    ----------
    df: pd.DataFrame or dd.DataFrame
        The input dataframe to partition
    path: str
        The path where the resulting parquet file should be written. See
        dask.dataframe.to_parquet for description of supported path
        specifications.
    x, y
        The column labels in df of the x and y coordinates of each row
    p: int (default 10)
        The Hilbert curve order parameter that determines the resolution of
        the 2D grid that data points are rounded to before computing their
        Hilbert distance. Points will be discretized into 2 ** p bins in
        each of the x and y dimensions.

        This parameter should be increased if the partitions of the
        resulting parquet files are significantly unbalanced.
    npartitions: int or None (default None)
        The number of partitions for the resulting parquet file. If None
        (the default) this is chosen to be the greater of 8 and
        len(df) // 2**23.

        In general, increasing the number of partitions will improve
        performance when processing small subsets of the overall parquet
        data set. But this comes at the cost of some additional overhead
        when processing the entire data set.
    shuffle: str or None (default None)
        The dask.dataframe.DataFrame.set_index shuffle method. If None, a
        default is chosen based on the current scheduler.
    compression: str or None (default)
        The dask.dataframe.to_parquet compression method.
    storage_options : dict or None (default None)
        Key/value pairs to be passed on to the file-system backend, if any.
    """
    _validate_fastparquet()

    # Validate filename
    if (not isinstance(path, basestring) or
            not (path.endswith('.parquet') or path.endswith('.parq'))):
        raise ValueError("""\
filename must be a string ending with a .parquet or .parq extension""")

    # Remove any existing directory
    if os.path.exists(path):
        shutil.rmtree(path)

    # Normalize to dask dataframe
    if isinstance(df, pd.DataFrame):
        ddf = dd.from_pandas(df, npartitions=4)
    elif isinstance(df, dd.DataFrame):
        ddf = df
    else:
        raise ValueError("""
df must be a pandas or dask DataFrame instance.
Received value of type {typ}""".format(typ=type(df)))

    # Get number of rows
    nrows = len(df)

    # Compute npartitions if needed
    if npartitions is None:
        # Make partitions of ~8 million rows with a minimum of 8 partitions
        npartitions = max(nrows // 2**23, 8)

    # Compute data extents
    extents = ddf.map_partitions(_compute_extents, x, y).compute()

    x_range = (float(extents['x_min'].min()),
               float(extents['x_max'].max()))

    y_range = (float(extents['y_min'].min()),
               float(extents['y_max'].max()))

    # Compute distance of points along the Hilbert-curve
    ddf = ddf.assign(distance=ddf.map_partitions(
        _compute_distance, x=x, y=y, p=p,
        x_range=x_range, y_range=y_range, as_series=True))

    # Set index to distance. This will trigger an expensive shuffle
    # sort operation
    ddf = ddf.set_index('distance',
                        npartitions=npartitions,
                        shuffle=shuffle)

    # Get list of the distance divisions computed by dask
    distance_divisions = [int(d) for d in ddf.divisions]

    # Save properties as custom metadata in the parquet file
    props = dict(
        version='1.0',
        x=x,
        y=y,
        p=p,
        distance_divisions=distance_divisions,
        x_range=x_range,
        y_range=y_range,
        nrows=nrows,
    )

    # Drop distance index to save storage space
    ddf = ddf.reset_index(drop=True)

    # Save ddf to parquet
    dd.to_parquet(ddf, path, engine='fastparquet',
                  compression=compression,
                  storage_options=storage_options)

    # Open resulting parquet file
    fs, _, paths = get_fs_token_paths(
        path, mode="wb", storage_options=storage_options)

    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    pf = fp.ParquetFile(path, open_with=fs.open)

    # Add a new property to the file metadata
    new_fmd = copy.copy(pf.fmd)
    new_kv = fp.parquet_thrift.KeyValue()
    new_kv.key = 'SpatialPointsFrame'
    new_kv.value = json.dumps(props)
    new_fmd.key_value_metadata.append(new_kv)

    # Overwrite file metadata
    fn = os.path.join(path, '_metadata')
    fp.writer.write_common_metadata(fn, new_fmd, no_row_groups=False,
                                    open_with=fs.open)

    fn = os.path.join(path, '_common_metadata')
    fp.writer.write_common_metadata(fn, new_fmd, open_with=fs.open)

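# Usage sketch (not from the snippets above): a hedged round trip through the
# to_parquet/read_parquet pair above. The column names, point count, and file
# name are illustrative only; fastparquet must be installed.
import numpy as np
import pandas as pd

points = pd.DataFrame({
    "x": np.random.uniform(0, 1, size=10000),
    "y": np.random.uniform(0, 1, size=10000),
})
to_parquet(points, "points.parquet", "x", "y", p=10, npartitions=8)
frame = read_parquet("points.parquet")
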
def touch(path, storage_options=None, _dep=None):
    fs, token, paths = get_fs_token_paths(path, storage_options=storage_options)
    with fs.open(path, mode="wb"):
        pass

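# Usage sketch (not from the snippets above): a hedged example of the touch
# helper above. The bucket/key is a placeholder, and _dep is left at its
# default (it appears to exist only so the call can be ordered after other
# tasks in a graph).
touch("s3://example-bucket/pipeline/_SUCCESS", storage_options={"anon": False})
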
def to_orc(
    df,
    path,
    write_index=True,
    storage_options=None,
    compression=None,
    compute=True,
    **kwargs,
):
    """Write a dask_cudf dataframe to ORC file(s) (one file per partition).

    Parameters
    ----------
    df : dask_cudf.DataFrame
    path: string or pathlib.Path
        Destination directory for data. Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    write_index : boolean, optional
        Whether or not to write the index. Defaults to True.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.
    compression : string or dict, optional
    compute : bool, optional
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    """
    from dask import delayed
    from dask import compute as dask_compute

    # TODO: Use upstream dask implementation once available
    #       (see: Dask Issue#5596)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if write_index:
        df = df.reset_index()
    else:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    fs.mkdirs(path, exist_ok=True)

    # Use i_offset and df.npartitions to define file-name list
    filenames = ["part.%i.orc" % i for i in range(df.npartitions)]

    # write parts
    dwrite = delayed(write_orc_partition)
    parts = [
        dwrite(d, path, fs, filename, compression=compression)
        for d, filename in zip(df.to_delayed(), filenames)
    ]

    if compute:
        return dask_compute(*parts)

    return delayed(list)(parts)

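# Usage sketch (not from the snippets above): a hedged example of calling the
# to_orc above. The frame construction and the output directory are
# illustrative only, and snappy compression is assumed to be supported by the
# installed cudf.
import cudf
import dask_cudf

gdf = cudf.DataFrame({"key": [1, 2, 3, 4], "value": [0.1, 0.2, 0.3, 0.4]})
ddf = dask_cudf.from_cudf(gdf, npartitions=2)
to_orc(ddf, "orc-output", write_index=False, compression="snappy")
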
def read_orc(path, columns=None, storage_options=None, **kwargs):
    """Read cudf dataframe from ORC file(s).

    Note that this function is mostly borrowed from upstream Dask.

    Parameters
    ----------
    path: str or list(str)
        Location of file(s), which can be a full URL with protocol specifier,
        and may include glob character if a single string.
    columns: None or list(str)
        Columns to load. If None, loads all.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    cudf.DataFrame
    """
    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(
        path, mode="rb", storage_options=storage_options)
    schema = None
    nstripes_per_file = []
    for path in paths:
        with fs.open(path, "rb") as f:
            o = orc.ORCFile(f)
            if schema is None:
                schema = o.schema
            elif schema != o.schema:
                raise ValueError(
                    "Incompatible schemas while parsing ORC files")
            nstripes_per_file.append(o.nstripes)
    schema = _get_pyarrow_dtypes(schema, categories=None)
    if columns is not None:
        ex = set(columns) - set(schema)
        if ex:
            raise ValueError("Requested columns (%s) not in schema (%s)"
                             % (ex, set(schema)))
    else:
        columns = list(schema)
    with fs.open(paths[0], "rb") as f:
        meta = cudf.read_orc(f, stripe=0, columns=columns, **kwargs)

    name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
    dsk = {}
    N = 0
    for path, n in zip(paths, nstripes_per_file):
        for stripe in range(n):
            dsk[(name, N)] = (
                _read_orc_stripe,
                fs,
                path,
                stripe,
                columns,
                kwargs,
            )
            N += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)

def test_recursive_glob_expand():
    """Make sure * is expanded in file paths when reading."""
    with filetexts(csv_files, mode='b'):
        _, _, paths = get_fs_token_paths('**/.*.csv')
        assert len(paths) == 3

def test_recursive_glob_expand():
    """Make sure * is expanded in file paths when reading."""
    with filetexts(csv_files, mode="b"):
        _, _, paths = get_fs_token_paths("**/.*.csv")
        assert len(paths) == 3

def fs(self):
    fs, token, paths = get_fs_token_paths(
        self.path, storage_options=self.storage_options)
    return fs

def read_avro(urlpath, blocksize=100000000, storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths,
                                 OpenFile, tokenize)
    from dask.bag import from_delayed

    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [dread(f, o, l, head, dask_key_name=key)
                      for o, key, l in zip(offset, keys, length)]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
