def _reload(data, path, **kwargs):
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(data, d, **kwargs)
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    return df
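# A minimal sketch of how _reload supports a round-trip test; the fixture
# names match the tests below, but this exact test and the .csv target are
# illustrative assumptions, not part of the suite.
def test_roundtrip(randomdata, datadir_local):
    path = os.path.join(datadir_local, 'tmp.csv')  # hypothetical target file
    df = _reload(randomdata, path)                 # write then read via blocks.io
    assert np.isclose(df, randomdata).all().all()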
def test_compression_pickle(randomdata, datadir_local):
    path = os.path.join(datadir_local, "tmp.pkl.gz")

    # write compressed with pandas
    randomdata.to_pickle(path, compression="gzip")
    # read compressed with blocks
    d = LocalDataFile(path, path)
    df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()
def merged(rgroup):
    frames = []
    for cgroup in grouped:
        p = next(p for p in grouped[cgroup] if os.path.basename(p) == rgroup)
        args = read_args.copy()
        if cgroup in cgroup_args:
            args.update(cgroup_args[cgroup])
        frames.append(read_df(p, **args))
    return _merge_all(frames, merge=merge)
def merged(rgroup):
    frames = []
    for cgroup in grouped:
        datafile = next(d for d in grouped[cgroup] if os.path.basename(d.path) == rgroup)
        args = read_args.copy()
        if cgroup in cgroup_args:
            args.update(cgroup_args[cgroup])
        frames.append(read_df(datafile, **args))
    return _merge_all(frames, merge=merge)
def assemble(path, cgroups=None, rgroups=None, read_args={}, cgroup_args={},
             merge='inner', filesystem=GCSFileSystem()):
    """Assemble multiple dataframe blocks into a single frame

    Each file included in the path (or subdirs of that path) is combined into
    a single dataframe by first concatenating over row groups and then merging
    over cgroups. The merges are performed in the order of listed cgroups if
    provided, otherwise in alphabetic order. Files are opened by a method
    inferred from their extension

    Parameters
    ----------
    path : str
        The glob-able path to all datafiles to assemble into a frame
        e.g. gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/*
        See the README for a more detailed explanation
    cgroups : list of str, optional
        The list of cgroups (folder names) to include from the glob path
    rgroups : list of str, optional
        The list of rgroups (file names) to include from the glob path
    read_args : dict, optional
        Any additional keyword args to pass to the read function
    cgroup_args : {cgroup: kwargs}, optional
        Any cgroup specific read arguments, where each key is the name of the
        cgroup and each value is a dictionary of keyword args
    merge : one of 'left', 'right', 'outer', 'inner', default 'inner'
        The merge strategy to pass to pandas.merge
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API

    Returns
    -------
    data : pd.DataFrame
        The combined dataframe from all the blocks
    """
    grouped = _collect(path, cgroups, rgroups, filesystem)

    # ----------------------------------------
    # Concatenate all rgroups
    # ----------------------------------------
    frames = []
    for group in grouped:
        datafiles = grouped[group]
        args = read_args.copy()
        if group in cgroup_args:
            args.update(cgroup_args[group])
        frames.append(pd.concat(read_df(d, **args) for d in datafiles))

    # ----------------------------------------
    # Merge all cgroups
    # ----------------------------------------
    return _merge_all(frames, merge=merge)
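# A hedged usage sketch for assemble, assuming a layout of cgroup subfolders
# each holding rgroup files (e.g. gs://example/g0/part.0.pq); the bucket,
# group names, and read_args values here are assumptions for illustration,
# not part of the library.
df = assemble(
    'gs://example/*/*',
    cgroups=['g0', 'g1'],           # merge order follows this list
    rgroups=['part.0.pq'],          # restrict to one row group per folder
    read_args={'columns': ['id']},  # forwarded to the inferred reader
    merge='inner',
)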
def test_compression_parquet(randomdata, datadir_local):
    pytest.importorskip("pyarrow")
    pytest.importorskip("pandas", minversion="0.22.0")
    path = os.path.join(datadir_local, "tmp.parquet.gz")

    # write compressed with pandas
    randomdata.to_parquet(path, compression="gzip")
    # read compressed with blocks
    d = LocalDataFile(path, path)
    df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()

    # write compressed with blocks
    d = LocalDataFile(path, path)
    io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_parquet(path)
    assert np.isclose(df, randomdata).all().all()
def test_compression_pickle(randomdata, datadir_local):
    path = os.path.join(datadir_local, 'tmp.pkl.gz')

    # write compressed with pandas
    randomdata.to_pickle(path, compression='gzip')
    # read compressed with blocks
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()

    # write compressed with blocks
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_pickle(path, compression='gzip')
    assert np.isclose(df, randomdata).all().all()
def test_compression_parquet(randomdata, datadir_local):
    pytest.importorskip('pyarrow')
    pytest.importorskip('pandas', minversion='0.22.0')
    path = os.path.join(datadir_local, 'tmp.parquet.gz')

    # write compressed with pandas
    randomdata.to_parquet(path, compression='gzip')
    # read compressed with blocks
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()

    # write compressed with blocks
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_parquet(path)
    assert np.isclose(df, randomdata).all().all()
def iterate(
    path: str,
    axis: int = -1,
    cgroups: Optional[Sequence[cgroup]] = None,
    rgroups: Optional[Sequence[rgroup]] = None,
    read_args: Any = {},
    cgroup_args: Dict[cgroup, Any] = {},
    merge: str = "inner",
    filesystem: FileSystem = FileSystem(),
    tmpdir: Optional[str] = None,
) -> Union[Iterator[Tuple[cgroup, rgroup, pd.DataFrame]], Iterator[Tuple[str, pd.DataFrame]]]:
    """Iterate over dataframe blocks

    Each file included in the path (or subdirs of that path) is opened as a
    dataframe and returned in a generator of (cname, rname, dataframe).
    Files are opened by a method inferred from their extension

    Parameters
    ----------
    path : str
        The glob-able path to all files to assemble into a frame
        e.g. gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/*
        See the README for a more detailed explanation
    axis : int, default -1
        The axis to iterate along
        If -1 (the default), iterate over both columns and rows
        If 0, iterate over the rgroups, combining any cgroups
        If 1, iterate over the cgroups, combining any rgroups
    cgroups : list of str, or {str: args}, optional
        The list of cgroups (folder names) to include from the glob path
    rgroups : list of str, optional
        The list of rgroups (file names) to include from the glob path
    read_args : dict, optional
        Any additional keyword args to pass to the read function
    cgroup_args : {cgroup: kwargs}, optional
        Any cgroup specific read arguments, where each key is the name of the
        cgroup and each value is a dictionary of keyword args
    merge : one of 'left', 'right', 'outer', 'inner', default 'inner'
        The merge strategy to pass to pandas.merge, only used when axis=0
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API

    Returns
    -------
    data : generator
        A generator of (cname, rname, dataframe) for each collected path
        If axis=0, yields (rname, dataframe)
        If axis=1, yields (cname, dataframe)
    """
    grouped = _collect(path, cgroups, rgroups, filesystem, tmpdir)

    if axis == -1:
        for cgroup in grouped:
            args = read_args.copy()
            if cgroup in cgroup_args:
                args.update(cgroup_args[cgroup])
            for path in grouped[cgroup]:
                yield _cname(path), _rname(path), read_df(path, **args)
    elif axis == 0:
        # find the shared files among all subfolders
        rgroups = _shared_rgroups(grouped)
        for rgroup in sorted(rgroups):
            frames = []
            for cgroup in grouped:
                path = next(d for d in grouped[cgroup] if _rname(d) == rgroup)
                args = read_args.copy()
                if cgroup in cgroup_args:
                    args.update(cgroup_args[cgroup])
                frames.append(read_df(path, **args))
            yield rgroup, _merge_all(frames, merge=merge)
    elif axis == 1:
        for cgroup in grouped:
            files = grouped[cgroup]
            args = read_args.copy()
            if cgroup in cgroup_args:
                args.update(cgroup_args[cgroup])
            yield cgroup, pd.concat(read_df(path, **args) for path in files)
    else:
        raise ValueError("Invalid choice for axis, options are -1, 0, 1")
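# A short sketch of iterating along each axis, using the tuple shapes the
# docstring documents; the glob path and folder layout are assumptions for
# illustration.
for cname, rname, df in iterate("data/*/*"):      # axis=-1: one block at a time
    print(cname, rname, len(df))

for rname, df in iterate("data/*/*", axis=0):     # cgroups merged per rgroup
    print(rname, len(df))

for cname, df in iterate("data/*/*", axis=1):     # rgroups concatenated per cgroup
    print(cname, len(df))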
def _reload(data, path, **kwargs):
    d = LocalDataFile(path, path)
    io.write_df(data, d, **kwargs)
    d = LocalDataFile(path, path)
    df = io.read_df(d)
    return df