def place(
    df: pd.DataFrame,
    path: str,
    filesystem: FileSystem = FileSystem(),
    tmpdir: str = None,
    **write_args,
) -> None:
    """Place a dataframe block onto the filesystem at the specified path

    Parameters
    ----------
    df : pd.DataFrame
        The data to place
    path : str
        Path to the directory (possibly on GCS) in which to place the columns
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API
    tmpdir : str
        Local temporary directory used to stage the file before copying it to path
    write_args : dict
        Any additional args to pass to the write function
    """
    fname = os.path.basename(path)
    tmp = os.path.join(tmpdir, fname)
    write_df(df, tmp, **write_args)
    filesystem.copy(tmp, path)
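# Illustrative usage sketch for place() above, not part of the library. It
# assumes the default FileSystem can copy to the destination path and that
# write_df infers the parquet format from the ".pq" extension; the bucket
# path and column names are made-up example values.
def _example_place_usage():
    import tempfile

    df = pd.DataFrame({"id": [1, 2, 3], "score": [0.1, 0.2, 0.3]})
    with tempfile.TemporaryDirectory() as tmpdir:
        # Stage the block locally in tmpdir, then copy it to its destination
        place(df, "gs://my-bucket/data/example.pq", tmpdir=tmpdir)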
def _reload(data, path, **kwargs):
    # Round-trip helper: write the dataframe through a DataFile, then read it back
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(data, d, **kwargs)
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    return df
def test_compression_parquet(randomdata, datadir_local):
    pytest.importorskip("pyarrow")
    pytest.importorskip("pandas", minversion="0.22.0")
    path = os.path.join(datadir_local, "tmp.parquet.gz")

    # write compressed with pandas
    randomdata.to_parquet(path, compression="gzip")
    # read compressed with blocks
    d = LocalDataFile(path, path)
    df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()

    # write compressed with blocks
    d = LocalDataFile(path, path)
    io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_parquet(path)
    assert np.isclose(df, randomdata).all().all()
def place(df, path, filesystem=GCSFileSystem(), **write_args):
    """
    Place a dataframe block onto the filesystem at the specified path

    Parameters
    ----------
    df : pd.DataFrame
        The data to place
    path : str
        Path to the directory (possibly on GCS) in which to place the columns
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API
    write_args : dict
        Any additional args to pass to the write function
    """
    bucket, fname = os.path.dirname(path), os.path.basename(path)
    with filesystem.store(bucket, [fname]) as datafiles:
        write_df(df, datafiles[0], **write_args)
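# Minimal sketch of the store() pattern this version of place() is built on:
# filesystem.store(bucket, names) yields writable datafiles and persists them
# when the context exits. The bucket path, file name, and column values are
# hypothetical; this mirrors the body of place() rather than defining new API.
def _example_store_pattern():
    df = pd.DataFrame({"id": [1, 2], "value": [10.0, 20.0]})
    filesystem = GCSFileSystem()
    with filesystem.store("gs://my-bucket/data", ["example.pq"]) as datafiles:
        # The datafile is uploaded to gs://my-bucket/data/example.pq on exit
        write_df(df, datafiles[0])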
def test_compression_pickle(randomdata, datadir_local):
    path = os.path.join(datadir_local, 'tmp.pkl.gz')

    # write compressed with pandas
    randomdata.to_pickle(path, compression='gzip')
    # read compressed with blocks
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()

    # write compressed with blocks
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_pickle(path, compression='gzip')
    assert np.isclose(df, randomdata).all().all()
def test_compression_parquet(randomdata, datadir_local):
    pytest.importorskip('pyarrow')
    pytest.importorskip('pandas', minversion='0.22.0')
    path = os.path.join(datadir_local, 'tmp.parquet.gz')

    # write compressed with pandas
    randomdata.to_parquet(path, compression='gzip')
    # read compressed with blocks
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()

    # write compressed with blocks
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_parquet(path)
    assert np.isclose(df, randomdata).all().all()
def divide(
    df: pd.DataFrame,
    path: str,
    n_rgroup: int = 1,
    rgroup_offset: int = 0,
    cgroup_columns: Optional[Dict[Optional[cgroup], Sequence[str]]] = None,
    extension: str = ".pq",
    convert: bool = False,
    filesystem: FileSystem = FileSystem(),
    prefix=None,
    tmpdir: str = None,
    **write_args,
) -> None:
    """Split a dataframe into rgroups/cgroups and save to disk

    Note that this splitting does not preserve the original index,
    so make sure to have another column to track values

    Parameters
    ----------
    df : pd.DataFrame
        The data to divide
    path : str
        Path to the directory (possibly on GCS) in which to place the columns
    n_rgroup : int, default 1
        The number of row groups to partition the data into
        The rgroups will have approximately equal sizes
    rgroup_offset : int, default 0
        The index to start from in the name of file parts
        e.g. If rgroup_offset=10 then the first file will be `part_00010.pq`
    cgroup_columns : {cgroup: list of column names}
        The column lists to form cgroups; if None, do not make cgroups
        Each key is the name of the cgroup, and each value is the list of columns to include
        To reassemble later make sure to include join keys for each cgroup
    extension : str, default .pq
        The file extension for the dataframe (file type inferred from this extension)
    convert : bool, default False
        If true attempt to coerce types to numeric. This can avoid issues with
        ambiguous object columns but requires additional time
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API
    prefix : str
        Prefix to add to written filenames
    tmpdir : str
        Local temporary directory used to stage files before copying them to path
    write_args : dict
        Any additional args to pass to the write function
    """
    # Use a single dummy cgroup if None wanted
    if cgroup_columns is None:
        cgroup_columns = {None: df.columns}

    # Add leading dot if not in extension
    if extension[0] != ".":
        extension = "." + extension

    if convert:
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="ignore")

    files = []
    for cname, columns in cgroup_columns.items():
        cgroup = df[columns]
        bucket = os.path.join(path, cname) if cname else path
        tmp_cgroup = os.path.join(tmpdir, cname) if cname else tmpdir
        if not filesystem.isdir(tmp_cgroup):
            filesystem.mkdir(tmp_cgroup)

        rnames = [
            "part_{:05d}{}".format(i + rgroup_offset, extension)
            for i in range(n_rgroup)
        ]
        if prefix is not None:
            rnames = [prefix + "_" + rn for rn in rnames]

        for rgroup, rname in zip(np.array_split(cgroup, n_rgroup), rnames):
            tmp = os.path.join(tmp_cgroup, rname)
            write_df(rgroup.reset_index(drop=True), tmp, **write_args)
            files.append((cname, rname) if cname else (rname,))

    filesystem.copy(
        [os.path.join(tmpdir, *f) for f in files],
        [os.path.join(path, *f) for f in files],
    )
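# Illustrative usage sketch for divide() above, not part of the library. It
# assumes the default FileSystem can stage files in tmpdir and copy them to
# the destination; the bucket path, cgroup names, and columns are made-up
# example values chosen to show the rgroup/cgroup layout.
def _example_divide_usage():
    import tempfile

    df = pd.DataFrame({
        "key": [0, 1, 2, 3, 4, 5],
        "x": [0.0, 0.5, 1.0, 1.5, 2.0, 2.5],
        "y": [0, 1, 0, 1, 0, 1],
    })
    # Two cgroups sharing the "key" join column, split into three rgroups:
    # this should produce <path>/features/part_00000.pq ... part_00002.pq
    # and <path>/labels/part_00000.pq ... part_00002.pq
    cgroups = {"features": ["key", "x"], "labels": ["key", "y"]}
    with tempfile.TemporaryDirectory() as tmpdir:
        divide(df, "gs://my-bucket/block", n_rgroup=3,
               cgroup_columns=cgroups, tmpdir=tmpdir)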
def _reload(data, path, **kwargs):
    # Round-trip helper: write the dataframe with blocks.io, then read it back
    d = LocalDataFile(path, path)
    io.write_df(data, d, **kwargs)
    d = LocalDataFile(path, path)
    df = io.read_df(d)
    return df
def divide(
    df,
    path,
    n_rgroup=1,
    rgroup_offset=0,
    cgroup_columns=None,
    extension='.pq',
    convert=False,
    filesystem=GCSFileSystem(),
    **write_args
):
    """
    Split a dataframe into rgroups/cgroups and save to disk

    Note that this splitting does not preserve the original index,
    so make sure to have another column to track values

    Parameters
    ----------
    df : pd.DataFrame
        The data to divide
    path : str
        Path to the directory (possibly on GCS) in which to place the columns
    n_rgroup : int, default 1
        The number of row groups to partition the data into
        The rgroups will have approximately equal sizes
    rgroup_offset : int, default 0
        The index to start from in the name of file parts
        e.g. If rgroup_offset=10 then the first file will be `part_00010.pq`
    cgroup_columns : {cgroup: list of column names}
        The column lists to form cgroups; if None, do not make cgroups
        Each key is the name of the cgroup, and each value is the list of columns to include
        To reassemble later make sure to include join keys for each cgroup
    extension : str, default .pq
        The file extension for the dataframe (file type inferred from this extension)
    convert : bool, default False
        If true attempt to coerce types to numeric. This can avoid issues with
        ambiguous object columns but requires additional time
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API
    write_args : dict
        Any additional args to pass to the write function
    """
    # Use a single dummy cgroup if None wanted
    if cgroup_columns is None:
        cgroup_columns = {None: df.columns}

    # Add leading dot if not in extension
    if extension[0] != '.':
        extension = '.' + extension

    if convert:
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='ignore')

    for cname, columns in cgroup_columns.items():
        cgroup = df[columns]
        bucket = os.path.join(path, cname) if cname else path
        rnames = ['part_{:05d}{}'.format(i + rgroup_offset, extension) for i in range(n_rgroup)]
        with filesystem.store(bucket, rnames) as datafiles:
            for rgroup, d in zip(np.array_split(cgroup, n_rgroup), datafiles):
                write_df(rgroup.reset_index(drop=True), d, **write_args)
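# Illustrative sketch of rgroup_offset with this version of divide(), based on
# the docstring semantics above: the offset only changes the part numbering,
# so a second call with an offset appends parts rather than overwriting the
# earlier ones. The bucket path and columns are made-up example values.
def _example_divide_offset():
    first = pd.DataFrame({'key': range(4), 'x': range(4)})
    second = pd.DataFrame({'key': range(4, 8), 'x': range(4, 8)})
    # Writes part_00000.pq and part_00001.pq under the path
    divide(first, 'gs://my-bucket/block', n_rgroup=2)
    # Starts naming at part_00002.pq, leaving the first two parts in place
    divide(second, 'gs://my-bucket/block', n_rgroup=2, rgroup_offset=2)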