def _write_dataframe(dataframe, path):
    """Write *dataframe* to *path* as a single parquet file (index dropped).

    NOTE(review): this definition is shadowed by a later ``_write_dataframe``
    in the same module; consider removing one of the two.
    """
    target = data_util.make_data_path(path)
    _make_directory_if_needed(target)
    arrow_table = pa.Table.from_pandas(dataframe, preserve_index=False)
    pq.write_table(arrow_table, target)
def _write_dataframe(dataframe, path):
    """Write *dataframe* to *path* as parquet, preparing local directories first.

    Paths beginning with ``hdfs://`` are passed through untouched; any other
    path is resolved via ``data_util.make_data_path``, an existing directory at
    that location is removed, and the needed directory is (re)created.

    Args:
        dataframe: pandas DataFrame to persist (index is dropped).
        path: destination path, local or ``hdfs://``.

    Raises:
        ValueError: if the DataFrame has no columns.
    """
    if dataframe.columns.empty:
        # ValueError is more precise than a bare Exception for input
        # validation, and remains backward-compatible with callers that
        # catch Exception.
        raise ValueError('Empty DataFrame cannot be written.')
    if not path.startswith('hdfs://'):
        path = data_util.make_data_path(path)
        _remove_exist_directory_if_dir(path)
        _make_directory_if_needed(path)
    table = pa.Table.from_pandas(dataframe, preserve_index=False)
    pq.write_table(table, path)
def write_to_dataset(table, root_path, partition_cols=None, **kwargs):
    """Write an Arrow *table* as parquet under *root_path*.

    With *partition_cols*, the data is split hive-style into
    ``col=value/...`` subdirectories, one parquet file per partition group;
    otherwise a single randomly-named parquet file is written directly under
    *root_path*. Extra **kwargs are forwarded to ``pq.write_table``.

    Raises:
        ValueError: if every column is a partition column (nothing to save).
    """
    _make_directory_if_needed(brtc_data_utils.make_data_path(root_path))
    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")
        for keys, subgroup in data_df.groupby(partition_keys):
            # A single partition column yields a scalar key; normalize to tuple.
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = "/".join([
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            subtable = pa.Table.from_pandas(subgroup, preserve_index=False)
            prefix = "/".join([root_path, subdir])
            _make_directory_if_needed(brtc_data_utils.make_data_path(prefix))
            outfile = compat.guid() + ".parquet"
            # Resolve the data path exactly once (the original applied
            # make_data_path to full_path three times and created the same
            # directory twice).
            full_path = brtc_data_utils.make_data_path(
                "/".join([prefix, outfile]))
            _make_directory_if_needed(full_path)
            pq.write_table(subtable, full_path, **kwargs)
    else:
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        pq.write_table(table, brtc_data_utils.make_data_path(full_path),
                       **kwargs)
def to_parquet(df, path, njobs=4):
    """Write *df* as parquet split across up to *njobs* files, in parallel.

    Rows are assigned to groups uniformly at random, each group is written by
    ``_write_parquet`` on a worker thread, and the list of sub-file paths is
    returned.

    Args:
        df: pandas DataFrame to write.
        path: destination directory (resolved via brtc_data_utils); must not
            already exist (``os.makedirs`` raises if it does).
        njobs: number of output files and worker threads.

    Returns:
        List of paths of the written sub-files.
    """
    # Removed leftover debug print(path) calls from the original.
    path = brtc_data_utils.make_data_path(path)
    os.makedirs(path)
    # Size the pool to njobs — the original created a default-sized pool,
    # silently ignoring the njobs parameter.
    pool = multiprocessing.pool.ThreadPool(njobs)
    paths = []
    pending = []
    try:
        for grp, sample in df.groupby(
                lambda _: np.random.choice(range(njobs), 1)[0]):
            sub_path = os.path.join(path, '{}'.format(grp))
            paths.append(sub_path)
            # Keep the AsyncResult so worker exceptions are not swallowed.
            pending.append(pool.apply_async(_write_parquet, (sample, sub_path)))
    finally:
        pool.close()
        pool.join()
    for result in pending:
        result.get()  # re-raises any exception from the worker thread
    return paths
def read_parquet(path):
    """Read a parquet dataset from *path*, resolved via ``data_util``."""
    resolved = data_util.make_data_path(path)
    return table_reader.read_parquet(resolved)