def register_file_by_path(self, path):
    """
    Load a delimited text file and register it under its base file name.

    The file is parsed with each candidate separator (comma, pipe, tab) in
    turn; the first attempt that yields both a dataframe and a columnar
    table is registered through ``register_dataframe`` and
    ``register_columnar_tb``.  If no attempt succeeds, a failure message and
    the last captured exception are printed.
    """
    directory, basename = os.path.split(path)
    tablename = os.path.splitext(basename)[0]
    fpath = os.path.join(directory, basename)

    last_error = None
    for delimiter in (',', '|', '\t'):
        frame = None  # TODO delete
        arrow_table = None
        try:
            with openfile(fpath) as handle:
                frame = pandas.read_csv(handle, sep=delimiter)  # TODO delete
                arrow_table = pa_tb.from_pandas(frame)
        except Exception as err:
            # Remember the failure and fall through to the next separator.
            last_error = err

        if frame is None or arrow_table is None:
            continue

        self.register_dataframe(tablename, frame)
        self.register_columnar_tb(tablename, arrow_table)
        break
    else:
        # Only reached when no separator produced a usable table.
        print("Failed to read data file %s" % (fpath))
        print(last_error)
def _geopandas_to_arrow(df, index=None):
    """
    Build the Arrow table used by both to_parquet and to_feather.

    Emits a UserWarning about the provisional metadata specification,
    validates ``df``, encodes it with ``_encode_wkb``, converts it with
    ``Table.from_pandas`` (``index`` is forwarded as ``preserve_index``) and
    returns the table with the encoded geo metadata stored under the
    ``b"geo"`` schema-metadata key.
    """
    from pyarrow import Table

    message = (
        "this is an initial implementation of Parquet/Feather file support and "
        "associated metadata. This is tracking version 0.1.0 of the metadata "
        "specification at "
        "https://github.com/geopandas/geo-arrow-spec\n\n"
        "This metadata specification does not yet make stability promises. "
        "We do not yet recommend using this in a production setting unless you "
        "are able to rewrite your Parquet/Feather files.\n\n"
        "To further ignore this warning, you can do: \n"
        "import warnings; warnings.filterwarnings('ignore', "
        "message='.*initial implementation of Parquet.*')"
    )
    warnings.warn(message, UserWarning, stacklevel=4)

    _validate_dataframe(df)

    # The geo metadata has to describe the frame as it was passed in, so it
    # is computed before the frame is encoded.
    geo_metadata = _create_metadata(df)

    encoded_df = _encode_wkb(df)
    arrow_table = Table.from_pandas(encoded_df, preserve_index=index)

    # File-level metadata must be attached AFTER the table exists, otherwise
    # it is not persisted.
    schema_metadata = arrow_table.schema.metadata
    schema_metadata.update({b"geo": _encode_metadata(geo_metadata)})
    return arrow_table.replace_schema_metadata(schema_metadata)
def test_parquet_invalid_metadata(tmpdir, geo_meta, error):
    """Reading a file whose geo metadata is invalid raises a ValueError.

    The parquet file is written directly with pyarrow here (instead of via
    the regular writer) so the test fully controls the schema metadata that
    ends up in the file.
    """
    from pyarrow import parquet, Table

    test_dataset = "naturalearth_lowres"
    geo_frame = read_file(get_path(test_dataset))

    # Downgrade to a plain DataFrame and store the geometry as WKB.
    plain_frame = DataFrame(geo_frame)
    plain_frame["geometry"] = to_wkb(plain_frame["geometry"].values)

    arrow_table = Table.from_pandas(plain_frame)
    schema_metadata = arrow_table.schema.metadata
    schema_metadata.update(geo_meta)
    arrow_table = arrow_table.replace_schema_metadata(schema_metadata)

    filename = os.path.join(str(tmpdir), "test.pq")
    parquet.write_table(arrow_table, filename)

    with pytest.raises(ValueError, match=error):
        read_parquet(filename)
def data_table(self):
    """Return a fixed four-row Brand/Price frame converted via ``Table.from_pandas``."""
    column_order = ['Brand', 'Price']
    records = {
        'Brand': ['Honda Civic', 'Toyota Corolla', 'Ford Focus', 'Audi A4'],
        'Price': [22000, 25000, 27000, 35000],
    }
    frame = DataFrame(records, columns=column_order)
    return Table.from_pandas(frame)
def hand_in_result(self):
    """
    Evaluate the GroupBy operator over the child operator's result.

    GroupBy works as follows:
    * Construct and populate a table of groups keyed by the tuple of values
      produced by the group_exprs expressions
    * Iterate through each group, compose and populate all tuples that
      conform to this operator's output schema (see self.init_schema)

    Returns a ``ListColumns`` over ``self.schema``.  Its columns are ``None``
    when the child result reports termination, and the plain projection of
    the child result when there are no grouping expressions.
    """
    handin_res = self.c.hand_in_result()
    if handin_res.is_terminate():
        return ListColumns(self.schema, None)

    # schema for non-aggregation project exprs
    termrow = ListColumns(self.group_term_schema)

    groupval_cols = [expr(handin_res) for expr in self.group_exprs]

    if not groupval_cols:
        new_columns = [expr(handin_res) for expr in self.project_exprs]
        return ListColumns(self.schema, new_columns)

    # groupval tuple -> (attribute values of the group's first row, row indexes)
    #
    # The tuple itself is the key.  Keying on hash(groupval) merges distinct
    # groups whenever their hashes collide (in CPython, for instance,
    # hash((-1,)) == hash((-2,))).  Dicts preserve insertion order, so groups
    # are still emitted in order of first appearance.
    groups = {}
    for idx in range(groupval_cols[0].length()):
        groupval = tuple(col[idx] for col in groupval_cols)
        if groupval not in groups:
            attrvals = [attr(handin_res)[idx] for attr in self.group_attrs]
            groups[groupval] = (attrvals, [])
        groups[groupval][1].append(idx)

    res_rows = []
    for attrvals, group in groups.values():
        # Restrict every input column to the rows that belong to this group.
        group_list_columns = ListColumns(
            handin_res.schema,
            [col.take(group) if col else None for col in handin_res])
        row = []
        for expr in self.project_exprs:
            if expr.is_type(AggFunc):
                row.append(expr(group_list_columns).as_py())
            else:
                # Non-aggregate expressions are evaluated against the
                # group's attribute values.
                termrow.columns = attrvals
                row.append(expr(termrow).as_py())
        res_rows.append(row)

    return ListColumns(self.schema,
                       Table.from_pandas(pd.DataFrame(res_rows)).columns)
def save_orc_file(dataframe, filepath):
    """Write a copy of ``dataframe`` to ``filepath`` as an ORC file.

    Columns whose dtype name is ``"category"`` are cast to
    ``"string[pyarrow]"`` on the copy before conversion; the index is not
    written.
    """
    from pyarrow import Table, orc

    frame = dataframe.copy()
    categorical = [name for name in frame
                   if frame[name].dtype.name == "category"]
    for name in categorical:
        frame[name] = frame[name].astype("string[pyarrow]")

    orc.write_table(Table.from_pandas(frame, preserve_index=False), filepath)
def convert_csv_to_parquet(infile, outfile):
    """
    Convert a csv file into a parquet file.

    ``infile`` is read with its first row as the header.  When ``outfile``
    is falsy, the output name is ``get_file_name(infile) + '.parquet'``.
    Any error raised while reading, converting or writing propagates to the
    caller unchanged.
    """
    # The former ``except BaseException as e: raise e`` wrapper only
    # re-raised the same exception, so it was dropped.
    csv_data = pd.read_csv(infile, index_col=False, header=0)
    csv_table = Table.from_pandas(csv_data, preserve_index=True)
    if not outfile:
        outfile = get_file_name(infile) + '.parquet'
    parquet.write_table(csv_table, outfile)
def _geopandas_to_arrow(df, index=None):
    """
    Build the Arrow table used by both to_parquet and to_feather.

    Validates ``df``, converts it with ``df.to_wkb()`` and
    ``Table.from_pandas`` (``index`` is forwarded as ``preserve_index``) and
    returns the table with the encoded geo metadata stored under the
    ``b"geo"`` schema-metadata key.
    """
    from pyarrow import Table

    _validate_dataframe(df)

    # The geo metadata has to describe the frame as it was passed in, so it
    # is computed before the frame is converted.
    geo_metadata = _create_metadata(df)

    wkb_frame = df.to_wkb()
    arrow_table = Table.from_pandas(wkb_frame, preserve_index=index)

    # File-level metadata must be attached AFTER the table exists, otherwise
    # it is not persisted.
    schema_metadata = arrow_table.schema.metadata
    schema_metadata.update({b"geo": _encode_metadata(geo_metadata)})
    return arrow_table.replace_schema_metadata(schema_metadata)
def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Write a Table to Parquet files under ``root_path``, split by partition.

    One sub-directory level is created per partition column, and each
    distinct combination of partition values receives its own file:

        root_dir/
          group1=value1
            group2=value1
              <uuid>.parquet
            group2=value2
              <uuid>.parquet
          group1=valueN
            group2=value1
              <uuid>.parquet
            group2=valueN
              <uuid>.parquet

    Without partition columns the whole table is written to a single
    ``<uuid>.parquet`` file directly under ``root_path``.

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.

    Raises
    ------
    ValueError
        If no columns remain once the partition columns are removed.
    """
    from pyarrow import (Table, compat)

    fs = (LocalFileSystem.get_instance() if filesystem is None
          else _ensure_filesystem(filesystem))

    _mkdir_if_not_exists(fs, root_path)

    # Unpartitioned case: one file straight under the root directory.
    if partition_cols is None or len(partition_cols) == 0:
        full_path = "/".join([root_path, compat.guid() + ".parquet"])
        with fs.open(full_path, 'wb') as sink:
            write_table(table, sink, **kwargs)
        return

    frame = table.to_pandas()
    partition_keys = [frame[col] for col in partition_cols]
    data_frame = frame.drop(partition_cols, axis='columns')
    remaining_cols = frame.columns.drop(partition_cols)
    if len(remaining_cols) == 0:
        raise ValueError("No data left to save outside partition columns")

    for keys, subgroup in data_frame.groupby(partition_keys):
        # A single partition column yields a scalar key; normalise to tuple.
        if not isinstance(keys, tuple):
            keys = (keys, )

        segments = []
        for name, val in zip(partition_cols, keys):
            segments.append("{colname}={value}".format(colname=name,
                                                       value=val))
        subdir = "/".join(segments)

        subtable = Table.from_pandas(subgroup,
                                     preserve_index=preserve_index)

        prefix = "/".join([root_path, subdir])
        _mkdir_if_not_exists(fs, prefix)

        full_path = "/".join([prefix, compat.guid() + ".parquet"])
        with fs.open(full_path, 'wb') as sink:
            write_table(subtable, sink, **kwargs)
def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Write a Table to Parquet files under ``root_path``, split by partition.

    One sub-directory level is created per partition column, and each
    distinct combination of partition values receives its own file:

        root_dir/
          group1=value1
            group2=value1
              <uuid>.parquet
            group2=value2
              <uuid>.parquet
          group1=valueN
            group2=value1
              <uuid>.parquet
            group2=valueN
              <uuid>.parquet

    Without partition columns the whole table is written to a single
    ``<uuid>.parquet`` file directly under ``root_path``.

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, the filesystem is obtained from ``root_path``
        via ``_get_fs_from_path``
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.

    Raises
    ------
    ValueError
        If no columns remain once the partition columns are removed.
    """
    from pyarrow import (
        Table, compat
    )

    if filesystem is not None:
        fs = _ensure_filesystem(filesystem)
    else:
        fs = _get_fs_from_path(root_path)

    _mkdir_if_not_exists(fs, root_path)

    partitioned = partition_cols is not None and len(partition_cols) > 0
    if not partitioned:
        # Single output file directly below the dataset root.
        target = "/".join([root_path, compat.guid() + ".parquet"])
        with fs.open(target, 'wb') as out:
            write_table(table, out, **kwargs)
        return

    source_df = table.to_pandas()
    partition_keys = [source_df[col] for col in partition_cols]
    payload_df = source_df.drop(partition_cols, axis='columns')
    payload_cols = source_df.columns.drop(partition_cols)
    if len(payload_cols) == 0:
        raise ValueError("No data left to save outside partition columns")

    for keys, subgroup in payload_df.groupby(partition_keys):
        # groupby returns a bare value when grouping by a single key.
        if not isinstance(keys, tuple):
            keys = (keys,)

        subdir = "/".join(
            "{colname}={value}".format(colname=name, value=val)
            for name, val in zip(partition_cols, keys))

        subtable = Table.from_pandas(subgroup,
                                     preserve_index=preserve_index)

        prefix = "/".join([root_path, subdir])
        _mkdir_if_not_exists(fs, prefix)

        target = "/".join([prefix, compat.guid() + ".parquet"])
        with fs.open(target, 'wb') as out:
            write_table(subtable, out, **kwargs)
def _to_parquet(df, path, compression="snappy", index=None, **kwargs):
    """
    Write a GeoDataFrame to the Parquet format.

    Any geometry columns present are serialized to WKB format in the file.

    Requires 'pyarrow'.

    WARNING: this is an initial implementation of Parquet file support and
    associated metadata.  This is tracking version 0.1.0 of the metadata
    specification at:
    https://github.com/geopandas/geo-arrow-spec

    This metadata specification does not yet make stability promises.  As
    such, we do not yet recommend using this in a production setting unless
    you are able to rewrite your Parquet files.

    .. versionadded:: 0.8

    Parameters
    ----------
    path : str, path object
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    index : bool, default None
        If ``True``, always include the dataframe's index(es) as columns
        in the file output.
        If ``False``, the index(es) will not be written to the file.
        If ``None``, the index(es) will be included as columns in the file
        output except `RangeIndex` which is stored as metadata only.
    kwargs
        Additional keyword arguments passed to
        pyarrow.parquet.write_table().
    """
    import_optional_dependency(
        "pyarrow.parquet", extra="pyarrow is required for Parquet support.")
    from pyarrow import parquet, Table

    message = (
        "this is an initial implementation of Parquet file support and "
        "associated metadata. This is tracking version 0.1.0 of the metadata "
        "specification at "
        "https://github.com/geopandas/geo-arrow-spec\n\n"
        "This metadata specification does not yet make stability promises. "
        "We do not yet recommend using this in a production setting unless you "
        "are able to rewrite your Parquet files.\n\n"
        "To further ignore this warning, you can do: \n"
        "import warnings; warnings.filterwarnings('ignore', "
        "message='.*initial implementation of Parquet.*')"
    )
    warnings.warn(message, UserWarning, stacklevel=3)

    _validate_dataframe(df)

    # The geo metadata has to describe the frame as it was passed in, so it
    # is computed before the frame is encoded.
    geo_metadata = _create_metadata(df)

    encoded_df = _encode_wkb(df)
    arrow_table = Table.from_pandas(encoded_df, preserve_index=index)

    # File-level metadata must be attached AFTER the table exists, otherwise
    # it is not persisted.
    schema_metadata = arrow_table.schema.metadata
    schema_metadata.update({b"geo": _encode_metadata(geo_metadata)})
    arrow_table = arrow_table.replace_schema_metadata(schema_metadata)

    parquet.write_table(arrow_table, path, compression=compression, **kwargs)
def write(self, df, path):
    """Validate ``df`` and write it as parquet to ``path``.

    Duplicate-column validation always runs; the categorical-column check
    runs only when ``self._check_dtypes`` is truthy.  ``path`` must provide
    an ``open('wb')`` method.
    """
    self._check_no_duplicate_cols(df)
    if self._check_dtypes:
        self._check_no_categorical_cols(df)

    with path.open('wb') as sink:
        # Conversion stays inside the ``with`` so the file is opened first,
        # exactly as before.
        arrow_table = Table.from_pandas(df)
        parquet.write_table(arrow_table, sink)
def write_path(self, dataset):
    """Convert ``dataset`` with ``Table.from_pandas`` and write it to ``self.path``."""
    arrow_table = Table.from_pandas(dataset)
    destination = self.path
    write_table(arrow_table, destination)
def _upsert_with_existing_file(new_df, existing_path, categories):
    """Merge ``new_df`` with the parquet file at ``existing_path``; return the merged frame."""
    from pandas.api.types import CategoricalDtype

    old_table = read_table(existing_path)
    # Capture the category columns before merging.
    category_cols = _to_category_cols(new_df, categories)
    old_df = old_table.to_pandas()
    # TODO: compare old schema with new
    merged = _upsert_dataframes(new_df, old_df)
    for col, values in category_cols.items():
        # ``astype('category', categories=...)`` is a removed pandas
        # signature; CategoricalDtype is the supported spelling.  Assigning
        # through ``merged[col]`` replaces the column so the new dtype
        # actually takes effect.
        merged[col] = merged[col].astype(CategoricalDtype(categories=values))
    return merged


def upsert_to_dataset(table, root_path, partition_cols=None,
                      filesystem=None, preserve_index=True,
                      temp_folder=None, categories=None, **kwargs):
    """
    Write ``table`` into a parquet dataset, merging with existing files.

    For every target directory (one per partition-value combination, or
    ``root_path`` itself when there are no partition columns):

    * no ``.parquet`` file present: the data is written to a new
      ``<guid>.parquet`` file;
    * exactly one present: it is read, merged with the new data through
      ``_upsert_dataframes`` and written back under the same file name;
    * more than one present: ``ValueError`` is raised.

    Parameters
    ----------
    table : pyarrow.Table
        New data to write.
    root_path : string
        Root directory of the dataset.
    partition_cols : list, optional
        Column names by which to partition the dataset.
    filesystem : FileSystem, default None
        If nothing passed, the filesystem is obtained from ``root_path``
        via ``_get_fs_from_path``.
    preserve_index : bool
        Forwarded to ``Table.from_pandas``.
    temp_folder : string, optional
        When given and existing, files are first written there and then
        moved to their final location.  Ignored if it does not exist.
    categories : optional
        Forwarded to ``_to_category_cols`` to decide which columns are
        converted to categoricals after a merge.
    **kwargs : dict
        Forwarded to ``write_table``.

    Raises
    ------
    ValueError
        If no data columns remain outside the partition columns, or if a
        target directory already holds more than one ``.parquet`` file.

    Notes
    -----
    Existing files are discovered with ``os.listdir``, i.e. on the local
    filesystem, regardless of ``filesystem``.
    """
    if filesystem is None:
        fs = _get_fs_from_path(root_path)
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    # A temp folder that does not exist is silently ignored.
    if temp_folder and not os.path.exists(temp_folder):
        temp_folder = None

    if partition_cols is not None and len(partition_cols) > 0:
        # df is the data in the new table
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        subschema = table.schema
        # ARROW-2891: Ensure the output_schema is preserved when writing a
        # partitioned dataset
        for partition_col in partition_cols:
            subschema = subschema.remove(
                subschema.get_field_index(partition_col))

        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = "/".join(
                ["{colname}={value}".format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])
            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)

            existing_files = [f for f in os.listdir(prefix)
                              if f.endswith('.parquet')]
            if len(existing_files) > 1:
                raise ValueError('Unsupported scenario, multiple files found in path %s' % prefix)

            if len(existing_files) == 1:
                # upsert use case: merge into the file already present
                outfile = existing_files[0]
                full_path = "/".join([prefix, outfile])
                subgroup = _upsert_with_existing_file(
                    subgroup, full_path, categories)
            else:
                # write use case
                outfile = compat.guid() + ".parquet"
                full_path = "/".join([prefix, outfile])

            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index,
                                         schema=subschema)
            write_file = (os.path.join(temp_folder, outfile)
                          if temp_folder else full_path)
            with fs.open(write_file, 'wb') as f:
                write_table(subtable, f, **kwargs)
            if temp_folder:
                shutil.move(write_file, full_path)
    else:
        existing_files = [f for f in os.listdir(root_path)
                          if f.endswith('.parquet')]
        if len(existing_files) > 1:
            raise ValueError('Unsupported scenario, multiple files found in path %s' % root_path)

        if len(existing_files) == 1:
            # append use case
            outfile = existing_files[0]
            full_path = "/".join([root_path, outfile])
            merged = _upsert_with_existing_file(
                table.to_pandas(), full_path, categories)
            table = Table.from_pandas(
                merged,
                preserve_index=preserve_index,
                schema=table.schema
            )
        else:
            # write use case
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([root_path, outfile])

        write_file = (os.path.join(temp_folder, outfile)
                      if temp_folder else full_path)
        with fs.open(write_file, 'wb') as f:
            write_table(table, f, **kwargs)
        if temp_folder:
            shutil.move(write_file, full_path)
from pandas import DataFrame
from pyarrow import parquet, Table

# Script: read a delimited text file by hand into per-column lists of
# strings and store the result as a parquet file without the index.
data_file = '1000SalesRecords.csv'
separator_char = ','

with open(data_file, 'r') as file:
    # The first line carries the column names; surrounding whitespace and
    # double quotes are removed from each name.
    header_line = file.readline().strip()
    headers = [name.strip().strip('"')
               for name in header_line.split(separator_char)]
    file_data = file.read().splitlines()

dict_of_lists = {name: [] for name in headers}

for line in file_data:
    fields = line.strip().split(separator_char)
    # zip() stops at the shorter sequence, so surplus fields are ignored and
    # short rows leave the trailing columns without a value for that row.
    for col_name, raw_value in zip(headers, fields):
        dict_of_lists[col_name].append(raw_value.strip())

df = DataFrame(dict_of_lists)
table = Table.from_pandas(df=df, preserve_index=False)
parquet.write_table(table, 'example_noindex.parquet')
import numpy as np
import pandas as pd
import pickle
import os
from datetime import datetime
import pyarrow.parquet as pq
from pyarrow import Table
import multiprocessing as mp
import collections
import itertools

# Creates a miniature dataset for testing purposes
# with the first 20000 entries.
filename = 'part-00000-8bbff892-97d2-4011-9961-703e38972569.c000.snappy.parquet'

# Load the full parquet file into pandas, then keep only the leading rows.
source_table = pq.read_table(filename)
df = source_table.to_pandas()
mini_df = df.head(20000)

# Convert back to Arrow with a single thread and write the reduced file.
table = Table.from_pandas(mini_df, nthreads=1)
pq.write_table(table, 'mini.parquet')
def write(self, df, file_):
    """Write ``df`` as parquet to ``file_``.

    When ``self._check_dtypes`` is truthy, the categorical-column check runs
    on ``df`` before anything is written.
    """
    check_dtypes = self._check_dtypes
    if check_dtypes:
        self._check_no_categorical_cols(df)

    arrow_table = Table.from_pandas(df)
    parquet.write_table(arrow_table, file_)
def from_frame(data, index=False, schema=None):
    """Convert ``data`` with ``Table.from_pandas``.

    ``index`` is forwarded as ``preserve_index`` and ``schema`` is passed
    through unchanged.
    """
    options = {'preserve_index': index, 'schema': schema}
    return Table.from_pandas(data, **options)
def to_arrow(df):
    """Return ``df`` converted with ``Table.from_pandas`` using its defaults."""
    arrow_table = Table.from_pandas(df)
    return arrow_table