def read_table(source, columns=None, nthreads=1, metadata=None):
    """
    Read a Table from Parquet format

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Location of Parquet dataset. If a string is passed, it can be a
        single file name or directory name. For passing Python file objects
        or byte buffers, see pyarrow.io.PythonFileInterface or
        pyarrow.io.BufferReader.
    columns : list
        If not None, only these columns will be read from the file.
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is threadsafe.
    metadata : FileMetaData
        If separately computed

    Returns
    -------
    pyarrow.Table
        Content of the file as a table (of columns)
    """
    if is_string(source):
        fs = LocalFilesystem.get_instance()
        if fs.isdir(source):
            return fs.read_parquet(source, columns=columns,
                                   metadata=metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, nthreads=nthreads)
def read_pandas(source, columns=None, nthreads=1, metadata=None):
    """
    Read a Table from Parquet format, reconstructing the index values if
    available.

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Location of Parquet dataset. If a string is passed, it can be a
        single file name. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    columns : list
        If not None, only these columns will be read from the file.
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is threadsafe.
    metadata : FileMetaData
        If separately computed

    Returns
    -------
    pyarrow.Table
        Content of the file as a Table of Columns, including DataFrame
        indexes as Columns.
    """
    if is_string(source):
        fs = LocalFilesystem.get_instance()
        if fs.isdir(source):
            raise NotImplementedError(
                'Reading a directory of Parquet files with DataFrame index '
                'metadata is not yet supported'
            )
    pf = ParquetFile(source, metadata=metadata)
    return pf.read_pandas(columns=columns, nthreads=nthreads)
def read_table(source, columns=None, nthreads=1, metadata=None):
    """
    Read a Table from Parquet format

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Location of Parquet dataset. If a string is passed, it can be a
        single file name or directory name. For passing Python file objects
        or byte buffers, see pyarrow.io.PythonFileInterface or
        pyarrow.io.BufferReader.
    columns : list
        If not None, only these columns will be read from the file.
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is threadsafe.
    metadata : FileMetaData
        If separately computed

    Returns
    -------
    pyarrow.Table
        Content of the file as a table (of columns)
    """
    from pyarrow.filesystem import LocalFilesystem

    if isinstance(source, six.string_types):
        fs = LocalFilesystem.get_instance()
        if fs.isdir(source):
            return fs.read_parquet(source, columns=columns,
                                   metadata=metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, nthreads=nthreads)
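# Illustrative call to read_table above (a minimal sketch; the file name is
# hypothetical and not part of this module):
#
#   import pyarrow.parquet as pq
#   table = pq.read_table('example.parquet', columns=['a', 'b'], nthreads=2)
#   df = table.to_pandas()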
def _generate_partition_directories(base_dir, partition_spec, df):
    # partition_spec : list of lists, e.g. [['foo', [0, 1, 2]],
    #                                       ['bar', ['a', 'b', 'c']]]
    # part_table : a pyarrow.Table to write to each partition
    DEPTH = len(partition_spec)
    fs = LocalFilesystem.get_instance()

    def _visit_level(base_dir, level, part_keys):
        name, values = partition_spec[level]
        for value in values:
            this_part_keys = part_keys + [(name, value)]

            level_dir = pjoin(base_dir, '{0}={1}'.format(name, value))
            fs.mkdir(level_dir)

            if level == DEPTH - 1:
                # Generate example data
                file_path = pjoin(level_dir, 'data.parq')
                filtered_df = _filter_partition(df, this_part_keys)
                part_table = pa.Table.from_pandas(filtered_df)
                _write_table(part_table, file_path)
            else:
                _visit_level(level_dir, level + 1, this_part_keys)

    _visit_level(base_dir, 0, [])
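# For illustration: a partition_spec of [['foo', [0, 1]], ['bar', ['a', 'b']]]
# makes _visit_level above create a Hive-style tree under base_dir, writing
# one data file per leaf directory:
#
#   base_dir/foo=0/bar=a/data.parq
#   base_dir/foo=0/bar=b/data.parq
#   base_dir/foo=1/bar=a/data.parq
#   base_dir/foo=1/bar=b/data.parq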
def read_pandas(source, columns=None, nthreads=1, metadata=None):
    """
    Read a Table from Parquet format, reconstructing the index values if
    available.

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Location of Parquet dataset. If a string is passed, it can be a
        single file name. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    columns : list
        If not None, only these columns will be read from the file.
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is threadsafe.
    metadata : FileMetaData
        If separately computed

    Returns
    -------
    pyarrow.Table
        Content of the file as a Table of Columns, including DataFrame
        indexes as Columns.
    """
    if is_string(source):
        fs = LocalFilesystem.get_instance()
        if fs.isdir(source):
            raise NotImplementedError(
                'Reading a directory of Parquet files with DataFrame index '
                'metadata is not yet supported')
    pf = ParquetFile(source, metadata=metadata)
    return pf.read_pandas(columns=columns, nthreads=nthreads)
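# Illustrative round trip through read_pandas above (a sketch; the DataFrame
# and file name are hypothetical):
#
#   import pandas as pd
#   import pyarrow as pa
#   import pyarrow.parquet as pq
#
#   df = pd.DataFrame({'a': [1, 2, 3]}, index=pd.Index(['x', 'y', 'z']))
#   pq.write_table(pa.Table.from_pandas(df), 'example.parquet')
#   roundtripped = pq.read_pandas('example.parquet').to_pandas()
#   assert roundtripped.index.equals(df.index)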
def __init__(self, path_or_paths, filesystem=None, schema=None,
             metadata=None, split_row_groups=False, validate_schema=True):
    if filesystem is None:
        self.fs = LocalFilesystem.get_instance()
    else:
        self.fs = filesystem

    self.paths = path_or_paths

    (self.pieces,
     self.partitions,
     self.metadata_path) = _make_manifest(path_or_paths, self.fs)

    self.metadata = metadata
    self.schema = schema

    self.split_row_groups = split_row_groups

    if split_row_groups:
        raise NotImplementedError("split_row_groups not yet implemented")

    if validate_schema:
        self.validate_schemas()
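# Illustrative construction, assuming this is the ParquetDataset constructor
# (a sketch; the dataset path is hypothetical):
#
#   import pyarrow.parquet as pq
#   dataset = pq.ParquetDataset('/path/to/dataset')
#   table = dataset.read(columns=['a'])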
def __init__(self, dirpath, filesystem=None, pathsep='/',
             partition_scheme='hive'):
    self.filesystem = filesystem or LocalFilesystem.get_instance()
    self.pathsep = pathsep
    self.dirpath = dirpath
    self.partition_scheme = partition_scheme
    self.partitions = ParquetPartitions()
    self.pieces = []

    self.common_metadata_path = None
    self.metadata_path = None

    self._visit_level(0, self.dirpath, [])
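# Sketch of the layout this constructor handles under the default 'hive'
# partition_scheme (hypothetical paths): _visit_level starts at self.dirpath
# and descends through key=value directories such as
#
#   dirpath/year=2017/month=1/data.parq
#
# collecting each data file as a piece and each key=value pair into
# self.partitions.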
                           ArrowNotImplementedError, ArrowTypeError)
from pyarrow.filesystem import Filesystem, HdfsClient, LocalFilesystem
from pyarrow.io import (HdfsFile, NativeFile, PythonFileInterface,
                        Buffer, BufferReader, InMemoryOutputStream,
                        MemoryMappedFile, memory_map, frombuffer,
                        read_tensor, write_tensor,
                        create_memory_map, get_record_batch_size,
                        get_tensor_size)
from pyarrow.ipc import FileReader, FileWriter, StreamReader, StreamWriter

from pyarrow.memory import MemoryPool, total_allocated_bytes
from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType,
                            BooleanValue,
                            Int8Value, Int16Value, Int32Value, Int64Value,
                            UInt8Value, UInt16Value, UInt32Value, UInt64Value,
                            FloatValue, DoubleValue, ListValue,
                            BinaryValue, StringValue, FixedSizeBinaryValue)

import pyarrow.schema as _schema

from pyarrow.schema import (null, bool_,
                            int8, int16, int32, int64,
                            uint8, uint16, uint32, uint64,
                            timestamp, date32, date64,
                            float16, float32, float64,
                            binary, string, decimal,
                            list_, struct, dictionary, field,
                            DataType, FixedSizeBinaryType,
                            Field, Schema, schema)

from pyarrow.table import Column, RecordBatch, Table, concat_tables


localfs = LocalFilesystem.get_instance()
    pyarrow.set_memory_pool
    """
    from pyarrow._jemalloc import default_pool
    return default_pool()


from pyarrow.filesystem import Filesystem, HdfsClient, LocalFilesystem

from pyarrow.ipc import (RecordBatchFileReader, RecordBatchFileWriter,
                         RecordBatchStreamReader, RecordBatchStreamWriter,
                         open_stream, open_file,
                         serialize_pandas, deserialize_pandas)

localfs = LocalFilesystem.get_instance()


# ----------------------------------------------------------------------
# 0.4.0 deprecations

import warnings


def _deprecate_class(old_name, new_name, klass, next_version='0.5.0'):
    msg = ('pyarrow.{0} has been renamed to '
           '{1}, will be removed in {2}'
           .format(old_name, new_name, next_version))

    def deprecated_factory(*args, **kwargs):
        warnings.warn(msg, FutureWarning)
        # Forward keyword arguments too; dropping them would silently change
        # behavior for callers of the deprecated name.
        return klass(*args, **kwargs)

    return deprecated_factory
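# Illustrative use of _deprecate_class for the 0.4.0 renames (a sketch; it
# aliases the old name onto the RecordBatchFileReader imported above):
#
#   FileReader = _deprecate_class('FileReader', 'RecordBatchFileReader',
#                                 RecordBatchFileReader)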
                           NumericArray, IntegerArray, FloatingPointArray,
                           BooleanArray,
                           Int8Array, UInt8Array,
                           Int16Array, UInt16Array,
                           Int32Array, UInt32Array,
                           Int64Array, UInt64Array,
                           ListArray, StringArray, DictionaryArray)

from pyarrow.error import ArrowException

from pyarrow.filesystem import Filesystem, HdfsClient, LocalFilesystem

from pyarrow.io import (HdfsFile, NativeFile, PythonFileInterface,
                        Buffer, InMemoryOutputStream, BufferReader)

from pyarrow.ipc import FileReader, FileWriter, StreamReader, StreamWriter

from pyarrow.memory import MemoryPool, total_allocated_bytes
from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType,
                            BooleanValue,
                            Int8Value, Int16Value, Int32Value, Int64Value,
                            UInt8Value, UInt16Value, UInt32Value, UInt64Value,
                            FloatValue, DoubleValue, ListValue,
                            BinaryValue, StringValue)

from pyarrow.schema import (null, bool_,
                            int8, int16, int32, int64,
                            uint8, uint16, uint32, uint64,
                            timestamp, date, float_, double,
                            binary, string,
                            list_, struct, dictionary, field,
                            DataType, Field, Schema, schema)

from pyarrow.table import Column, RecordBatch, Table, concat_tables


localfs = LocalFilesystem()