def __init__(self, path_or_paths, filesystem=None, schema=None, metadata=None,
             split_row_groups=False, validate_schema=True):
    if filesystem is None:
        self.fs = LocalFileSystem.get_instance()
    else:
        self.fs = _ensure_filesystem(filesystem)

    self.paths = path_or_paths

    (self.pieces, self.partitions,
     self.metadata_path) = _make_manifest(path_or_paths, self.fs)

    if self.metadata_path is not None:
        with self.fs.open(self.metadata_path) as f:
            self.common_metadata = ParquetFile(f).metadata
    else:
        self.common_metadata = None

    self.metadata = metadata
    self.schema = schema

    self.split_row_groups = split_row_groups

    if split_row_groups:
        raise NotImplementedError("split_row_groups not yet implemented")

    if validate_schema:
        self.validate_schemas()
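# A minimal usage sketch for the constructor above (hedged): build a legacy
# ParquetDataset over a local directory and read it back. The directory name
# 'example_dataset/' is hypothetical and assumed to contain Parquet files.
import pyarrow.parquet as pq

dataset = pq.ParquetDataset('example_dataset/')  # filesystem defaults to LocalFileSystem
table = dataset.read()                           # concatenates all pieces into one Table
print(table.num_rows)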
def _test_write_to_dataset_no_partitions(base_path, filesystem=None):
    # ARROW-1400
    import pyarrow.parquet as pq

    output_df = pd.DataFrame({'group1': list('aaabbbbccc'),
                              'group2': list('eefeffgeee'),
                              'num': list(range(10)),
                              'date': np.arange('2017-01-01', '2017-01-11',
                                                dtype='datetime64[D]')})
    cols = output_df.columns.tolist()
    output_table = pa.Table.from_pandas(output_df)

    if filesystem is None:
        filesystem = LocalFileSystem.get_instance()

    # Without partitions, append files to root_path
    n = 5
    for i in range(n):
        pq.write_to_dataset(output_table, base_path,
                            filesystem=filesystem)
    output_files = [file for file in filesystem.ls(base_path)
                    if file.endswith(".parquet")]
    assert len(output_files) == n

    # Deduplicated incoming DataFrame should match
    # original outgoing DataFrame
    input_table = pq.ParquetDataset(base_path,
                                    filesystem=filesystem).read()
    input_df = input_table.to_pandas()
    input_df = input_df.drop_duplicates()
    input_df = input_df[cols]
    assert output_df.equals(input_df)
def __init__(self, dirpath, filesystem=None, pathsep='/',
             partition_scheme='hive'):
    self.filesystem = filesystem or LocalFileSystem.get_instance()
    self.pathsep = pathsep
    self.dirpath = dirpath
    self.partition_scheme = partition_scheme
    self.partitions = ParquetPartitions()
    self.pieces = []

    self.common_metadata_path = None
    self.metadata_path = None

    self._visit_level(0, self.dirpath, [])
def _ensure_filesystem(fs):
    fs_type = type(fs)

    # If the arrow filesystem was subclassed, assume it supports the full
    # interface and return it
    if not issubclass(fs_type, FileSystem):
        for mro in inspect.getmro(fs_type):
            # compare class names with '==' ('is' only works by accident of
            # string interning)
            if mro.__name__ == 'S3FileSystem':
                return S3FSWrapper(fs)
            # In case it's a simple LocalFileSystem (e.g. dask) use native
            # arrow FS
            elif mro.__name__ == 'LocalFileSystem':
                return LocalFileSystem.get_instance()

        raise IOError('Unrecognized filesystem: {0}'.format(fs_type))
    else:
        return fs
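# Hedged illustration of what _ensure_filesystem normalizes: a user-supplied
# s3fs filesystem is not a pyarrow FileSystem subclass, so it gets wrapped,
# while a native pyarrow filesystem passes through unchanged. Requires s3fs to
# be installed; anonymous access is just a placeholder configuration.
import s3fs

fs_s3 = _ensure_filesystem(s3fs.S3FileSystem(anon=True))       # -> S3FSWrapper
fs_local = _ensure_filesystem(LocalFileSystem.get_instance())  # -> returned as-is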
def _get_fs_from_path(path):
    """
    Return a filesystem from a path, which may be an HDFS URI
    """
    # input can be hdfs URI such as hdfs://host:port/myfile.parquet
    if _has_pathlib and isinstance(path, pathlib.Path):
        path = str(path)
    parsed_uri = urlparse(path)
    if parsed_uri.scheme == 'hdfs':
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs.connect(host=host, port=port)
    else:
        fs = LocalFileSystem.get_instance()

    return fs
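# Illustrative calls (hedged): an hdfs:// URI connects through pa.hdfs.connect
# (this only succeeds where libhdfs and a cluster are reachable); anything
# else falls back to the local filesystem. The host and port are made up.
fs_hdfs = _get_fs_from_path('hdfs://namenode:8020/data/myfile.parquet')
fs_local = _get_fs_from_path('/tmp/data/myfile.parquet')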
def read_table(source, columns=None, nthreads=1, metadata=None,
               use_pandas_metadata=False):
    """
    Read a Table from Parquet format

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Location of Parquet dataset. If a string is passed, it can be a single
        file name or directory name. For passing Python file objects or byte
        buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    columns : list
        If not None, only these columns will be read from the file. A column
        name may be a prefix of a nested field, e.g. 'a' will select 'a.b',
        'a.c', and 'a.d.e'
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is threadsafe
    metadata : FileMetaData
        If separately computed
    use_pandas_metadata : boolean, default False
        If True and file has custom pandas schema metadata, ensure that
        index columns are also loaded

    Returns
    -------
    pyarrow.Table
        Content of the file as a table (of columns)
    """
    if is_string(source):
        fs = LocalFileSystem.get_instance()
        if fs.isdir(source):
            return fs.read_parquet(source, columns=columns,
                                   metadata=metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, nthreads=nthreads,
                   use_pandas_metadata=use_pandas_metadata)
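# Quick usage sketch for read_table (hedged): the file name 'example.parquet'
# and the column names are hypothetical placeholders.
import pyarrow.parquet as pq

table = pq.read_table('example.parquet', columns=['group1', 'num'])
df = table.to_pandas()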
warnings.warn("pyarrow.open_stream is deprecated, please use " "pyarrow.ipc.open_stream") return ipc.open_stream(source) def open_file(source): """ pyarrow.open_file deprecated since 0.12, use pyarrow.ipc.open_file """ import warnings warnings.warn("pyarrow.open_file is deprecated, please use " "pyarrow.ipc.open_file") return ipc.open_file(source) localfs = LocalFileSystem.get_instance() from pyarrow.serialization import (default_serialization_context, register_default_serialization_handlers, register_torch_serialization_handlers) import pyarrow.types as types # Entry point for starting the plasma store def _plasma_store_entry_point(): """Entry point for starting the plasma store. This can be used by invoking e.g. ``plasma_store -s /tmp/plasma -m 1000000000`` from the command line and will start the plasma_store executable with the
def test_read_partitioned_directory(tmpdir):
    fs = LocalFileSystem.get_instance()
    base_path = str(tmpdir)

    _partition_test_for_filesystem(fs, base_path)
def test_read_common_metadata_files(tmpdir):
    base_path = str(tmpdir)
    fs = LocalFileSystem.get_instance()
    _test_read_common_metadata_files(fs, base_path)
def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict,
        kwargs for write_table function.
    """
    from pyarrow import (
        Table,
        compat
    )

    if filesystem is None:
        fs = LocalFileSystem.get_instance()
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")
        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = "/".join(
                ["{colname}={value}".format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])
            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index)
            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)
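# Hedged end-to-end sketch of a partitioned write with write_to_dataset; the
# DataFrame contents and the '/tmp/example_dataset' path are illustrative.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'year': [2017, 2017, 2018], 'value': [1.0, 2.0, 3.0]})
table = pa.Table.from_pandas(df)
pq.write_to_dataset(table, root_path='/tmp/example_dataset',
                    partition_cols=['year'])
# Expected layout: /tmp/example_dataset/year=2017/<uuid>.parquet and
#                  /tmp/example_dataset/year=2018/<uuid>.parquet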
                         DeserializationCallbackError)

from pyarrow.filesystem import FileSystem, LocalFileSystem
from pyarrow.hdfs import HadoopFileSystem
import pyarrow.hdfs as hdfs

from pyarrow.ipc import (Message, MessageReader,
                         RecordBatchFileReader, RecordBatchFileWriter,
                         RecordBatchStreamReader, RecordBatchStreamWriter,
                         read_message, read_record_batch, read_schema,
                         read_tensor, write_tensor,
                         get_record_batch_size, get_tensor_size,
                         open_stream, open_file,
                         serialize_pandas, deserialize_pandas)

localfs = LocalFileSystem.get_instance()

from pyarrow.serialization import (default_serialization_context,
                                   register_default_serialization_handlers,
                                   register_torch_serialization_handlers)

import pyarrow.types as types


# Entry point for starting the plasma store


def _plasma_store_entry_point():
    """Entry point for starting the plasma store.

    This can be used by invoking e.g.
    ``plasma_store -s /tmp/plasma -m 1000000000``
from pyarrow.serialization import (default_serialization_context,
                                   register_default_serialization_handlers,
                                   register_torch_serialization_handlers)

import pyarrow.types as types


# deprecated top-level access


from pyarrow.filesystem import FileSystem as _FileSystem
from pyarrow.filesystem import LocalFileSystem as _LocalFileSystem
from pyarrow.hdfs import HadoopFileSystem as _HadoopFileSystem

from pyarrow.lib import SerializationContext as _SerializationContext
from pyarrow.lib import SerializedPyObject as _SerializedPyObject

_localfs = _LocalFileSystem._get_instance()

_msg = (
    "pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
)

_serialization_msg = (
    "'pyarrow.{0}' is deprecated and will be removed in a future version. "
    "Use pickle or the pyarrow IPC functionality instead.")

_deprecated = {
    "localfs": (_localfs, "LocalFileSystem"),
    "FileSystem": (_FileSystem, "FileSystem"),
    "LocalFileSystem": (_LocalFileSystem, "LocalFileSystem"),
    "HadoopFileSystem": (_HadoopFileSystem, "HadoopFileSystem"),
}
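# Hedged sketch of how a mapping like _deprecated is typically consumed via a
# PEP 562 module-level __getattr__; this is illustrative only and not
# necessarily pyarrow's exact implementation. It relies on _deprecated and
# _msg being defined at module level as above.
import warnings


def __getattr__(name):
    if name in _deprecated:
        obj, new_name = _deprecated[name]
        warnings.warn(_msg.format(name, new_name), FutureWarning, stacklevel=2)
        return obj
    raise AttributeError(
        "module 'pyarrow' has no attribute '{0}'".format(name))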
def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict,
        kwargs for write_table function.
    """
    from pyarrow import (
        Table,
        compat
    )

    if filesystem is None:
        fs = LocalFileSystem.get_instance()
    else:
        fs = _ensure_filesystem(filesystem)

    if not fs.exists(root_path):
        fs.mkdir(root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")
        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = "/".join(
                ["{colname}={value}".format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])
            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index)
            prefix = "/".join([root_path, subdir])
            if not fs.exists(prefix):
                fs.mkdir(prefix)
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)