import pyarrow as pa

def __init__(self, prefix_path, train_path=None, val_path=None, runs_path=None):
    # Back all store paths with the local filesystem.
    self._fs = pa.LocalFileSystem()
    super(LocalStore, self).__init__(prefix_path, train_path=train_path,
                                     val_path=val_path, runs_path=runs_path)
def pa_fs(path):
    # Resolve a path string to a (stripped_path, pyarrow filesystem) pair.
    if path.startswith("hdfs"):  # hdfs://url:port/file_path
        fs = pa.hdfs.connect()
        path = path[len("hdfs://"):]
        return path, fs
    elif path.startswith("s3"):
        raise ValueError("aws s3 is not supported for now")
    else:  # Local path
        if path.startswith("file://"):
            path = path[len("file://"):]
        return path, pa.LocalFileSystem()
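# A minimal usage sketch for pa_fs, assuming the legacy pyarrow filesystem API
# (pa.hdfs.connect, pa.LocalFileSystem) is available; the paths are hypothetical.
local_path, local_fs = pa_fs("file:///tmp/data.parquet")
# local_path == "/tmp/data.parquet", local_fs is a pa.LocalFileSystem
hdfs_path, hdfs_fs = pa_fs("hdfs://namenode:9000/data.parquet")
# hdfs_path == "namenode:9000/data.parquet": only the "hdfs://" scheme is
# stripped, so the host:port prefix stays in the path handed to the HDFS client.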
def parquet_file_schema(file_name):
    """Return (column names, numba column types) for a Parquet file."""
    import pyarrow.parquet as pq
    import pyarrow as pa

    col_names = []
    col_types = []
    # Pick a filesystem based on the path scheme.
    if file_name.startswith("hdfs://"):
        fs = pa.hdfs.connect()
    else:
        fs = pa.LocalFileSystem()
    with fs.open(file_name) as _file:
        f = pq.ParquetFile(_file)
        col_names = f.schema.names
        num_cols = len(col_names)
        # Map each column's Parquet physical type to its numba type.
        col_types = [_pq_type_to_numba[f.schema.column(i).physical_type]
                     for i in range(num_cols)]
    return col_names, col_types
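# A minimal sketch of reading a schema, assuming _pq_type_to_numba maps Parquet
# physical type names (e.g. "INT64", "DOUBLE") to numba types; the file path is
# hypothetical.
names, types = parquet_file_schema("/tmp/example.parquet")
for name, typ in zip(names, types):
    print(name, typ)  # one (column name, numba type) pair per column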
def __init__(self, prefix_path, *args, **kwargs):
    # Variant constructor that forwards any extra arguments to the base class.
    self._fs = pa.LocalFileSystem()
    super(LocalStore, self).__init__(prefix_path, *args, **kwargs)
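# A minimal construction sketch, assuming LocalStore's base class accepts the
# train_path/val_path keywords shown in the first constructor above; the store
# prefix and subdirectories are hypothetical.
store = LocalStore("/tmp/store",
                   train_path="/tmp/store/train",
                   val_path="/tmp/store/val")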