def __init__(self, dirpath, filesystem=None, pathsep='/',
             partition_scheme='hive', metadata_nthreads=1):
    """
    Walk a dataset directory and collect its pieces and partitions.

    Parameters
    ----------
    dirpath : path-like
        Root directory of the dataset; stored after being passed through
        ``_stringify_path``.
    filesystem : object, default None
        Filesystem used for the walk. When falsy, one is derived from
        ``dirpath`` with ``_get_fs_from_path``.
    pathsep : str, default '/'
        Stored as ``self.pathsep``.
    partition_scheme : str, default 'hive'
        Stored as ``self.partition_scheme``.
    metadata_nthreads : int, default 1
        ``max_workers`` of the thread pool that is alive for the duration
        of the walk.
    """
    self.filesystem = filesystem or _get_fs_from_path(dirpath)
    self.pathsep = pathsep
    self.dirpath = _stringify_path(dirpath)
    self.partition_scheme = partition_scheme
    self.partitions = ParquetPartitions()
    self.pieces = []
    self._metadata_nthreads = metadata_nthreads
    self._thread_pool = futures.ThreadPoolExecutor(
        max_workers=metadata_nthreads)

    self.common_metadata_path = None
    self.metadata_path = None

    # The pool must be released even when the walk fails, otherwise its
    # worker threads outlive the failed constructor.
    try:
        self._visit_level(0, self.dirpath, [])

        # Due to concurrency, pieces will potentially be out of order if
        # the dataset is partitioned so we sort them to yield stable
        # results
        self.pieces.sort(key=lambda piece: piece.path)

        if self.common_metadata_path is None:
            # _common_metadata is a subset of _metadata
            self.common_metadata_path = self.metadata_path
    finally:
        self._thread_pool.shutdown()
def read_table(source, columns=None, use_threads=True, metadata=None,
               use_pandas_metadata=False, memory_map=True):
    """
    Read a Parquet source into a table.

    Parameters
    ----------
    source : path-like or file-like
        When ``_is_path_like(source)`` is true the read is delegated to the
        filesystem derived from the path; otherwise ``source`` is wrapped
        in a ``ParquetFile``.
    columns : list, default None
        Forwarded unchanged to the underlying reader.
    use_threads : bool, default True
        Forwarded unchanged to the underlying reader.
    metadata : object, default None
        Forwarded to ``fs.read_parquet`` or to the ``ParquetFile``
        constructor, depending on the kind of ``source``.
    use_pandas_metadata : bool, default False
        Forwarded unchanged to the underlying reader.
    memory_map : bool, default True
        Accepted for interface compatibility; this function does not
        reference it.

    Returns
    -------
    The object returned by ``fs.read_parquet`` or ``ParquetFile.read``.
    """
    if not _is_path_like(source):
        parquet_file = ParquetFile(source, metadata=metadata)
        return parquet_file.read(
            columns=columns,
            use_threads=use_threads,
            use_pandas_metadata=use_pandas_metadata)

    path_fs = _get_fs_from_path(source)
    return path_fs.read_parquet(
        source,
        columns=columns,
        use_threads=use_threads,
        metadata=metadata,
        use_pandas_metadata=use_pandas_metadata)
def __init__(self, path_or_paths, filesystem=None, schema=None,
             metadata=None, split_row_groups=False, validate_schema=True,
             filters=None, metadata_nthreads=1):
    """
    Assemble a dataset from one path or a list of paths.

    Parameters
    ----------
    path_or_paths : path-like or list
        Dataset location(s); stored as ``self.paths``.
    filesystem : object, default None
        When None, the filesystem is derived from the (first) path.
        Otherwise it is normalized through ``_ensure_filesystem``.
    schema : object, default None
        Stored as ``self.schema``.
    metadata : object, default None
        Used as ``self.metadata``. When None and the manifest reports a
        metadata path, the metadata is read from that file instead.
    split_row_groups : bool, default False
        Any truthy value raises ``NotImplementedError``.
    validate_schema : bool, default True
        When truthy, ``self.validate_schemas()`` is called.
    filters : object, default None
        When given, checked with ``_check_filters`` and applied through
        ``self._filter``.
    metadata_nthreads : int, default 1
        Forwarded to ``_make_manifest``.

    Raises
    ------
    NotImplementedError
        If ``split_row_groups`` is truthy.
    """
    if filesystem is not None:
        self.fs = _ensure_filesystem(filesystem)
    else:
        first_path = path_or_paths
        if isinstance(first_path, list):
            first_path = first_path[0]
        self.fs = _get_fs_from_path(first_path)

    self.paths = path_or_paths

    manifest_parts = _make_manifest(
        path_or_paths, self.fs, metadata_nthreads=metadata_nthreads)
    (self.pieces,
     self.partitions,
     self.common_metadata_path,
     self.metadata_path) = manifest_parts

    # Load the optional _common_metadata sidecar when the manifest saw one.
    common_metadata = None
    if self.common_metadata_path is not None:
        with self.fs.open(self.common_metadata_path) as f:
            common_metadata = ParquetFile(f).metadata
    self.common_metadata = common_metadata

    # Caller-supplied metadata wins; otherwise fall back to the _metadata
    # file when the manifest saw one.
    if metadata is None and self.metadata_path is not None:
        with self.fs.open(self.metadata_path) as f:
            metadata = ParquetFile(f).metadata
    self.metadata = metadata

    self.schema = schema

    self.split_row_groups = split_row_groups
    if split_row_groups:
        raise NotImplementedError("split_row_groups not yet implemented")

    if validate_schema:
        self.validate_schemas()

    if filters is not None:
        self._filter(_check_filters(filters))
def __init__(self, where, schema, flavor=None, version='1.0',
             use_dictionary=True, compression='snappy',
             use_deprecated_int96_timestamps=None, **options):
    """
    Open a Parquet writer on a path or an already-open sink.

    Parameters
    ----------
    where : path-like or file-like
        Destination. When ``_is_path_like(where)`` is true the file is
        opened here in ``'wb'`` mode and remembered in
        ``self.file_handle``; otherwise ``where`` is used as the sink
        directly.
    schema : object
        Schema to write. When ``flavor`` is given it is first passed
        through ``_sanitize_schema``.
    flavor : object, default None
        Stored as ``self.flavor``. A flavor containing ``'spark'`` turns
        on int96 timestamps unless that option is set explicitly.
    version : str, default '1.0'
        Forwarded to the underlying writer.
    use_dictionary : bool, default True
        Forwarded to the underlying writer.
    compression : str, default 'snappy'
        Forwarded to the underlying writer.
    use_deprecated_int96_timestamps : bool, default None
        When None, resolved to True for a Spark flavor and False otherwise.
    **options : dict
        Extra keyword arguments forwarded to the underlying writer.
    """
    if use_deprecated_int96_timestamps is None:
        # Use int96 timestamps for Spark
        use_deprecated_int96_timestamps = (
            flavor is not None and 'spark' in flavor)

    self.flavor = flavor
    if flavor is not None:
        schema, self.schema_changed = _sanitize_schema(schema, flavor)
    else:
        self.schema_changed = False

    self.schema = schema
    self.where = where

    # Keep a reference to any file we open ourselves (implied filesystem)
    # so that this object can guarantee it gets closed.
    self.file_handle = None
    if _is_path_like(where):
        fs = _get_fs_from_path(where)
        sink = self.file_handle = fs.open(where, 'wb')
    else:
        sink = where

    try:
        self.writer = _parquet.ParquetWriter(
            sink, schema,
            version=version,
            compression=compression,
            use_dictionary=use_dictionary,
            use_deprecated_int96_timestamps=use_deprecated_int96_timestamps,
            **options)
    except BaseException:
        # The caller never receives this object on failure, so a file
        # opened above would otherwise stay open; caller-owned sinks are
        # left untouched.
        if self.file_handle is not None:
            self.file_handle.close()
        raise
    self.is_open = True
def write_to_dataset(table, root_path, partition_cols=None, filesystem=None,
                     preserve_index=True, **kwargs):
    """
    Write a Table as a (possibly partitioned) Parquet dataset.

    This wraps ``write_table``. Without partition columns a single
    ``<uuid>.parquet`` file is written directly under ``root_path``. With
    partition columns, one subdirectory is created for each combination of
    partition values, laid out as follows:

        root_dir/
          group1=value1
            group2=value1
              <uuid>.parquet
            group2=value2
              <uuid>.parquet
          group1=valueN
            group2=value1
              <uuid>.parquet
            group2=valueN
              <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset; created if it does not exist.
    partition_cols : list,
        Column names by which to partition the dataset.
        Columns are partitioned in the order they are given.
        None or an empty list writes an unpartitioned file.
    filesystem : FileSystem, default None
        If nothing passed, the filesystem is derived from ``root_path``.
    preserve_index : bool,
        Passed to ``Table.from_pandas`` when rebuilding each partition's
        table; only used for partitioned writes.
    **kwargs : dict,
        Keyword arguments forwarded to ``write_table``.

    Raises
    ------
    ValueError
        If every column of the table is a partition column.
    """
    fs = (_get_fs_from_path(root_path) if filesystem is None
          else _ensure_filesystem(filesystem))

    _mkdir_if_not_exists(fs, root_path)

    # Unpartitioned: one file straight under the root.
    if partition_cols is None or len(partition_cols) == 0:
        file_name = guid() + '.parquet'
        target = '/'.join([root_path, file_name])
        with fs.open(target, 'wb') as sink:
            write_table(table, sink, **kwargs)
        return

    # Partitioned: grouping is done on a pandas copy of the table.
    frame = table.to_pandas()
    group_keys = [frame[col] for col in partition_cols]
    payload = frame.drop(partition_cols, axis='columns')
    remaining_cols = frame.columns.drop(partition_cols)
    if len(remaining_cols) == 0:
        raise ValueError('No data left to save outside partition columns')

    # ARROW-2891: Ensure the output_schema is preserved when writing a
    # partitioned dataset
    part_schema = table.schema
    for col in partition_cols:
        part_schema = part_schema.remove(part_schema.get_field_index(col))

    for keys, subgroup in payload.groupby(group_keys):
        # A single grouping key may come back as a scalar; normalize it.
        if not isinstance(keys, tuple):
            keys = (keys,)
        segments = [
            '{colname}={value}'.format(colname=name, value=val)
            for name, val in zip(partition_cols, keys)
        ]
        subdir = '/'.join(segments)
        subtable = pa.Table.from_pandas(subgroup,
                                        preserve_index=preserve_index,
                                        schema=part_schema,
                                        safe=False)
        prefix = '/'.join([root_path, subdir])
        _mkdir_if_not_exists(fs, prefix)
        file_name = guid() + '.parquet'
        target = '/'.join([prefix, file_name])
        with fs.open(target, 'wb') as sink:
            write_table(subtable, sink, **kwargs)
def __init__(self, path, key=None, secret=None, endpoint=None, proxy=None,
             proxy_port=None, filesystem=None):
    """
    Enumerate the splits of a Carbon store and wrap each in a piece.

    ``path`` may point at the store itself or at a ``.manifest`` file; in
    the latter case the first source listed by the manifest is used for
    reading the schema and as the base folder of every piece.

    Parameters
    ----------
    path : str
        Store location or manifest location. A URL scheme of ``s3a``
        selects the S3A code path.
    key, secret, endpoint : str, default None
        S3A credentials and endpoint; all three are required when the
        scheme is ``s3a``.
    proxy, proxy_port : default None
        Optional S3A proxy settings; must be given together or not at all.
    filesystem : object, default None
        When None, the filesystem is derived from ``path``; otherwise it is
        normalized through ``_ensure_filesystem``.

    Raises
    ------
    ValueError
        For an ``s3a`` path with missing credentials, or when only one of
        ``proxy`` / ``proxy_port`` is supplied.
    Exception
        When the manifest lists no sources, or (non-S3A only) when reading
        the schema fails.
    """
    self.path = path
    self.url_path = urlparse(path)

    is_manifest = str(path).endswith(".manifest")
    if is_manifest:
        self.manifest_path = path
        if str(path).startswith(LOCAL_FILE_PREFIX):
            self.manifest_path = str(path)[len(LOCAL_FILE_PREFIX):]

    if filesystem is None:
        a_path = self.path
        if isinstance(a_path, list):
            a_path = a_path[0]
        self.fs = _get_fs_from_path(a_path)
    else:
        self.fs = _ensure_filesystem(filesystem)

    self.pieces = list()

    if self.url_path.scheme == 's3a':
        if key is None or secret is None or endpoint is None:
            raise ValueError('key, secret, endpoint should not be None')
        # proxy and proxy_port only make sense as a pair.
        if (proxy is None) != (proxy_port is None):
            raise ValueError('wrong proxy & proxy_port configuration')

        # One ordered list feeds both the reader builder and the
        # Configuration so the two can never drift apart.
        hadoop_conf = [
            ("fs.s3a.access.key", key),
            ("fs.s3a.secret.key", secret),
            ("fs.s3a.endpoint", endpoint),
        ]
        if proxy is not None:
            hadoop_conf += [
                ("fs.s3a.proxy.host", proxy),
                ("fs.s3a.proxy.port", proxy_port),
            ]

        builder = ArrowCarbonReader().builder(self.path)
        for conf_name, conf_value in hadoop_conf:
            builder = builder.withHadoopConf(conf_name, conf_value)
        carbon_splits = builder.getSplits(True)

        configuration = Configuration()
        for conf_name, conf_value in hadoop_conf:
            configuration.set(conf_name, conf_value)
        self.configuration = configuration

        if is_manifest:
            from obs import ObsClient
            obs_client = ObsClient(
                access_key_id=key,
                secret_access_key=secret,
                server=str(endpoint).replace('http://', ''),
                long_conn_mode=True)
            sources = manifest.getSources(self.manifest_path, CARBON,
                                          obs_client)
            if sources:
                self.file_path = sources[0]
            else:
                raise Exception("Manifest source can't be None!")
            carbon_schema = CarbonSchemaReader().readSchema(
                self.file_path, self.configuration.conf)
        else:
            carbon_schema = CarbonSchemaReader().readSchema(
                self.path, self.configuration.conf)

        for split in carbon_splits:
            folder_path = path
            if is_manifest:
                # Pieces live in the folder that contains the first
                # manifest source.
                source_path = str(self.file_path)
                folder_path = source_path[0:source_path.rindex('/')]
            self.pieces.append(
                CarbonDatasetPiece(folder_path, carbon_schema, split,
                                   key=key, secret=secret,
                                   endpoint=endpoint, proxy=proxy,
                                   proxy_port=proxy_port))
    else:
        if is_manifest:
            sources = manifest.getSources(self.manifest_path, CARBON)
            if sources:
                self.file_path = sources[0]
            else:
                raise Exception("Manifest source can't be None!")
            try:
                carbon_schema = CarbonSchemaReader().readSchema(
                    self.file_path)
            except Exception:
                # str() keeps a non-string source from raising TypeError
                # here and hiding the real failure.
                raise Exception("readSchema has some errors: " +
                                str(self.file_path))
        else:
            try:
                carbon_schema = CarbonSchemaReader().readSchema(self.path)
            except Exception:
                raise Exception("readSchema has some errors")

        carbon_splits = ArrowCarbonReader().builder(self.path) \
            .getSplits(True)

        for split in carbon_splits:
            if is_manifest:
                source_path = str(self.file_path)
                self.pieces.append(
                    CarbonDatasetPiece(
                        source_path[0:source_path.rindex('/')],
                        carbon_schema, split))
            else:
                self.pieces.append(
                    CarbonDatasetPiece(path, carbon_schema, split))

    self.number_of_splits = len(self.pieces)
    self.schema = self.getArrowSchema()

    # TODO add mechanism to get the file path based on file filter
    self.common_metadata_path = self.url_path.path + '/_common_metadata'
    self.common_metadata = None
    # Best effort: a missing or unreadable _common_metadata file is not an
    # error, but interpreter-exit signals must still propagate.
    try:
        if self.fs.exists(self.common_metadata_path):
            with self.fs.open(self.common_metadata_path) as f:
                self.common_metadata = ParquetFile(f).metadata
    except Exception:
        self.common_metadata = None
def __init__(self, path, key=None, secret=None, endpoint=None, proxy=None,
             proxy_port=None, filesystem=None):
    """
    Enumerate the splits of a Carbon store and wrap each in a piece.

    Parameters
    ----------
    path : str
        Store location. A URL scheme of ``s3a`` selects the S3A code path.
    key, secret, endpoint : str, default None
        S3A credentials and endpoint; all three are required when the
        scheme is ``s3a``.
    proxy, proxy_port : default None
        Optional S3A proxy settings; must be given together or not at all.
    filesystem : object, default None
        When None, the filesystem is derived from ``path``; otherwise it is
        normalized through ``_ensure_filesystem``.

    Raises
    ------
    ValueError
        For an ``s3a`` path with missing credentials, or when only one of
        ``proxy`` / ``proxy_port`` is supplied.
    """
    self.path = path
    self.url_path = urlparse(path)

    if filesystem is None:
        a_path = self.path
        if isinstance(a_path, list):
            a_path = a_path[0]
        self.fs = _get_fs_from_path(a_path)
    else:
        self.fs = _ensure_filesystem(filesystem)

    self.pieces = list()

    if self.url_path.scheme == 's3a':
        if key is None or secret is None or endpoint is None:
            raise ValueError('key, secret, endpoint should not be None')
        # proxy and proxy_port only make sense as a pair.
        if (proxy is None) != (proxy_port is None):
            raise ValueError('wrong proxy & proxy_port configuration')

        # One ordered list feeds both the reader builder and the
        # Configuration so the two can never drift apart.
        hadoop_conf = [
            ("fs.s3a.access.key", key),
            ("fs.s3a.secret.key", secret),
            ("fs.s3a.endpoint", endpoint),
        ]
        if proxy is not None:
            hadoop_conf += [
                ("fs.s3a.proxy.host", proxy),
                ("fs.s3a.proxy.port", proxy_port),
            ]

        builder = CarbonReader().builder(self.path)
        for conf_name, conf_value in hadoop_conf:
            builder = builder.withHadoopConf(conf_name, conf_value)
        carbon_splits = builder.getSplits()

        configuration = Configuration()
        for conf_name, conf_value in hadoop_conf:
            configuration.set(conf_name, conf_value)
        self.configuration = configuration

        carbon_schema = CarbonSchemaReader().readSchema(
            self.path, self.configuration.conf)

        for split in carbon_splits:
            self.pieces.append(
                CarbonDatasetPiece(path, carbon_schema, split,
                                   key=key, secret=secret,
                                   endpoint=endpoint, proxy=proxy,
                                   proxy_port=proxy_port))
    else:
        carbon_splits = CarbonReader().builder(self.path) \
            .getSplits()
        carbon_schema = CarbonSchemaReader().readSchema(self.path)

        for split in carbon_splits:
            self.pieces.append(
                CarbonDatasetPiece(path, carbon_schema, split))

    self.number_of_splits = len(self.pieces)
    self.schema = self.getArrowSchema()

    # TODO add mechanism to get the file path based on file filter
    self.common_metadata_path = self.url_path.path + '/_common_metadata'
    self.common_metadata = None
    # Best effort: a missing or unreadable _common_metadata file is not an
    # error, but interpreter-exit signals must still propagate.
    try:
        if self.fs.exists(self.common_metadata_path):
            with self.fs.open(self.common_metadata_path) as f:
                self.common_metadata = ParquetFile(f).metadata
    except Exception:
        self.common_metadata = None