import os, glob, warnings

import numpy as np
import numba as nb
import pandas as pd
import pyarrow.parquet as pq
import matplotlib
matplotlib.use('Agg')  # select the non-interactive backend before pyplot is imported
import matplotlib.pyplot as plt
import seaborn as sns

from load_config import c, config

if __name__ == '__main__':
    # Collect every high-resolution dump (in sorted filename order) and
    # load them all into a single DataFrame.
    filelist = sorted(glob.glob("../pq/fdim_hres_dump_*.pq"))
    df = pq.ParquetDataset(filelist).read(nthreads=16).to_pandas()

    #---- Plotting
    fig = plt.figure(1, figsize=(4.5, 3))
    fig.clf()

    # Seaborn 'ticks' style with a few overrides.
    style_overrides = {
        'axes.grid': False,
        'axes.linewidth': '0.75',
        'grid.color': '0.75',
        'grid.linestyle': u':',
        'legend.frameon': True,
    }
    sns.set_context('paper')
    sns.set_style('ticks', style_overrides)

    # Render text through LaTeX using a serif font family.
    plt.rc('text', usetex=True)
    plt.rc('font', family='Serif')
def _read_pyarrow(
    fs,
    fs_token,
    paths,
    columns=None,
    filters=None,
    categories=None,
    index=None,
    infer_divisions=None,
):
    """Build the task graph and metadata for a pyarrow-backed parquet read.

    Opens ``paths`` as a ``pq.ParquetDataset``, resolves column/index names
    (using the embedded pandas metadata when present), keeps only the pieces
    that contain at least one row group, and returns
    ``out_type(task_plan, task_name, meta, divisions)`` where ``out_type``
    is whatever ``_normalize_index_columns`` selected.
    """
    from ...bytes.core import get_pyarrow_filesystem
    import pyarrow.parquet as pq

    # Normalise ``categories`` to a list and ``columns`` away from tuple.
    if categories is None:
        categories = []
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    if isinstance(columns, tuple):
        columns = list(columns)

    dataset = pq.ParquetDataset(
        paths, filesystem=get_pyarrow_filesystem(fs), filters=filters
    )
    partitions = (
        []
        if dataset.partitions is None
        else [n for n in dataset.partitions.partition_names if n is not None]
    )

    # In pyarrow the physical storage field names may differ from the
    # dataframe names (true for Index names when PyArrow >= 0.8), so map
    # them to dataframe names as early as possible.
    schema = dataset.schema.to_arrow_schema()
    has_pandas_metadata = schema.metadata is not None and b"pandas" in schema.metadata
    if not has_pandas_metadata:
        index_names = []
        column_names = schema.names
        storage_name_mapping = {k: k for k in column_names}
        column_index_names = [None]
    else:
        pandas_metadata = json.loads(schema.metadata[b"pandas"].decode("utf8"))
        (
            index_names,
            column_names,
            storage_name_mapping,
            column_index_names,
        ) = _parse_pandas_metadata(pandas_metadata)

    # Partition keys are exposed as ordinary columns.
    column_names += [p for p in partitions if p not in column_names]
    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names
    )
    all_columns = index_names + column_names

    # Keep only the pieces whose metadata reports at least one row group.
    def _open(fn):
        return pq.ParquetFile(fs.open(fn, mode="rb"))

    non_empty_pieces = [
        piece
        for piece in dataset.pieces
        if piece.get_metadata(_open).num_row_groups > 0
    ]

    # If a single input path expanded into several pieces, order them
    # naturally; with multiple input paths the given order is kept.
    if len(paths) == 1 and len(dataset.pieces) > 1:
        non_empty_pieces.sort(key=lambda piece: natural_sort_key(piece.path))

    # With exactly one index column, look up its storage name so that
    # divisions can be derived from it.
    divisions_name = None
    if len(index_names) == 1:
        divisions_name = next(
            (
                storage_name
                for storage_name, name in storage_name_mapping.items()
                if index_names[0] == name
            ),
            None,
        )

    divisions = _get_pyarrow_divisions(
        non_empty_pieces, divisions_name, schema, infer_divisions
    )

    # Build the empty "meta" object describing the result.
    raw_dtypes = _get_pyarrow_dtypes(schema, categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in raw_dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, column_index_names)
    meta = clear_known_categories(meta, cols=categories)

    as_series = out_type == Series
    if as_series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    task_name = "read-parquet-" + tokenize(fs_token, paths, all_columns)

    if not non_empty_pieces:
        # Nothing to read: the single task is the (stripped) meta itself.
        meta = strip_unknown_categories(meta)
        task_plan = {(task_name, 0): meta}
    else:
        task_plan = {}
        for i, piece in enumerate(non_empty_pieces):
            task_plan[(task_name, i)] = (
                _read_pyarrow_parquet_piece,
                fs,
                piece,
                column_names,
                index_names,
                out_type == Series,
                dataset.partitions,
                categories,
            )

    return out_type(task_plan, task_name, meta, divisions)
def save_data(run_all=True):
    # Python 2 code (print statements).
    #
    # For each dataset registered under the 'Chicago Data Portal' source,
    # read its parquet data from S3, group rows by year-month, write each
    # group's longitude/latitude pairs to ../data/<dataset>/<YYYY>-<MM>/all.json,
    # upload that file through `s3.save_file_public`, and store the per-month
    # counts through `db.update_col`.
    #
    # run_all: when True the whole dataset path is read and the counts start
    #          from an empty dict; otherwise only the year/month partitions
    #          touched by the last 32 days are read and the counts start from
    #          `datasets[dataset]['cnts']`.
    #
    # NOTE(review): the original indentation of this function was lost. The
    # nesting below assumes that everything after the 'business_grants' test
    # is guarded by it -- confirm against the original file.
    for dataset_item in db.get_all(data_source='Chicago Data Portal'):
        print dataset_item
        dataset = dataset_item['dataset']
        if dataset == 'business_grants':
            print dataset
            today = datetime.datetime.today().date()
            # NOTE(review): these two assignments are overwritten by the
            # `sorted(...)` assignment that follows.
            date_list = set([today.strftime('%Y-%m')])
            date_list.add(
                (today - datetime.timedelta(days=32)).strftime('%Y-%m'))
            # Distinct 'YYYY-MM' strings covered by today and the 31 days
            # before it, in ascending order.
            date_list = sorted(
                list(
                    set([(today - datetime.timedelta(days=x)).strftime('%Y-%m')
                         for x in range(32)])))
            paths = []
            if run_all:
                paths = ['bnroths/chicago-data/%s' % dataset]
                cnts = {}
            else:
                for month in date_list:
                    # `month` is rebound from 'YYYY-MM' to just the month part.
                    year, month = month.split('-')
                    paths.append('bnroths/chicago-data/%s/year=%s/month=%s' %
                                 (dataset, year, month))
                print paths
                cnts = datasets[dataset]['cnts']
            # exit(0)
            print paths
            for path in paths:
                ds = pq.ParquetDataset(path_or_paths=path,
                                       filesystem=S3FS,
                                       validate_schema=False)
                # print datasets[dataset].keys()
                columns = dataset_item['columns']
                # The second configured column is used as the date column.
                dt = columns[1]
                table = ds.read()
                df = table.to_pandas()
                print df.columns
                print df.head()
                # exit(0)
                # The first seven characters of the stringified date are
                # used as the group key (split on '-' into year/month below).
                df['dt'] = df[dt].astype(str).str[:7]
                groups = dict(list(df.groupby('dt')))
                print groups.keys()
                # exit(0)
                for group in groups:
                    print group, type(group)
                    if group != "None":  # there is seriously a blank date
                        year, month = group.split('-')
                        a = groups[group][['longitude',
                                           'latitude']].to_json(orient='values')
                        # Counts are only recorded from a dataset-specific
                        # starting year (string comparison on 'YYYY-MM').
                        if dataset == 'building_permits':
                            if group >= '2016':
                                cnts[group] = groups[group].count()[0]
                        elif dataset == 'business_liscenses':
                            if group >= '2002':
                                cnts[group] = groups[group].count()[0]
                        else:
                            cnts[group] = groups[group].count()[0]
                        filename = '../data/%s/%s-%s/all.json' % (dataset, year,
                                                                   month)
                        if not os.path.exists(os.path.dirname(filename)):
                            try:
                                os.makedirs(os.path.dirname(filename))
                            except OSError as exc:  # Guard against race condition
                                if exc.errno != errno.EEXIST:
                                    raise
                        with open(filename, 'w') as f:
                            f.write(a)
                        ## write to s3
                        s3.save_file_public(local='../data/%s/%s-%s/all.json' %
                                            (dataset, year, month),
                                            dataset=dataset,
                                            dt="%s-%s" % (year, month),
                                            filename='all.json')
            # Persist the accumulated per-month counts as JSON.
            db.update_col(dataset=dataset, col='cnts', update=json.dumps(cnts))
    return None
parser.add_argument('--print-values', action='store_true',
                    help='Print index values (dataset piece indexes)')
parser.add_argument('--skip-index', nargs='+', type=str,
                    help='Donot display indexed values for given fields')
parser.add_argument('--hdfs-driver', type=str, default='libhdfs3',
                    help='A string denoting the hdfs driver to use (if using a dataset on hdfs). '
                         'Current choices are libhdfs (java through JNI) or libhdfs3 (C++)')

args = parser.parse_args()

# Strip one trailing slash from the dataset url, if there is one.
if args.dataset_url and args.dataset_url.endswith('/'):
    args.dataset_url = args.dataset_url[:-1]

# Create pyarrow file system and open the dataset through it.
resolver = FilesystemResolver(args.dataset_url, hdfs_driver=args.hdfs_driver)
dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                            filesystem=resolver.filesystem(),
                            validate_schema=False)

# When neither section was requested explicitly, show both.
print_all = not (args.schema or args.index)

if args.schema or print_all:
    print('*** Schema from dataset metadata ***')
    print(dataset_metadata.get_schema(dataset))

if args.index or print_all:
    index_dict = rowgroup_indexing.get_row_group_indexes(dataset)
    print('*** Row group indexes from dataset metadata ***')
    for index_name in index_dict:
        print('Index: {}'.format(index_name))
        # Indexes named in --skip-index only get their header printed.
        if args.skip_index is not None and index_name in args.skip_index:
            continue
        for field_value in index_dict[index_name].indexed_values:
            row_group_count = len(index_dict[index_name].get_row_group_indexes(field_value))
            print(' -- {}({})'.format(field_value, row_group_count))
def print_parquet_pandas_shape(bucket_uri, file_system):
    """Read the parquet dataset at ``bucket_uri`` and print its pandas shape.

    The dataset is opened through ``file_system``, read fully into an Arrow
    table, converted to a pandas DataFrame, and ``DataFrame.shape`` is
    printed. Nothing is returned.
    """
    frame = pq.ParquetDataset(bucket_uri, filesystem=file_system).read().to_pandas()
    print(frame.shape)
def read_parquet(paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        # Read a directory of files in remote storage.
        >>> ds.read_parquet("s3://bucket/path")

        # Read multiple local files.
        >>> ds.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from. It is
            forwarded to ``pyarrow.parquet.ParquetDataset``; ``None`` leaves
            the choice to pyarrow.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset. The
            parquet pieces are distributed round-robin over this many read
            tasks; tasks that end up empty are dropped.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow.parquet as pq

    # The ``filesystem`` argument used to be accepted but silently dropped,
    # so reads always went through pyarrow's default filesystem.
    pq_ds = pq.ParquetDataset(
        paths, filesystem=filesystem, **arrow_parquet_args)

    read_tasks = [[] for _ in builtins.range(parallelism)]
    # TODO(ekl) support reading row groups (maybe as an option)
    for i, piece in enumerate(pq_ds.pieces):
        read_tasks[i % len(read_tasks)].append(piece)
    nonempty_tasks = [r for r in read_tasks if r]
    partitions = pq_ds.partitions

    @ray.remote
    def gen_read(pieces: List[pq.ParquetDatasetPiece]):
        """Read ``pieces`` and return them as a single ``ArrowBlock``."""
        import pyarrow
        logger.debug("Reading {} parquet pieces".format(len(pieces)))
        tables = [
            piece.read(
                columns=columns, use_threads=False, partitions=partitions)
            for piece in pieces
        ]
        # ``pieces`` is never empty here: only non-empty tasks are scheduled.
        if len(tables) > 1:
            table = pyarrow.concat_tables(tables)
        else:
            table = tables[0]
        return ArrowBlock(table)

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for pieces in nonempty_tasks:
        # Bind ``pieces`` as a default so each lambda keeps its own value.
        calls.append(lambda pieces=pieces: gen_read.remote(pieces))
        piece_metadata = [p.get_metadata() for p in pieces]
        metadata.append(
            BlockMetadata(
                num_rows=sum(m.num_rows for m in piece_metadata),
                size_bytes=sum(
                    sum(
                        m.row_group(i).total_byte_size
                        for i in builtins.range(m.num_row_groups))
                    for m in piece_metadata),
                schema=piece_metadata[0].schema.to_arrow_schema(),
                input_files=[p.path for p in pieces]))

    return Dataset(LazyBlockList(calls, metadata))
def _determine_dataset_parts(fs, paths, gather_statistics, filters, dataset_kwargs):
    """Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.

    Parameters
    ----------
    fs : filesystem object providing ``sep``, ``isdir`` and ``glob``.
    paths : list of paths. NOTE: a ``_metadata`` entry is removed from this
        list in place when several paths are given.
    gather_statistics : anything other than ``False`` allows scanning the
        files (or using ``_metadata``) to build the dataset.
    filters : forwarded to ``pq.ParquetDataset`` where the whole dataset is
        opened.
    dataset_kwargs : extra keyword arguments for ``pq.ParquetDataset``.
        NOTE: ``validate_schema=False`` may be inserted into this dict in
        place when a directory containing ``_metadata`` is read.

    Returns
    -------
    (parts, dataset) : ``parts`` is a list of file paths (empty unless the
        dataset object was built from a single proxy file), ``dataset`` is
        the ``pq.ParquetDataset`` object.
    """
    parts = []
    if len(paths) > 1:
        base, fns = _analyze_paths(paths, fs)
        if "_metadata" in fns:
            # We have a _metadata file
            # PyArrow cannot handle "_metadata"
            # when `paths` is a list.
            paths.remove(base + fs.sep + "_metadata")
            fns.remove("_metadata")
            if gather_statistics is not False:
                # If we are allowed to gather statistics,
                # lets use "_metadata" instead of opening
                # every file. Note that we don't need to check if
                # the dataset is flat here, because PyArrow cannot
                # properly handle partitioning in this case anyway.
                dataset = pq.ParquetDataset(
                    base + fs.sep + "_metadata",
                    filesystem=fs,
                    filters=filters,
                    **dataset_kwargs,
                )
                # Patch the dataset object so that it describes the data
                # files instead of the "_metadata" file it was opened from.
                dataset.metadata = dataset.pieces[0].get_metadata()
                dataset.pieces = [SimplePiece(path) for path in paths]
                dataset.partitions = None
                return parts, dataset
        if gather_statistics is not False:
            # This scans all the files
            dataset = pq.ParquetDataset(paths,
                                        filesystem=fs,
                                        filters=filters,
                                        **dataset_kwargs)
            if dataset.schema is None:
                # The dataset may have inconsistent schemas between files.
                # If so, we should try to use a "_common_metadata" file
                proxy_path = (base + fs.sep + "_common_metadata"
                              if "_common_metadata" in fns else paths[0])
                dataset.schema = pq.ParquetDataset(proxy_path,
                                                   filesystem=fs).schema
        else:
            # Rely on schema for 0th file.
            # Will need to pass a list of paths to read_partition
            dataset = pq.ParquetDataset(paths[0],
                                        filesystem=fs,
                                        **dataset_kwargs)
            parts = [base + fs.sep + fn for fn in fns]
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        allpaths = fs.glob(paths[0] + fs.sep + "*")
        base, fns = _analyze_paths(allpaths, fs)
        # Check if dataset is "not flat" (partitioned into directories).
        # If so, we will need to let pyarrow generate the `dataset` object.
        not_flat = any(
            [fs.isdir(p) for p in fs.glob(fs.sep.join([base, "*"]))])
        # Only set validate_schema when the caller did not choose a value.
        if "_metadata" in fns and "validate_schema" not in dataset_kwargs:
            dataset_kwargs["validate_schema"] = False
        if not_flat or "_metadata" in fns or gather_statistics is not False:
            # Let arrow do its thing (use _metadata or scan files)
            dataset = pq.ParquetDataset(paths,
                                        filesystem=fs,
                                        filters=filters,
                                        **dataset_kwargs)
            if dataset.schema is None:
                # The dataset may have inconsistent schemas between files.
                # If so, we should try to use a "_common_metadata" file
                proxy_path = (base + fs.sep + "_common_metadata"
                              if "_common_metadata" in fns else allpaths[0])
                dataset.schema = pq.ParquetDataset(proxy_path,
                                                   filesystem=fs).schema
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                dataset = pq.ParquetDataset(base + fs.sep + "_common_metadata",
                                            filesystem=fs,
                                            **dataset_kwargs)
            else:
                dataset = pq.ParquetDataset(allpaths[0],
                                            filesystem=fs,
                                            **dataset_kwargs)
            # The proxy "_common_metadata" file is not a data file.
            parts = [
                base + fs.sep + fn for fn in fns if fn != "_common_metadata"
            ]
    else:
        # There is only one file to read
        dataset = pq.ParquetDataset(paths, filesystem=fs, **dataset_kwargs)
    return parts, dataset
def read_parquet(self, filename, to_df=True, **pq_kwargs):
    """Open ``filename`` from this object's S3 bucket as a parquet dataset.

    :param filename: key of the parquet file/directory inside ``self.bucket_name``.
    :param to_df: when true, read the dataset and return a pandas DataFrame;
        otherwise return the ``pq.ParquetDataset`` object itself.
    :param pq_kwargs: forwarded to ``ParquetDataset.read_pandas`` (only used
        when ``to_df`` is true).
    """
    location = 's3://{bucket}/{filename}'.format(bucket=self.bucket_name,
                                                 filename=filename)
    dataset = pq.ParquetDataset(location, filesystem=self.s3fs)
    if not to_df:
        return dataset
    table = dataset.read_pandas(**pq_kwargs)
    return table.to_pandas()
def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
    """Read several parquet files as one dataset.

    ``kwargs`` are passed to the ``pq.ParquetDataset`` constructor, while
    ``columns`` and ``nthreads`` are passed to its ``read`` method, whose
    result is returned.
    """
    read_options = {'columns': columns, 'nthreads': nthreads}
    return pq.ParquetDataset(paths, **kwargs).read(**read_options)
def get_dataset(key):
    """Load the parquet dataset located at ``key`` into a pandas DataFrame."""
    table = pq.ParquetDataset(key).read_pandas()
    return table.to_pandas()
def _read_pyarrow(fs, fs_token, paths, columns=None, filters=None,
                  categories=None, index=None):
    """Build the task graph and metadata for a pyarrow-backed parquet read.

    Opens ``paths`` as a ``pq.ParquetDataset``, resolves column and index
    names (using the embedded pandas metadata when present) and returns
    ``out_type(task_plan, task_name, meta, divisions)`` with one task per
    dataset piece. Divisions are always unknown (``None``).

    Raises ``NotImplementedError`` when ``filters`` is given.
    """
    from ...bytes.core import get_pyarrow_filesystem
    import pyarrow.parquet as pq
    import pyarrow as pa

    # In pyarrow, the physical storage field names may differ from
    # the actual dataframe names. This is true for Index names when
    # PyArrow >= 0.8.
    # We would like to resolve these to the correct dataframe names
    # as soon as possible.

    if filters is not None:
        raise NotImplementedError("Predicate pushdown not implemented")

    # Normalise ``categories`` to a list (a single name becomes a 1-item list).
    if isinstance(categories, string_types):
        categories = [categories]
    elif categories is None:
        categories = []
    else:
        categories = list(categories)

    if isinstance(columns, tuple):
        columns = list(columns)

    dataset = pq.ParquetDataset(paths, filesystem=get_pyarrow_filesystem(fs))
    schema = dataset.schema.to_arrow_schema()
    has_pandas_metadata = schema.metadata is not None and b'pandas' in schema.metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(schema.metadata[b'pandas'].decode('utf8'))
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(pandas_metadata))
    else:
        # Without pandas metadata every schema field is a plain column and
        # storage names map to themselves.
        index_names = []
        column_names = schema.names
        storage_name_mapping = {k: k for k in column_names}
        column_index_names = [None]

    if pa.__version__ < distutils.version.LooseVersion('0.8.0'):
        # the pyarrow 0.7.0 *reader* expects the storage names for index names
        # that are None.
        if any(x is None for x in index_names):
            # Invert the mapping (dataframe name -> storage name).
            name_storage_mapping = {
                v: k
                for k, v in storage_name_mapping.items()
            }
            index_names = [
                name_storage_mapping.get(name, name) for name in index_names
            ]

    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    all_columns = index_names + column_names

    # Build the empty "meta" object; dtypes are keyed by dataframe name.
    dtypes = _get_pyarrow_dtypes(schema, categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names,
                             column_index_names)
    meta = clear_known_categories(meta, cols=categories)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    task_name = 'read-parquet-' + tokenize(fs_token, paths, all_columns)

    if dataset.pieces:
        # One partition per piece, with unknown divisions.
        divisions = (None, ) * (len(dataset.pieces) + 1)
        task_plan = {(task_name, i):
                     (_read_pyarrow_parquet_piece, fs, piece, column_names,
                      index_names, out_type == Series, dataset.partitions,
                      categories)
                     for i, piece in enumerate(dataset.pieces)}
    else:
        # Empty dataset: the single partition is the meta object itself.
        meta = strip_unknown_categories(meta)
        divisions = (None, None)
        task_plan = {(task_name, 0): meta}

    return out_type(task_plan, task_name, meta, divisions)
def initialize_write(
    df,
    fs,
    path,
    append=False,
    partition_on=None,
    ignore_divisions=False,
    division_info=None,
    schema=None,
    index_cols=None,
    **kwargs,
):
    """Prepare ``path`` for writing ``df`` as a parquet dataset.

    Optionally infers the pyarrow schema, creates the target directory and,
    when appending, validates that the new data is compatible with the
    dataset already stored at ``path``.

    Parameters
    ----------
    df : collection to be written; ``_meta_nonempty``, ``columns``,
        ``npartitions`` and ``to_delayed`` are used.
    fs : filesystem object (``mkdirs`` is called on it).
    path : destination directory.
    append : try to append to an existing dataset. Silently falls back to a
        fresh write when the existing dataset cannot be opened.
    partition_on : column names excluded from the column comparison when
        appending; ``None`` is treated as "no partition columns".
    ignore_divisions : skip the divisions-overlap check when appending.
    division_info : dict with ``"name"`` and ``"divisions"`` keys; ``None``
        disables the divisions check.
    schema : ``"infer"``, a dict of field overrides, or a value that is
        returned unchanged.
    index_cols : columns set as index before inferring the schema.
    **kwargs : accepted but not used by this function.

    Returns
    -------
    (fmd, schema, i_offset) : existing dataset metadata (or ``None``), the
        final schema, and the number of pieces already in the dataset.

    Raises
    ------
    NotImplementedError
        Appending with divisions checking but without dataset metadata.
    ValueError
        Appended columns/dtypes differ, or divisions overlap.
    """
    # Infer schema if "infer"
    # (also start with inferred schema if user passes a dict)
    if schema == "infer" or isinstance(schema, dict):

        # Start with schema from _meta_nonempty
        _schema = pa.Schema.from_pandas(
            df._meta_nonempty.set_index(index_cols)
            if index_cols
            else df._meta_nonempty
        )

        # Use dict to update our inferred schema
        if isinstance(schema, dict):
            schema = pa.schema(schema)
            for name in schema.names:
                dst = _schema.get_field_index(name)
                src = schema.get_field_index(name)
                _schema = _schema.set(dst, schema.field(src))

        # If we have object columns, we need to sample partitions
        # until we find non-null data for each column in `sample`
        sample = [col for col in df.columns if df[col].dtype == "object"]
        if schema_field_supported and sample and schema == "infer":
            delayed_schema_from_pandas = delayed(pa.Schema.from_pandas)
            for part_idx in range(df.npartitions):
                # Keep data on worker
                _s = delayed_schema_from_pandas(
                    df[sample].to_delayed()[part_idx]
                ).compute()
                for name, typ in zip(_s.names, _s.types):
                    if typ != "null":
                        dst = _schema.get_field_index(name)
                        src = _s.get_field_index(name)
                        _schema = _schema.set(dst, _s.field(src))
                        sample.remove(name)
                if not sample:
                    break

        # Final (inferred) schema
        schema = _schema

    dataset = fmd = None
    i_offset = 0
    if append and division_info is None:
        ignore_divisions = True
    fs.mkdirs(path, exist_ok=True)

    if append:
        try:
            # Allow append if the dataset exists.
            # Also need dataset.metadata object if
            # ignore_divisions is False (to check divisions)
            dataset = pq.ParquetDataset(path, filesystem=fs)
            if not dataset.metadata and not ignore_divisions:
                # TODO: Be more flexible about existing metadata.
                raise NotImplementedError(
                    "_metadata file needed to `append` "
                    "with `engine='pyarrow'` "
                    "unless `ignore_divisions` is `True`"
                )
            fmd = dataset.metadata
        except (IOError, ValueError, IndexError):
            # Original dataset does not exist - cannot append
            append = False

    if append:
        # NOTE(review): `dataset.metadata` may still be None here when
        # `ignore_divisions` is True and no `_metadata` file exists -- confirm
        # whether that case needs to be supported.
        names = dataset.metadata.schema.names
        arrow_schema = dataset.schema.to_arrow_schema()
        has_pandas_metadata = (
            arrow_schema.metadata is not None
            and b"pandas" in arrow_schema.metadata
        )
        if has_pandas_metadata:
            pandas_metadata = json.loads(
                arrow_schema.metadata[b"pandas"].decode("utf8")
            )
            categories = [
                c["name"]
                for c in pandas_metadata["columns"]
                if c["pandas_type"] == "categorical"
            ]
        else:
            categories = None
        dtypes = _get_pyarrow_dtypes(arrow_schema, categories)

        # `partition_on` defaults to None, which `set()` cannot consume.
        if set(names) != set(df.columns) - set(partition_on or []):
            raise ValueError(
                "Appended columns not the same.\n"
                "Previous: {} | New: {}".format(names, list(df.columns))
            )
        elif (pd.Series(dtypes).loc[names] != df[names].dtypes).any():
            # TODO Coerce values for compatible but different dtypes
            # `Series.items()` replaces `iteritems()`, removed in pandas 2.
            raise ValueError(
                "Appended dtypes differ.\n{}".format(
                    set(dtypes.items()) ^ set(df.dtypes.items())
                )
            )

        i_offset = len(dataset.pieces)

        # Without division info (or without the division column in the
        # stored data) there is nothing to compare against.
        if division_info is None or division_info["name"] not in names:
            ignore_divisions = True

        if not ignore_divisions:
            # Largest value of the division column found in the statistics
            # of the existing row groups.
            old_end = None
            row_groups = [
                dataset.metadata.row_group(i)
                for i in range(dataset.metadata.num_row_groups)
            ]
            for row_group in row_groups:
                for i, name in enumerate(names):
                    if name != division_info["name"]:
                        continue

                    column = row_group.column(i)
                    if column.statistics:
                        # Compare with None explicitly: a maximum of 0 is a
                        # valid value and must not be treated as "unset".
                        if old_end is None:
                            old_end = column.statistics.max
                        else:
                            old_end = max(old_end, column.statistics.max)
                        break

            divisions = division_info["divisions"]
            # When no statistics were found there is no previous end to
            # compare with, so the overlap check is skipped.
            if old_end is not None and divisions[0] < old_end:
                raise ValueError(
                    "Appended divisions overlapping with the previous ones"
                    " (set ignore_divisions=True to append anyway).\n"
                    "Previous: {} | New: {}".format(old_end, divisions[0])
                )

    return fmd, schema, i_offset
import pyarrow.parquet as pq
import s3fs

# Local parquet file used for this practice script.
file = "C:\\Users\\mlodhi\\OneDrive - Nice Systems Ltd\\Desktop\\Python Pract\\python_practice\\PractPackage\\files\\gender.parquet"

pq_data = pq.ParquetDataset(file)

# Load the same dataset twice: once via `read_pandas` and once via plain
# `read`, so the two resulting tables / DataFrames can be compared.
reader = pq_data.read_pandas()
data = pq_data.read()

# Convert both Arrow tables to pandas.
# (Handy checks while exploring: `reader.num_rows`, `reader.num_columns`,
# `reader.column_names`, `pq_df.shape`, `pq_df.isnull().any()`,
# `pq_df.index`, `pq_df.columns`, `pq_df['GENDER_CD']`.)
pq_df = reader.to_pandas()
dataframe = data.to_pandas()
def __init__(self, pyarrow_filesystem, dataset_path, schema_fields=None,
             shuffle_row_groups=True, shuffle_row_drop_partitions=1,
             predicate=None, rowgroup_selector=None, reader_pool=None, num_epochs=1,
             cur_shard=None, shard_count=None, cache=None, infer_schema=False):
    """Initializes a reader object.

    :param pyarrow_filesystem: An instance of ``pyarrow.FileSystem`` that will be used. If not specified,
        then a default one will be selected based on the url (only for ``hdfs://`` or ``file://``; for
        ``s3://`` support, use ``make_reader``). The default hdfs driver is ``libhdfs3``. If you want
        to use ``libhdfs``, use
        ``pyarrow_filesystem=pyarrow.hdfs.connect('hdfs:///some/path', driver='libhdfs')``.
    :param dataset_path: filepath to a parquet directory on the specified filesystem.
        e.g. ``'/user/yevgeni/parquet8'``, or ``'/tmp/mydataset'``.
    :param schema_fields: Either list of unischema fields to subset, or ``None`` to read all fields.
        OR an NGram object, then it will return an NGram of the specified properties.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of predicate object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
        This pool is a custom implementation used to parallelize reading data from the dataset.
        Any object from workers_pool package can be used
        (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number used. Each reader instance should
        pass in a unique shard number in the range ``[0, shard_count)``.
        ``shard_count`` must be supplied as well. Defaults to None
    :param shard_count: An int denoting the number of shard partitions there are. Defaults to None
    :param cache: An object conforming to :class:`.CacheBase` interface. Before loading row groups from a parquet
        file the Reader will attempt to load these values from cache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store entire dataset (or a partition of a dataset if shards are used).
        By default, use the :class:`.NullCache` implementation.
    :param infer_schema: When true, the schema is built from the arrow schema found in the metadata of
        the first dataset piece instead of being loaded from the stored dataset metadata.
        Defaults to False.
    :raises ValueError: if ``schema_fields`` is not ``None``, an iterable or an NGram.
    :raises NotImplementedError: if an NGram with ``timestamp_overlap=False`` is combined with
        ``shuffle_row_drop_partitions > 1``.
    """

    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` (the fallback keeps Python 2 working).
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    # 1. Open the parquet storage (dataset)
    # 2. Get a list of all groups
    # 3. Filter rowgroups
    #    a. predicates
    #    b. row-group selector (our indexing mechanism)
    #    c. partition: used to get a subset of data for distributed training
    # 4. Create a rowgroup ventilator object
    # 5. Start workers pool
    if not (isinstance(schema_fields, Iterable) or isinstance(schema_fields, NGram)
            or schema_fields is None):
        raise ValueError("""Fields must be either None, an iterable collection of Unischema fields or an NGram object.""")

    self.ngram = schema_fields if isinstance(schema_fields, NGram) else None

    if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
        raise NotImplementedError('Using timestamp_overlap=False is not implemented with'
                                  ' shuffle_options.shuffle_row_drop_partitions > 1')

    cache = cache or NullCache()

    self._workers_pool = reader_pool or ThreadPool(10)
    # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
    self.dataset = pq.ParquetDataset(dataset_path, filesystem=pyarrow_filesystem,
                                     validate_schema=False)

    if infer_schema:
        # If inferring schema, just retrieve the schema from a file of the dataset
        meta = self.dataset.pieces[0].get_metadata(self.dataset.fs.open)
        arrow_schema = meta.schema.to_arrow_schema()
        stored_schema = Unischema.from_arrow_schema(arrow_schema)
    else:
        # Otherwise, get the stored schema
        stored_schema = dataset_metadata.get_schema(self.dataset)

    # Make a schema view (a view is a Unischema containing only a subset of fields
    # Will raise an exception if invalid schema fields are in schema_fields
    fields = schema_fields if isinstance(schema_fields, Iterable) else None
    self.schema = stored_schema.create_schema_view(fields) if fields else stored_schema

    # 2. Get a list of all groups
    row_groups = dataset_metadata.load_row_groups(self.dataset, infer_schema)

    # 3. Filter rowgroups
    filtered_row_group_indexes, worker_predicate = self._filter_row_groups(self.dataset, row_groups, predicate,
                                                                           rowgroup_selector, cur_shard,
                                                                           shard_count)
    # 4. Create a rowgroup ventilator object
    normalized_shuffle_row_drop_partitions = \
        self._normalize_shuffle_options(shuffle_row_drop_partitions, self.dataset)
    ventilator = self._create_ventilator(filtered_row_group_indexes, shuffle_row_groups,
                                         normalized_shuffle_row_drop_partitions, num_epochs, worker_predicate,
                                         self._workers_pool.workers_count + _VENTILATE_EXTRA_ROWGROUPS)

    # 5. Start workers pool
    self._workers_pool.start(ReaderWorker, (pyarrow_filesystem, dataset_path, self.schema, self.ngram,
                                            row_groups, cache),
                             ventilator=ventilator)
    logger.debug('Workers pool started')

    self.last_row_consumed = False

    # _result
    self._result_buffer = []
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
    """Main worker function: load the rows of one piece and publish them.

    Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
    the rows are loaded through ``_load_rows_with_predicate``; otherwise they are loaded through the
    local cache, which calls ``_load_rows`` on a miss. Non-empty results are handed to
    ``self.publish_func``; nothing is returned.

    :param piece_index: index into ``self._split_pieces`` of the piece to load.
    :param worker_predicate: predicate used to filter rows, or a falsy value for no filtering.
    :param shuffle_row_drop_partition: A tuple 2 of the current row drop partition and the total number
        of partitions.
    :raises RuntimeError: if a non-null local cache is combined with a predicate or with more than one
        row drop partition.
    """
    # The dataset is opened on first use.
    if not self._dataset:
        self._dataset = pq.ParquetDataset(
            self._dataset_path_or_paths,
            filesystem=self._filesystem,
            validate_schema=False, filters=self._arrow_filters)

    # When read from parquet file list, the `dataset.partitions` will be None.
    # But other petastorm code require at least an empty `ParquetPartitions` object.
    if self._dataset.partitions is None:
        self._dataset.partitions = pq.ParquetPartitions()

    piece = self._split_pieces[piece_index]

    # Open the piece's file through the dataset's filesystem.
    parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

    caching_enabled = not isinstance(self._local_cache, NullCache)
    if caching_enabled and worker_predicate:
        raise RuntimeError('Local cache is not supported together with predicates, '
                           'unless the dataset is partitioned by the column the predicate operates on.')
    if caching_enabled and shuffle_row_drop_partition[1] != 1:
        raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

    if worker_predicate:
        all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate,
                                                  shuffle_row_drop_partition)
    else:
        # Using hash of the dataset path with the relative path in order to:
        #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
        #  2. Dataset path is hashed, to make sure we don't create too long keys, which maybe incompatible
        #     with some cache implementations
        #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
        dataset_paths = self._dataset_path_or_paths
        path_str = ','.join(dataset_paths) if isinstance(dataset_paths, list) else dataset_paths
        path_digest = hashlib.md5(path_str.encode('utf-8')).hexdigest()
        cache_key = '{}:{}:{}'.format(path_digest, piece.path, piece_index)
        all_cols = self._local_cache.get(
            cache_key,
            lambda: self._load_rows(parquet_file, piece, shuffle_row_drop_partition))

    if all_cols:
        self.publish_func(all_cols)
def load_chunk_as_tensor(self, chunk_idx):
    """Read chunk ``chunk_idx`` from parquet and return its values as a torch tensor.

    ``self[chunk_idx]`` is passed to ``pq.ParquetDataset`` as the location
    of the chunk; the data goes parquet -> pandas -> numpy -> torch.
    """
    print('Loading chunk %d from disk.' % chunk_idx)
    table = pq.ParquetDataset(self[chunk_idx]).read_pandas()
    frame = table.to_pandas()
    return torch.from_numpy(frame.values)
def prepare_read(self,
                 parallelism: int,
                 paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
                 _block_udf: Optional[Callable[[Block], Block]] = None,
                 **reader_args) -> List[ReadTask]:
    """Creates and returns read tasks for a Parquet file-based datasource.

    Args:
        parallelism: Number of groups the dataset pieces are split into;
            each non-empty group becomes one read task.
        paths: A single path or a list of paths.
        filesystem: Filesystem to read from; resolved together with
            ``paths`` by ``_resolve_paths_and_filesystem``.
        columns: Column names to read. When given, the schema is narrowed
            to these columns.
        schema: Schema to read with; defaults to the dataset's schema.
        _block_udf: Optional function applied to each table that is read.
            It is also applied to an empty table to infer the output schema.
        reader_args: ``dataset_kwargs`` is popped and passed to
            ``pq.ParquetDataset``; ``use_threads`` is popped inside the read
            function; everything else is passed to the fragment's
            ``to_table``.

    Returns:
        One ``ReadTask`` per non-empty group of pieces.
    """
    # NOTE: We override the base class FileBasedDatasource.prepare_read
    # method in order to leverage pyarrow's ParquetDataset abstraction,
    # which simplifies partitioning logic. We still use
    # FileBasedDatasource's write side (do_write), however.
    _check_pyarrow_version()
    from ray import cloudpickle
    import pyarrow as pa
    import pyarrow.parquet as pq
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    # A single resolved path is passed as a plain string, not a 1-item list.
    if len(paths) == 1:
        paths = paths[0]

    dataset_kwargs = reader_args.pop("dataset_kwargs", {})
    pq_ds = pq.ParquetDataset(
        paths,
        **dataset_kwargs,
        filesystem=filesystem,
        use_legacy_dataset=False)
    if schema is None:
        schema = pq_ds.schema
    if columns:
        # Narrow the schema to the requested columns, keeping its metadata.
        schema = pa.schema([schema.field(column) for column in columns],
                           schema.metadata)

    def read_pieces(serialized_pieces: List[str]) -> pa.Table:
        # Reads the given (pickled) fragments and returns one table.

        # Implicitly trigger S3 subsystem initialization by importing
        # pyarrow.fs.
        import pyarrow.fs  # noqa: F401

        # Deserialize after loading the filesystem class.
        pieces: List["pyarrow._dataset.ParquetFileFragment"] = [
            cloudpickle.loads(p) for p in serialized_pieces
        ]

        # Ensure that we're reading at least one dataset fragment.
        assert len(pieces) > 0

        from pyarrow.dataset import _get_partition_keys

        logger.debug(f"Reading {len(pieces)} parquet pieces")
        # NOTE: this pop mutates the `reader_args` dict captured by the
        # closure.
        use_threads = reader_args.pop("use_threads", False)
        tables = []
        for piece in pieces:
            table = piece.to_table(
                use_threads=use_threads,
                columns=columns,
                schema=schema,
                **reader_args)
            part = _get_partition_keys(piece.partition_expression)
            if part:
                # Overwrite each partition column with the constant value
                # taken from the fragment's partition expression.
                for col, value in part.items():
                    table = table.set_column(
                        table.schema.get_field_index(col), col,
                        pa.array([value] * len(table)))
            # If the table is empty, drop it.
            if table.num_rows > 0:
                tables.append(table)
        if len(tables) > 1:
            table = pa.concat_tables(tables, promote=True)
        elif len(tables) == 1:
            table = tables[0]
        if _block_udf is not None:
            table = _block_udf(table)
        # If len(tables) == 0, all fragments were empty, and we return the
        # empty table from the last fragment.
        return table

    if _block_udf is not None:
        # Try to infer dataset schema by passing dummy table through UDF.
        dummy_table = schema.empty_table()
        try:
            inferred_schema = _block_udf(dummy_table).schema
            inferred_schema = inferred_schema.with_metadata(
                schema.metadata)
        except Exception:
            # Best effort only: fall back to the un-transformed schema.
            logger.debug(
                "Failed to infer schema of dataset by passing dummy table "
                "through UDF due to the following exception:",
                exc_info=True)
            inferred_schema = schema
    else:
        inferred_schema = schema
    read_tasks = []
    serialized_pieces = [cloudpickle.dumps(p) for p in pq_ds.pieces]
    # Above the threshold, metadata is fetched by `_fetch_metadata_remotely`
    # from the serialized pieces instead of from the pieces directly.
    if len(pq_ds.pieces) > PARALLELIZE_META_FETCH_THRESHOLD:
        metadata = _fetch_metadata_remotely(serialized_pieces)
    else:
        metadata = _fetch_metadata(pq_ds.pieces)
    for piece_data in np.array_split(
            list(zip(pq_ds.pieces, serialized_pieces, metadata)),
            parallelism):
        if len(piece_data) == 0:
            continue
        pieces, serialized_pieces, metadata = zip(*piece_data)
        meta = _build_block_metadata(pieces, metadata, inferred_schema)
        # Bind the serialized pieces as a default argument so each task
        # keeps its own group.
        read_tasks.append(
            ReadTask(
                lambda pieces_=serialized_pieces: [read_pieces(pieces_)],
                meta))

    return read_tasks
def load_parquet_s3(file_system, bucket, file):
    """Read a Parquet dataset stored on S3 into a pandas DataFrame.

    Args:
        file_system: Filesystem object handed to ``pq.ParquetDataset`` as
            its ``filesystem`` argument.
        bucket: Bucket name, used as the first component of the S3 URL.
        file: Key of the dataset inside the bucket.

    Returns:
        The result of ``read_pandas().to_pandas()`` on the dataset.
    """
    s3_path = 's3://{}/{}'.format(bucket, file)
    # Use the filesystem the caller supplied; the previous code ignored the
    # ``file_system`` parameter and referenced an undefined name ``s3``.
    dataset = pq.ParquetDataset(s3_path, filesystem=file_system)
    return dataset.read_pandas().to_pandas()
def _read_pyarrow(fs, paths, file_opener, columns=None, filters=None,
                  categories=None, index=None):
    """Build a lazy Series/DataFrame with one read task per dataset piece.

    ``filters`` and ``categories`` are not supported and raise
    ``NotImplementedError`` when given. A non-list ``columns`` value yields
    a ``Series``; otherwise a ``DataFrame`` is returned. When ``index`` is
    ``None`` the index columns are taken from the pandas metadata stored in
    the schema, if there is any; ``index=False`` disables the index.
    """
    from ...bytes.core import get_pyarrow_filesystem
    import pyarrow.parquet as pq

    # Reject the options this reader cannot honour before touching storage.
    if filters is not None:
        raise NotImplementedError("Predicate pushdown not implemented")
    if categories is not None:
        raise NotImplementedError("Categorical reads not yet implemented")

    if isinstance(columns, tuple):
        columns = list(columns)

    dataset = pq.ParquetDataset(paths, filesystem=get_pyarrow_filesystem(fs))
    schema = dataset.schema.to_arrow_schema()
    task_name = 'read-parquet-' + tokenize(dataset, columns)

    all_columns = schema.names if columns is None else columns

    # Anything that is not a list is treated as a single column selection.
    as_series = not isinstance(all_columns, list)
    if as_series:
        all_columns = [all_columns]
    out_type = Series if as_series else DataFrame

    # Work out which columns form the index.
    if index is False:
        index_cols = []
    elif index is not None:
        index_cols = index if isinstance(index, list) else [index]
    elif schema.metadata is not None and b'pandas' in schema.metadata:
        pandas_metadata = json.loads(
            schema.metadata[b'pandas'].decode('utf8'))
        index_cols = pandas_metadata.get('index_columns', [])
    else:
        index_cols = []

    if index_cols:
        all_columns = list(unique(all_columns + index_cols))

    dtypes = _get_pyarrow_dtypes(schema)
    meta = _meta_from_dtypes(all_columns, schema.names, dtypes, index_cols)

    if as_series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    pieces = dataset.pieces
    if pieces:
        # Divisions are unknown: one ``None`` boundary per piece, plus one.
        divisions = (None, ) * (len(pieces) + 1)
        task_plan = {}
        for position, piece in enumerate(pieces):
            task_plan[(task_name, position)] = (
                _read_pyarrow_parquet_piece,
                file_opener,
                piece,
                all_columns,
                index_cols,
                as_series,
                dataset.partitions,
            )
    else:
        # No pieces at all: the single partition is just the empty meta.
        divisions = (None, None)
        task_plan = {(task_name, 0): meta}

    return out_type(task_plan, task_name, meta, divisions)
def _determine_dataset_parts(fs, paths, gather_statistics, filters,
                             dataset_kwargs):
    """ Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.

    Parameters
    ----------
    fs : filesystem object providing ``isdir``, ``glob`` and ``sep``
    paths : list of paths (several files, one directory, or one file)
    gather_statistics : anything other than ``False`` lets pyarrow scan
        the files
    filters : forwarded to ``pq.ParquetDataset`` where the whole dataset or
        its ``_metadata`` file is opened
    dataset_kwargs : dict of extra ``pq.ParquetDataset`` keyword arguments;
        it is not modified

    Returns
    -------
    (parts, dataset) : ``parts`` is a list of file paths, filled in only
        when the dataset was opened from a single file or from
        ``_common_metadata``; it is empty otherwise.
    """
    # Work on a copy: the directory branch below may inject
    # ``validate_schema=False`` and that must not leak into the caller's
    # dict.
    dataset_kwargs = dict(dataset_kwargs)
    parts = []
    if len(paths) > 1:
        if gather_statistics is not False:
            # This scans all the files
            dataset = pq.ParquetDataset(
                paths, filesystem=fs, filters=filters, **dataset_kwargs)
        else:
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # We have a _metadata file, lets use it
                dataset = pq.ParquetDataset(
                    base + fs.sep + "_metadata",
                    filesystem=fs,
                    filters=filters,
                    **dataset_kwargs,
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                dataset = pq.ParquetDataset(
                    paths[0], filesystem=fs, **dataset_kwargs)
                parts = [base + fs.sep + fn for fn in fns]
    else:
        if fs.isdir(paths[0]):
            # This is a directory, check for _metadata, then _common_metadata
            allpaths = fs.glob(paths[0] + fs.sep + "*")
            base, fns = _analyze_paths(allpaths, fs)
            relpaths = [
                path.replace(base, "").lstrip("/") for path in allpaths
            ]
            if ("_metadata" in relpaths
                    and "validate_schema" not in dataset_kwargs):
                dataset_kwargs["validate_schema"] = False
            if "_metadata" in relpaths or gather_statistics is not False:
                # Let arrow do its thing (use _metadata or scan files)
                dataset = pq.ParquetDataset(
                    paths, filesystem=fs, filters=filters, **dataset_kwargs)
            else:
                # Use _common_metadata file if it is available.
                # Otherwise, just use 0th file
                if "_common_metadata" in relpaths:
                    dataset = pq.ParquetDataset(
                        base + fs.sep + "_common_metadata",
                        filesystem=fs,
                        **dataset_kwargs,
                    )
                else:
                    dataset = pq.ParquetDataset(
                        allpaths[0], filesystem=fs, **dataset_kwargs)
                parts = [base + fs.sep + fn for fn in fns]
        else:
            # There is only one file to read
            dataset = pq.ParquetDataset(paths, filesystem=fs, **dataset_kwargs)
    return parts, dataset
def initialize_write(
    df,
    fs,
    path,
    append=False,
    partition_on=None,
    ignore_divisions=False,
    division_info=None,
    **kwargs,
):
    """Prepare ``path`` for writing and validate an append, if requested.

    The target directory is always created. When ``append`` is true and a
    dataset can be opened at ``path``, the new frame is checked against it:
    same columns (ignoring ``partition_on``), same dtypes, and, unless
    divisions are ignored, no overlap between the first new division and
    the largest value recorded in the existing row-group statistics.

    Parameters
    ----------
    df : frame about to be written
    fs : filesystem object; only ``mkdirs`` is called on it directly
    path : target directory
    append : whether to append to an existing dataset
    partition_on : columns used for partitioning, or ``None``
    ignore_divisions : skip the division-overlap check
    division_info : mapping with keys ``"name"`` and ``"divisions"``, or
        ``None`` (which forces ``ignore_divisions`` when appending)
    **kwargs : accepted and ignored

    Returns
    -------
    (fmd, i_offset) : the existing dataset metadata (``None`` when not
        appending) and the number of existing pieces (``0`` when not
        appending).

    Raises
    ------
    NotImplementedError
        Appending with division checks to a dataset without ``_metadata``.
    ValueError
        Columns, dtypes or divisions are incompatible with the existing
        dataset.
    """
    dataset = fmd = None
    i_offset = 0
    if append and division_info is None:
        ignore_divisions = True
    fs.mkdirs(path, exist_ok=True)

    if append:
        try:
            # Allow append if the dataset exists.
            # Also need dataset.metadata object if
            # ignore_divisions is False (to check divisions)
            dataset = pq.ParquetDataset(path, filesystem=fs)
            if not dataset.metadata and not ignore_divisions:
                # TODO: Be more flexible about existing metadata.
                raise NotImplementedError(
                    "_metadata file needed to `append` "
                    "with `engine='pyarrow'` "
                    "unless `ignore_divisions` is `True`")
            fmd = dataset.metadata
        except (IOError, ValueError, IndexError):
            # Original dataset does not exist - cannot append
            append = False

    if append:
        names = dataset.metadata.schema.names
        arrow_schema = dataset.schema.to_arrow_schema()
        has_pandas_metadata = (arrow_schema.metadata is not None
                               and b"pandas" in arrow_schema.metadata)
        if has_pandas_metadata:
            pandas_metadata = json.loads(
                arrow_schema.metadata[b"pandas"].decode("utf8"))
            categories = [
                c["name"] for c in pandas_metadata["columns"]
                if c["pandas_type"] == "categorical"
            ]
        else:
            categories = None
        dtypes = _get_pyarrow_dtypes(arrow_schema, categories)

        # ``partition_on`` defaults to None; treat that as "no partition
        # columns" instead of failing inside ``set(None)``.
        if set(names) != set(df.columns) - set(partition_on or []):
            raise ValueError("Appended columns not the same.\n"
                             "Previous: {} | New: {}".format(
                                 names, list(df.columns)))
        elif (pd.Series(dtypes).loc[names] != df[names].dtypes).any():
            # TODO Coerce values for compatible but different dtypes
            # ``Series.items`` replaces ``iteritems`` (removed in pandas 2).
            raise ValueError("Appended dtypes differ.\n{}".format(
                set(dtypes.items()) ^ set(df.dtypes.items())))

        i_offset = len(dataset.pieces)

        # Without division info there is nothing to compare against.
        if division_info is None or division_info["name"] not in names:
            ignore_divisions = True

        if not ignore_divisions:
            old_end = None
            row_groups = [
                dataset.metadata.row_group(i)
                for i in range(dataset.metadata.num_row_groups)
            ]
            for row_group in row_groups:
                for i, name in enumerate(names):
                    if name != division_info["name"]:
                        continue
                    column = row_group.column(i)
                    if column.statistics:
                        # Compare against None explicitly: a maximum of 0
                        # is a valid value and must not be discarded.
                        if old_end is None:
                            old_end = column.statistics.max
                        else:
                            old_end = max(old_end, column.statistics.max)
                        break

            divisions = division_info["divisions"]
            # With no statistics at all there is no previous end to check.
            if old_end is not None and divisions[0] < old_end:
                raise ValueError(
                    "Appended divisions overlapping with the previous ones"
                    " (set ignore_divisions=True to append anyway).\n"
                    "Previous: {} | New: {}".format(old_end, divisions[0]))

    return fmd, i_offset
def get_dataset(key):
    """Load the Parquet dataset stored under ``key`` as a pandas DataFrame.

    An ``s3fs.S3FileSystem`` is created from the module-level
    ``ceph_secret``, ``ceph_key`` and ``client_kwargs`` values and used as
    the filesystem for the read.
    """
    filesystem = s3fs.S3FileSystem(
        secret=ceph_secret,
        key=ceph_key,
        client_kwargs=client_kwargs,
    )
    parquet_dataset = pq.ParquetDataset(key, filesystem=filesystem)
    arrow_table = parquet_dataset.read_pandas()
    return arrow_table.to_pandas()
def __init__(self, pyarrow_filesystem, dataset_path, schema_fields=None,
             shuffle_row_groups=True, shuffle_row_drop_partitions=1,
             predicate=None, rowgroup_selector=None, reader_pool=None,
             num_epochs=1, cur_shard=None, shard_count=None, cache=None,
             worker_class=None, transform_spec=None, is_batched_reader=False,
             filters=None, shard_seed=None):
    """Initializes a reader object.

    :param pyarrow_filesystem: An instance of ``pyarrow.FileSystem`` that will be used. If not specified,
        then a default one will be selected based on the url (only for ``hdfs://`` or ``file://``; for
        ``s3://`` and ``gs://`` support, use ``make_reader``). The default hdfs driver is ``libhdfs3``.
        If you want to use ``libhdfs``, use
        ``pyarrow_filesystem=pyarrow.hdfs.connect('hdfs:///some/path', driver='libhdfs')``.
    :param dataset_path: filepath to a parquet directory or parquet file path list on the specified
        filesystem. e.g. ``'/user/yevgeni/parquet8'``, or ``'/tmp/mydataset'``,
        or ``[/tmp/mydataset/00000.parquet, /tmp/mydataset/00001.parquet]``
    :param schema_fields: Either list of unischema fields to subset, or ``None`` to read all fields.
        OR an NGram object, then it will return an NGram of the specified properties.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group
        and read the remaining rows in separate reads. It is recommended to keep this number below the
        regular row group size in order to not waste reads which drop all rows.
    :param predicate: instance of predicate object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
        This pool is a custom implementation used to parallelize reading data from the dataset.
        Any object from workers_pool package can be used
        (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number used. Each reader instance should
        pass in a unique shard number in the range ``[0, shard_count)``.
        ``shard_count`` must be supplied as well. Defaults to None
    :param shard_count: An int denoting the number of shard partitions there are. Defaults to None
    :param cache: An object conforming to :class:`.CacheBase` interface. Before loading row groups from a
        parquet file the Reader will attempt to load these values from cache. Caching is useful when
        communication to the main data store is either slow or expensive and the local machine has large
        enough storage to store entire dataset (or a partition of a dataset if shards are used).
        By default, use the :class:`.NullCache` implementation.
    :param worker_class: This is the class that will be instantiated on a different thread/process. Its
        responsibility is to load and filter the data. Defaults to ``PyDictReaderWorker``.
    :param transform_spec: When given, the reader's ``schema`` is the result of applying
        ``transform_schema`` to the stored schema view; it is also forwarded to the workers.
    :param is_batched_reader: Stored on the instance as ``self.is_batched_reader``.
    :param filters: (List[Tuple] or List[List[Tuple]]): Standard PyArrow filters.
        These will be applied when loading the parquet file with PyArrow. More information
        here: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
    :param shard_seed: Random seed to shuffle row groups for data sharding. Defaults to None
    :raises ValueError: ``schema_fields`` is neither ``None``, an iterable, nor an ``NGram``.
    :raises NotImplementedError: an ``NGram`` with ``timestamp_overlap=False`` is combined with
        ``shuffle_row_drop_partitions > 1``.
    :raises RuntimeError: the requested fields match nothing in the stored schema.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``.
    from collections.abc import Iterable

    self.num_epochs = num_epochs

    # 1. Open the parquet storage (dataset)
    # 2. Get a list of all groups
    # 3. Filter rowgroups
    #    a. predicates
    #    b. row-group selector (our indexing mechanism)
    #    c. partition: used to get a subset of data for distributed training
    # 4. Create a rowgroup ventilator object
    # 5. Start workers pool
    if not (isinstance(schema_fields, Iterable)
            or isinstance(schema_fields, NGram)
            or schema_fields is None):
        raise ValueError(
            'Fields must be either None, an iterable collection of Unischema fields '
            'or an NGram object.')

    self.is_batched_reader = is_batched_reader
    # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
    self.dataset = pq.ParquetDataset(dataset_path,
                                     filesystem=pyarrow_filesystem,
                                     validate_schema=False,
                                     metadata_nthreads=10,
                                     filters=filters)

    if self.dataset.partitions is None:
        # When read from parquet file list, the `dataset.partitions` will be None.
        # But other petastorm code require at least an empty `ParquetPartitions` object.
        self.dataset.partitions = pq.ParquetPartitions()

    stored_schema = infer_or_load_unischema(self.dataset)

    if isinstance(schema_fields, NGram):
        self.ngram = schema_fields
        self.ngram.resolve_regex_field_names(stored_schema)
    else:
        self.ngram = None

    # By default, use original method of working with list of dictionaries and not arrow tables
    worker_class = worker_class or PyDictReaderWorker
    self._results_queue_reader = worker_class.new_results_queue_reader()

    if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
        raise NotImplementedError(
            'Using timestamp_overlap=False is not implemented with'
            ' shuffle_options.shuffle_row_drop_partitions > 1')

    cache = cache or NullCache()

    self._workers_pool = reader_pool or ThreadPool(10)

    # Make a schema view (a view is a Unischema containing only a subset of fields
    # Will raise an exception if invalid schema fields are in schema_fields
    if self.ngram:
        fields = self.ngram.get_field_names_at_all_timesteps()
    else:
        fields = schema_fields if isinstance(schema_fields, Iterable) else None

    storage_schema = stored_schema.create_schema_view(fields) if fields else stored_schema
    if len(storage_schema.fields) == 0:
        raise RuntimeError(
            f"No fields matching the criteria '{fields}' were found in the dataset {dataset_path}."
        )
    if transform_spec:
        self.schema = transform_schema(storage_schema, transform_spec)
    else:
        self.schema = storage_schema

    # 2. Get a list of all row groups
    row_groups = dataset_metadata.load_row_groups(self.dataset)

    # 3. Filter rowgroups
    filtered_row_group_indexes, worker_predicate = self._filter_row_groups(
        self.dataset, row_groups, predicate, rowgroup_selector, cur_shard,
        shard_count, shard_seed)

    # 4. Create a rowgroup ventilator object
    normalized_shuffle_row_drop_partitions = \
        self._normalize_shuffle_options(shuffle_row_drop_partitions, self.dataset)
    self.ventilator = self._create_ventilator(
        filtered_row_group_indexes, shuffle_row_groups,
        normalized_shuffle_row_drop_partitions, self.num_epochs,
        worker_predicate,
        self._workers_pool.workers_count + _VENTILATE_EXTRA_ROWGROUPS)

    # 5. Start workers pool
    self._workers_pool.start(
        worker_class,
        (pyarrow_filesystem, dataset_path, storage_schema, self.ngram,
         row_groups, cache, transform_spec, self.schema, filters),
        ventilator=self.ventilator)
    logger.debug('Workers pool started')

    self.last_row_consumed = False
    self.stopped = False
def read_parquet(path,
                 engine: str = "auto",
                 columns=None,
                 groups_as_chunks=False,
                 use_arrow_dtype=None,
                 incremental_index=False,
                 storage_options=None,
                 memory_scale=None,
                 **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL.
        For file URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.parquet``.
        A file URL can also be a path to a directory that contains multiple
        partitioned parquet files. Both pyarrow and fastparquet support
        paths to directories as well as file URLs. A directory path could be:
        ``file://localhost/path/to/tables``.
        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. The default behavior is to try 'pyarrow',
        falling back to 'fastparquet' if 'pyarrow' is unavailable.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    groups_as_chunks : bool, default False
        If True, each row group corresponds to a chunk.
        If False, each file corresponds to a chunk.
        Only available for 'pyarrow' engine.
    incremental_index: bool, default False
        Create a new RangeIndex if the parquet file doesn't contain index
        columns.
    use_arrow_dtype: bool, default None
        If True, use arrow dtype to store columns.
    storage_options: dict, optional
        Options for storage connection.
    memory_scale: int, optional
        Scale that real memory occupation divided with raw file size.
    **kwargs
        Any additional kwargs are passed to the engine.

    Returns
    -------
    Mars DataFrame

    Raises
    ------
    TypeError
        ``path`` is a directory and the resolved engine is not 'pyarrow'.
    """
    engine_type = check_engine(engine)
    engine = get_engine(engine_type)

    if get_fs(path, storage_options).isdir(path):
        # If path is a directory, we will read as a partitioned datasets.
        if engine_type != 'pyarrow':
            # The two literals are concatenated; the trailing space keeps
            # the message from reading "frompartitioned".
            raise TypeError('Only support pyarrow engine when reading from '
                            'partitioned datasets.')
        dataset = pq.ParquetDataset(path)
        dtypes = dataset.schema.to_arrow_schema().empty_table().to_pandas(
        ).dtypes
        # Partition keys are exposed as categorical columns.
        for partition in dataset.partitions:
            dtypes[partition.name] = pd.CategoricalDtype()
    else:
        # Only the first matching file is inspected to obtain the dtypes.
        if not isinstance(path, list):
            file_path = glob(path, storage_options=storage_options)[0]
        else:
            file_path = path[0]

        with open_file(file_path, storage_options=storage_options) as f:
            dtypes = engine.read_dtypes(f)

        # NOTE(review): column selection and arrow-dtype conversion are
        # applied on this branch only -- confirm whether the partitioned
        # branch above should do the same.
        if columns:
            dtypes = dtypes[columns]

        if use_arrow_dtype is None:
            use_arrow_dtype = options.dataframe.use_arrow_dtype
        if use_arrow_dtype:
            dtypes = to_arrow_dtypes(dtypes)

    index_value = parse_index(pd.RangeIndex(-1))
    columns_value = parse_index(dtypes.index, store_data=True)
    op = DataFrameReadParquet(path=path,
                              engine=engine_type,
                              columns=columns,
                              groups_as_chunks=groups_as_chunks,
                              use_arrow_dtype=use_arrow_dtype,
                              read_kwargs=kwargs,
                              incremental_index=incremental_index,
                              storage_options=storage_options,
                              memory_scale=memory_scale)
    return op(index_value=index_value,
              columns_value=columns_value,
              dtypes=dtypes)
def get_parquet_dataset(self, path):
    """Open ``path`` as a ``pq.ParquetDataset``.

    The path is first mapped through ``self.get_localized_path`` and the
    dataset is opened on the filesystem from ``self.get_filesystem``.
    """
    localized_path = self.get_localized_path(path)
    filesystem = self.get_filesystem()
    return pq.ParquetDataset(localized_path, filesystem=filesystem)
date_list.add((today - datetime.timedelta(days=32)).strftime('%Y-%m')) date_list = sorted( list( set([(today - datetime.timedelta(days=x)).strftime('%Y-%m') for x in range(32)]))) print date_list paths = [] for month in date_list: year, month = month.split('-') paths.append('bnroths/chicago-data/%s/year=%s/month=%s' % (dataset, year, month)) print paths # exit(0) for path in paths: ds = pq.ParquetDataset( path_or_paths=path, # 'bnroths/chicago-data/%s' % dataset, filesystem=S3FS, validate_schema=False) columns = datasets[dataset]['columns'] dt = columns[1] table = ds.read() df = table.to_pandas() print df.columns print df.head() df['dt'] = df[dt].astype(str).str[:7] cnts = datasets[dataset]['cnts'] dts = [] groups = dict(list(df.groupby('dt'))) print groups.keys() for group in groups:
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None,
                                use_summary_metadata=False,
                                hdfs_driver='libhdfs3'):
    """
    Generates the metadata necessary to read an existing dataset as a petastorm dataset.

    The spark session is stopped (``spark.stop()``) once the metadata has been written.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g.
        :class:`examples.hello_world.generate_hello_world_dataset.HelloWorldSchema`)
    :param use_summary_metadata: when true, the parquet summary metadata file is written through the java
        ``ParquetOutputCommitter`` and any row-group information found in the previous ``_common_metadata``
        is copied into the new one.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :raises ValueError: ``unischema_class`` does not resolve to a ``Unischema`` instance, or no unischema
        could be found in the existing dataset.
    """
    sc = spark.sparkContext

    # The HDFS user is taken from the spark context, not from an argument.
    resolver = FilesystemResolver(dataset_url,
                                  sc._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver,
                                  user=spark.sparkContext.sparkUser())
    fs = resolver.filesystem()
    dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                filesystem=fs,
                                validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if not isinstance(schema, Unischema):
            # Interpolate the class name; passing it as a second exception
            # argument left the '%s' placeholder unformatted.
            raise ValueError(
                'The specified class %s is not an instance of a petastorm.Unischema object.'
                % unischema_class)
    else:
        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError(
                'Unischema class could not be located in existing dataset,'
                ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema,
                             use_summary_metadata=use_summary_metadata,
                             filesystem_factory=resolver.filesystem_factory()):
        if use_summary_metadata:
            # Inside the materialize dataset context we just need to write the metadata file as the schema will
            # be written by the context manager.
            # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
            # which will read all the footers of the dataset in parallel and merge them.
            hadoop_config = sc._jsc.hadoopConfiguration()
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
            parquet_output_committer.writeMetaDataFile(hadoop_config, Path(dataset_url))

    spark.stop()

    if use_summary_metadata and arrow_metadata:
        # When calling writeMetaDataFile it will overwrite the _common_metadata file which could have schema information
        # or row group indexers. Therefore we want to retain this information and will add it to the new
        # _common_metadata file. If we were using the old legacy metadata method this file wont be deleted
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY,
                                    metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                    metadata_dict[ROWGROUPS_INDEX_KEY])
def read_metadata(
    fs,
    paths,
    categories=None,
    index=None,
    gather_statistics=None,
    filters=None,
    **kwargs,
):
    """Collect the empty ``meta`` frame, statistics and parts of a dataset.

    Parameters
    ----------
    fs : filesystem passed to ``pq.ParquetDataset``
    paths : path or paths of the dataset
    categories : optional column names to treat as categorical
    index : optional index column name(s); when ``None`` the index names
        from the pandas metadata stored in the schema are used, if any
    gather_statistics : whether to collect row-group statistics. ``None``
        enables it only when the dataset metadata has exactly one row group
        per piece; it is always disabled when there are no pieces.
    filters : accepted but not used here
    **kwargs : only ``kwargs["dataset"]`` is read; it supplies extra
        ``pq.ParquetDataset`` keyword arguments

    Returns
    -------
    (meta, stats, parts) : ``meta`` is the empty frame describing the
        output, ``stats`` is a list with one dict per row group (or
        ``None`` when statistics were not gathered), and ``parts`` is a
        list of ``{"piece": ..., "kwargs": ...}`` dicts, one per piece in
        natural path order.

    Raises
    ------
    ValueError
        A partition name also appears among the stored columns, or none of
        the requested ``categories`` is an available column.
    """
    dataset = pq.ParquetDataset(paths, filesystem=fs,
                                **kwargs.get("dataset", {}))

    if dataset.partitions is not None:
        partitions = [
            n for n in dataset.partitions.partition_names if n is not None
        ]
    else:
        partitions = []

    schema = dataset.schema.to_arrow_schema()
    columns = None

    has_pandas_metadata = (schema.metadata is not None
                           and b"pandas" in schema.metadata)

    if has_pandas_metadata:
        pandas_metadata = json.loads(
            schema.metadata[b"pandas"].decode("utf8"))
        (
            index_names,
            column_names,
            storage_name_mapping,
            column_index_names,
        ) = _parse_pandas_metadata(pandas_metadata)
    else:
        index_names = []
        column_names = schema.names
        storage_name_mapping = {k: k for k in column_names}
        column_index_names = [None]

    if index is None and index_names:
        index = index_names

    if set(column_names).intersection(partitions):
        # The first list shown is the column names, so label it as such.
        raise ValueError("partition(s) should not exist in columns.\n"
                         "columns: {} | partitions: {}".format(
                             column_names, partitions))

    column_names, index_names = _normalize_index_columns(
        columns, column_names + partitions, index, index_names)

    all_columns = index_names + column_names

    pieces = sorted(dataset.pieces,
                    key=lambda piece: natural_sort_key(piece.path))

    # Check that categories are included in columns
    if categories and not set(categories).intersection(all_columns):
        raise ValueError("categories not in available columns.\n"
                         "categories: {} | columns: {}".format(
                             categories, list(all_columns)))

    dtypes = _get_pyarrow_dtypes(schema, categories)
    # Translate storage field names to the dataframe column names.
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    index_cols = index or ()
    meta = _meta_from_dtypes(all_columns, dtypes, index_cols,
                             column_index_names)

    meta = clear_known_categories(meta, cols=categories)

    if (gather_statistics is None and dataset.metadata
            and dataset.metadata.num_row_groups == len(pieces)):
        gather_statistics = True
    if not pieces:
        gather_statistics = False

    if gather_statistics:
        # Read from _metadata file
        if dataset.metadata and dataset.metadata.num_row_groups == len(
                pieces):
            row_groups = [
                dataset.metadata.row_group(i)
                for i in range(dataset.metadata.num_row_groups)
            ]
            names = dataset.metadata.schema.names
        else:
            # Read from each individual piece (quite possibly slow).
            row_groups = _get_md_row_groups(pieces)
            if row_groups:
                piece = pieces[0]
                md = piece.get_metadata()
                names = md.schema.names
            else:
                gather_statistics = False

    if gather_statistics:
        stats = []
        for row_group in row_groups:
            s = {"num-rows": row_group.num_rows, "columns": []}
            for i, name in enumerate(names):
                column = row_group.column(i)
                d = {"name": name}
                if column.statistics:
                    cs_min = column.statistics.min
                    cs_max = column.statistics.max
                    d.update({
                        "min": cs_min,
                        "max": cs_max,
                        "null_count": column.statistics.null_count,
                    })
                s["columns"].append(d)
            stats.append(s)
    else:
        stats = None

    if dataset.partitions:
        for partition in dataset.partitions:
            # ``index`` may be an empty list; check that before indexing it.
            if (isinstance(index, list) and index
                    and partition.name == index[0]):
                meta.index = pd.CategoricalIndex(categories=partition.keys,
                                                 name=index[0])
            elif partition.name == meta.index.name:
                meta.index = pd.CategoricalIndex(categories=partition.keys,
                                                 name=meta.index.name)
            elif partition.name in meta.columns:
                meta[partition.name] = pd.Categorical(
                    categories=partition.keys, values=[])

    # Create `parts` (list of row-group-descriptor dicts)
    parts = [{
        "piece": piece,
        "kwargs": {
            "partitions": dataset.partitions,
            "categories": categories
        },
    } for piece in pieces]

    return (meta, stats, parts)
def materialize_dataset(spark, dataset_url, schema, row_group_size_mb=None,
                        use_summary_metadata=False, filesystem_factory=None):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    Example:

    >>> spark = SparkSession.builder...
    >>> ds_url = 'hdfs:///path/to/my/dataset'
    >>> with materialize_dataset(spark, ds_url, MyUnischema, 64):
    >>>   spark.sparkContext.parallelize(range(0, 10)).
    >>>     ...
    >>>     .write.parquet(ds_url)
    >>> indexer = [SingleFieldIndexer(...)]
    >>> build_rowgroup_index(ds_url, spark.sparkContext, indexer)

    A user may provide their own recipe for creation of pyarrow filesystem object in ``filesystem_factory``
    argument (otherwise, petastorm will create a default one based on the url).

    The following example shows how a custom pyarrow HDFS filesystem, instantiated using ``libhdfs`` driver can be used
    during Petastorm dataset generation:

    >>> resolver=FilesystemResolver(dataset_url, spark.sparkContext._jsc.hadoopConfiguration(),
    >>>                             hdfs_driver='libhdfs')
    >>> with materialize_dataset(..., filesystem_factory=resolver.filesystem_factory()):
    >>>     ...

    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
    :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    :param use_summary_metadata: Whether to use the parquet summary metadata for row group indexing or a custom
        indexing method. The custom indexing method is more scalable for very large datasets.
    :param filesystem_factory: A filesystem factory function to be used when saving Petastorm specific metadata to the
        Parquet store.
    :raises PetastormMetadataGenerationError: the row groups cannot be loaded after the metadata was written.
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb, use_summary_metadata)
    try:
        yield
        # After job completes, add the unischema metadata and check for the metadata summary file
        if filesystem_factory is None:
            resolver = FilesystemResolver(
                dataset_url,
                spark.sparkContext._jsc.hadoopConfiguration(),
                user=spark.sparkContext.sparkUser())
            filesystem_factory = resolver.filesystem_factory()
            dataset_path = resolver.get_dataset_path()
        else:
            dataset_path = get_dataset_path(urlparse(dataset_url))
        filesystem = filesystem_factory()

        dataset = pq.ParquetDataset(dataset_path,
                                    filesystem=filesystem,
                                    validate_schema=False)

        _generate_unischema_metadata(dataset, schema)
        if not use_summary_metadata:
            _generate_num_row_groups_per_file(dataset, spark.sparkContext,
                                              filesystem_factory)

        # Reload the dataset to take into account the new metadata
        dataset = pq.ParquetDataset(dataset_path,
                                    filesystem=filesystem,
                                    validate_schema=False)
        try:
            # Try to load the row groups, if it fails that means the metadata was not generated properly
            load_row_groups(dataset)
        except PetastormMetadataError:
            raise PetastormMetadataGenerationError(
                'Could not find summary metadata file. The dataset will exist but you will need'
                ' to execute petastorm-generate-metadata.py before you can read your dataset '
                ' in order to generate the necessary metadata.'
                ' Try increasing spark driver memory next time and making sure you are'
                ' using parquet-mr >= 1.8.3')
    finally:
        # Always undo what _init_spark changed, including when the wrapped
        # block or the metadata generation above raised.
        _cleanup_spark(spark, spark_config, row_group_size_mb)
def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    columns: Optional[List[str]] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    meta_provider: ParquetMetadataProvider = DefaultParquetMetadataProvider(),
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a Parquet file-based datasource.

    Args:
        parallelism: Number of chunks the dataset pieces are split into;
            every non-empty chunk becomes one read task.
        paths: A path or list of paths, resolved together with
            ``filesystem`` by ``_resolve_paths_and_filesystem``.
        filesystem: Optional filesystem passed to ``pq.ParquetDataset``.
        columns: Optional column names to read. When given, the schema is
            narrowed to exactly these fields.
        schema: Optional schema; defaults to the dataset's own schema.
        meta_provider: Used to prefetch per-file metadata and to build the
            metadata attached to every read task.
        _block_udf: Optional function handed to the ``BlockOutputBuffer``
            that assembles the output blocks.
        **reader_args: Forwarded to every fragment's ``to_batches`` call.
            Two keys are consumed here instead of being forwarded:
            ``dataset_kwargs`` (extra ``pq.ParquetDataset`` arguments) and
            ``use_threads`` (defaults to ``False``).

    Returns:
        One ``ReadTask`` per non-empty chunk of dataset pieces.
    """
    # NOTE: We override the base class FileBasedDatasource.prepare_read
    # method in order to leverage pyarrow's ParquetDataset abstraction,
    # which simplifies partitioning logic. We still use
    # FileBasedDatasource's write side (do_write), however.
    _check_pyarrow_version()
    from ray import cloudpickle
    import pyarrow as pa
    import pyarrow.parquet as pq
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    if len(paths) == 1:
        paths = paths[0]

    dataset_kwargs = reader_args.pop("dataset_kwargs", {})
    # Resolve ``use_threads`` once, here. Popping it inside ``read_pieces``
    # mutated the captured ``reader_args`` on the first call, so any later
    # call on the same closure silently fell back to ``False``.
    use_threads = reader_args.pop("use_threads", False)
    pq_ds = pq.ParquetDataset(
        paths, **dataset_kwargs, filesystem=filesystem, use_legacy_dataset=False
    )
    if schema is None:
        schema = pq_ds.schema
    if columns:
        schema = pa.schema(
            [schema.field(column) for column in columns], schema.metadata
        )

    def read_pieces(serialized_pieces: bytes) -> Iterator[pa.Table]:
        """Deserialize a chunk of fragments and yield its output blocks."""
        # Implicitly trigger S3 subsystem initialization by importing
        # pyarrow.fs.
        import pyarrow.fs  # noqa: F401

        # Deserialize after loading the filesystem class.
        try:
            _register_parquet_file_fragment_serialization()
            pieces: List[
                "pyarrow._dataset.ParquetFileFragment"
            ] = cloudpickle.loads(serialized_pieces)
        finally:
            _deregister_parquet_file_fragment_serialization()

        # Ensure that we're reading at least one dataset fragment.
        assert len(pieces) > 0

        from pyarrow.dataset import _get_partition_keys

        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
        )

        logger.debug(f"Reading {len(pieces)} parquet pieces")
        for piece in pieces:
            part = _get_partition_keys(piece.partition_expression)
            batches = piece.to_batches(
                use_threads=use_threads,
                columns=columns,
                schema=schema,
                batch_size=PARQUET_READER_ROW_BATCH_SIZE,
                **reader_args,
            )
            for batch in batches:
                table = pyarrow.Table.from_batches([batch], schema=schema)
                if part:
                    # Overwrite each partition column with the constant
                    # value taken from the fragment's partition expression.
                    for col, value in part.items():
                        table = table.set_column(
                            table.schema.get_field_index(col),
                            col,
                            pa.array([value] * len(table)),
                        )
                # If the table is empty, drop it.
                if table.num_rows > 0:
                    output_buffer.add_block(table)
                    if output_buffer.has_next():
                        yield output_buffer.next()

        # Flush whatever is still buffered once every piece has been read.
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    if _block_udf is not None:
        # Try to infer dataset schema by passing dummy table through UDF.
        dummy_table = schema.empty_table()
        try:
            inferred_schema = _block_udf(dummy_table).schema
            inferred_schema = inferred_schema.with_metadata(schema.metadata)
        except Exception:
            # Best effort only: fall back to the un-transformed schema.
            logger.debug(
                "Failed to infer schema of dataset by passing dummy table "
                "through UDF due to the following exception:",
                exc_info=True,
            )
            inferred_schema = schema
    else:
        inferred_schema = schema

    read_tasks = []
    metadata = meta_provider.prefetch_file_metadata(pq_ds.pieces) or []
    try:
        _register_parquet_file_fragment_serialization()
        # Distinct loop names so the per-chunk metadata does not shadow the
        # dataset-wide ``metadata`` list.
        for piece_chunk, metadata_chunk in zip(
            np.array_split(pq_ds.pieces, parallelism),
            np.array_split(metadata, parallelism),
        ):
            if len(piece_chunk) <= 0:
                continue
            serialized_pieces = cloudpickle.dumps(piece_chunk)
            input_files = [p.path for p in piece_chunk]
            meta = meta_provider(
                input_files,
                inferred_schema,
                pieces=piece_chunk,
                prefetched_metadata=metadata_chunk,
            )
            # Bind the payload as a default argument so every task keeps
            # its own chunk (avoids the late-binding closure pitfall).
            read_tasks.append(
                ReadTask(lambda p=serialized_pieces: read_pieces(p), meta)
            )
    finally:
        _deregister_parquet_file_fragment_serialization()

    return read_tasks