def read_parquet(self, path, columns=None, metadata=None, schema=None,
                 use_threads=True, use_pandas_metadata=False):
    """
    Read Parquet data from path in file system. Can read from a single file
    or a directory of files

    Parameters
    ----------
    path : str
        Single file path or directory
    columns : List[str], optional
        Subset of columns to read
    metadata : pyarrow.parquet.FileMetaData
        Known metadata to validate files against
    schema : pyarrow.parquet.Schema
        Known schema to validate files against. Alternative to metadata
        argument
    use_threads : boolean, default True
        Perform multi-threaded column reads
    use_pandas_metadata : boolean, default False
        If True and file has custom pandas schema metadata, ensure that
        index columns are also loaded

    Returns
    -------
    table : pyarrow.Table
    """
    from pyarrow.parquet import ParquetDataset
    dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                             filesystem=self)
    return dataset.read(columns=columns, use_threads=use_threads,
                        use_pandas_metadata=use_pandas_metadata)
def read_parquet(self, path, columns=None, metadata=None, schema=None,
                 nthreads=1):
    """
    Read Parquet data from path in file system. Can read from a single file
    or a directory of files

    Parameters
    ----------
    path : str
        Single file path or directory
    columns : List[str], optional
        Subset of columns to read
    metadata : pyarrow.parquet.FileMetaData
        Known metadata to validate files against
    schema : pyarrow.parquet.Schema
        Known schema to validate files against. Alternative to metadata
        argument
    nthreads : int, default 1
        Number of columns to read in parallel. If > 1, requires that the
        underlying file source is threadsafe

    Returns
    -------
    table : pyarrow.Table
    """
    from pyarrow.parquet import ParquetDataset
    dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                             filesystem=self)
    return dataset.read(columns=columns, nthreads=nthreads)
def read_parquet(self, path, columns=None, metadata=None, schema=None,
                 nthreads=1, use_pandas_metadata=False):
    """
    Read Parquet data from path in file system. Can read from a single file
    or a directory of files

    Parameters
    ----------
    path : str
        Single file path or directory
    columns : List[str], optional
        Subset of columns to read
    metadata : pyarrow.parquet.FileMetaData
        Known metadata to validate files against
    schema : pyarrow.parquet.Schema
        Known schema to validate files against. Alternative to metadata
        argument
    nthreads : int, default 1
        Number of columns to read in parallel. If > 1, requires that the
        underlying file source is threadsafe
    use_pandas_metadata : boolean, default False
        If True and file has custom pandas schema metadata, ensure that
        index columns are also loaded

    Returns
    -------
    table : pyarrow.Table
    """
    from pyarrow.parquet import ParquetDataset
    dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                             filesystem=self)
    return dataset.read(columns=columns, nthreads=nthreads,
                        use_pandas_metadata=use_pandas_metadata)
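# Usage sketch for the read_parquet variants above -- a minimal example,
# assuming the legacy pyarrow filesystem API (pyarrow < 2.0) where
# LocalFileSystem exposes this method; the path and column names below are
# placeholders, not part of the original snippets.
from pyarrow.filesystem import LocalFileSystem

fs = LocalFileSystem()
table = fs.read_parquet('data/table.parquet', columns=['a', 'b'],
                        use_pandas_metadata=True)
df = table.to_pandas()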
def _read_parquet_arrow(self, hash_list):
    from pyarrow.parquet import ParquetDataset

    objfiles = [self._store.object_path(h) for h in hash_list]
    dataset = ParquetDataset(objfiles)
    table = dataset.read(nthreads=4)
    df = table.to_pandas()
    return df
def _read_parquet_arrow(self, hash_list):
    from pyarrow.parquet import ParquetDataset

    objfiles = [self.object_path(h) for h in hash_list]
    dataset = ParquetDataset(objfiles)
    try:
        table = dataset.read(use_threads=True)  # pyarrow >= 0.11
    except TypeError:
        table = dataset.read(nthreads=4)        # pyarrow < 0.11
    try:
        dataframe = table.to_pandas()
    except Exception:
        # Try again to convert the table after removing
        # the possibly buggy Pandas-specific metadata.
        meta = table.schema.metadata.copy()
        meta.pop(b'pandas')
        newtable = table.replace_schema_metadata(meta)
        dataframe = newtable.to_pandas()
    return dataframe
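# Standalone sketch of the same keyword-compatibility pattern, kept outside
# the store class for clarity; the file list passed in is hypothetical.
from pyarrow.parquet import ParquetDataset

def read_objects_to_pandas(paths):
    dataset = ParquetDataset(paths)
    try:
        table = dataset.read(use_threads=True)  # pyarrow >= 0.11
    except TypeError:
        table = dataset.read(nthreads=4)        # pyarrow < 0.11
    return table.to_pandas()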
async def test_local_arrow_storage_provider(
        tmp_path: Path, test_values: dt_test_values) -> None:
    test_table, visit_ids = test_values
    structured_provider = LocalArrowProvider(tmp_path)
    await structured_provider.init()
    for table_name, test_data in test_table.items():
        await structured_provider.store_record(
            TableName(table_name), test_data["visit_id"], test_data)
    token_list = []
    for i in visit_ids:
        token_list.append(await structured_provider.finalize_visit_id(i))
    await structured_provider.flush_cache()
    await asyncio.gather(*token_list)
    for table_name, test_data in test_table.items():
        dataset = ParquetDataset(tmp_path / table_name)
        df: DataFrame = dataset.read().to_pandas()
        assert df.shape[0] == 1
        for row in df.itertuples(index=False):
            if test_data["visit_id"] == INVALID_VISIT_ID:
                del test_data["visit_id"]
            assert row._asdict() == test_data
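# Minimal sketch of the directory read used in the assertions above
# (the directory name is a placeholder): ParquetDataset accepts a directory
# path and reads every Parquet fragment under it into a single Table.
from pathlib import Path
from pyarrow.parquet import ParquetDataset

table_dir = Path("datadir") / "site_visits"
df = ParquetDataset(table_dir).read().to_pandas()
print(df.shape, list(df.columns))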
def _build_node(build_dir, package, node_path, node, checks_contents=None,
                dry_run=False, env='default', ancestor_args={}):
    """
    Parameters
    ----------
    ancestor_args : dict
        any transform inherited from an ancestor
        plus any inherited handler kwargs
        Users can thus define kwargs that affect entire subtrees
        (e.g. transform: csv for 500 .txt files)
        and overriding of ancestor or peer values.
        Child transform or kwargs override ancestor k:v pairs.
    """
    if _is_internal_node(node):
        if not dry_run:
            package.save_group(node_path, None)

        # Make a consumable copy. This is to cover a quirk introduced by accepting nodes named
        # like RESERVED keys -- if a RESERVED key is actually matched, it should be removed from
        # the node, or it gets treated like a subnode (or like a node with invalid content)
        node = node.copy()

        # NOTE: YAML parsing does not guarantee key order
        # fetch local transform and kwargs values; we do it using ifs
        # to prevent `key: None` from polluting the update
        local_args = _get_local_args(node, [RESERVED['transform'], RESERVED['kwargs']])
        group_args = ancestor_args.copy()
        group_args.update(local_args)
        _consume(node, local_args)

        # if it's not a reserved word it's a group that we can descend
        groups = {k: v for k, v in iteritems(node) if _is_valid_group(v)}
        _consume(node, groups)

        if node:
            # Unused keys -- either keyword typos or node names with invalid values.
            # For now, until build.yml schemas, pointing out one should do.
            key, value = node.popitem()
            raise BuildException(
                "Invalid syntax: expected node data for {!r}, got {!r}".format(key, value))

        for child_name, child_table in groups.items():
            if glob.has_magic(child_name):
                # child_name is a glob string, use it to generate multiple child nodes
                for gchild_name, gchild_table in _gen_glob_data(build_dir, child_name, child_table):
                    _build_node(build_dir, package, node_path + [gchild_name], gchild_table,
                                checks_contents=checks_contents, dry_run=dry_run, env=env,
                                ancestor_args=group_args)
            else:
                if not isinstance(child_name, str) or not is_nodename(child_name):
                    raise StoreException("Invalid node name: %r" % child_name)
                _build_node(build_dir, package, node_path + [child_name], child_table,
                            checks_contents=checks_contents, dry_run=dry_run, env=env,
                            ancestor_args=group_args)
    else:  # leaf node
        # prevent overwriting existing node names
        if '/'.join(node_path) in package:
            raise BuildException(
                "Naming conflict: {!r} added to package more than once".format('/'.join(node_path)))

        # handle group leaf nodes (empty groups)
        if not node:
            if not dry_run:
                package.save_group(node_path, None)
            return

        include_package = node.get(RESERVED['package'])
        rel_path = node.get(RESERVED['file'])
        if rel_path and include_package:
            raise BuildException(
                "A node must define only one of {0} or {1}".format(
                    RESERVED['file'], RESERVED['package']))
        elif include_package:  # package composition
            team, user, pkgname, subpath = parse_package(include_package, allow_subpath=True)
            existing_pkg = PackageStore.find_package(team, user, pkgname)
            if existing_pkg is None:
                raise BuildException("Package not found: %s" % include_package)

            if subpath:
                try:
                    node = existing_pkg["/".join(subpath)]
                except KeyError:
                    msg = "Package {team}:{owner}/{pkg} has no subpackage: {subpath}"
                    raise BuildException(
                        msg.format(team=team, owner=user, pkg=pkgname, subpath=subpath))
            else:
                node = GroupNode(existing_pkg.get_contents().children)
            package.save_package_tree(node_path, node)
        elif rel_path:  # handle nodes built from input files
            path = os.path.join(build_dir, rel_path)

            rel_meta_path = node.get(RESERVED['meta'])
            if rel_meta_path:
                with open(os.path.join(build_dir, rel_meta_path)) as fd:
                    try:
                        metadata = json.load(fd)
                    except ValueError as ex:
                        raise BuildException("Failed to parse %r as JSON: %s" % (rel_meta_path, ex))
                    if SYSTEM_METADATA in metadata:
                        raise BuildException(
                            "Invalid metadata in %r: not allowed to use key %r" %
                            (rel_meta_path, SYSTEM_METADATA))
            else:
                metadata = None

            # get either the locally defined transform and target or inherit from an ancestor
            transform = node.get(RESERVED['transform']) or ancestor_args.get(RESERVED['transform'])

            ID = 'id'            # pylint:disable=C0103
            PARQUET = 'parquet'  # pylint:disable=C0103
            if transform:
                transform = transform.lower()
                if transform in PANDAS_PARSERS:
                    target = TargetType.PANDAS
                elif transform == PARQUET:
                    target = TargetType.PANDAS
                elif transform == ID:
                    target = TargetType.FILE
                else:
                    raise BuildException("Unknown transform '%s' for %s" % (transform, rel_path))
            else:
                # Guess transform and target based on file extension if not provided
                _, ext = splitext_no_dot(rel_path)
                if ext in PANDAS_PARSERS:
                    transform = ext
                    target = TargetType.PANDAS
                elif ext == PARQUET:
                    transform = ext
                    target = TargetType.PANDAS
                else:
                    transform = ID
                    target = TargetType.FILE
                print("Inferring 'transform: %s' for %s" % (transform, rel_path))

            # TODO: parse/check environments:
            # environments = node.get(RESERVED['environments'])

            checks = node.get(RESERVED['checks'])
            if transform == ID:
                # TODO: move this to a separate function
                if checks:
                    with open(path, 'r') as fd:
                        data = fd.read()
                        _run_checks(data, checks, checks_contents, node_path, rel_path,
                                    target, env=env)
                if not dry_run:
                    print("Registering %s..." % path)
                    package.save_file(path, node_path, target, rel_path, transform, metadata)
            elif transform == PARQUET:
                if checks:
                    from pyarrow.parquet import ParquetDataset
                    dataset = ParquetDataset(path)
                    table = dataset.read(nthreads=4)
                    dataframe = table.to_pandas()
                    _run_checks(dataframe, checks, checks_contents, node_path, rel_path,
                                target, env=env)
                if not dry_run:
                    print("Registering %s..." % path)
                    package.save_file(path, node_path, target, rel_path, transform, metadata)
            else:
                # copy so we don't modify shared ancestor_args
                handler_args = dict(ancestor_args.get(RESERVED['kwargs'], {}))
                # local kwargs win the update
                handler_args.update(node.get(RESERVED['kwargs'], {}))

                # Check Cache
                store = PackageStore()
                path_hash = _path_hash(path, transform, handler_args)
                source_hash = digest_file(path)

                cachedobjs = []
                if os.path.exists(store.cache_path(path_hash)):
                    with open(store.cache_path(path_hash), 'r') as entry:
                        cache_entry = json.load(entry)
                        if cache_entry['source_hash'] == source_hash:
                            cachedobjs = cache_entry['obj_hashes']
                            assert isinstance(cachedobjs, list)

                # TODO: check for changes in checks else use cache
                # below is a heavy-handed fix but it's OK for check builds to be slow
                if not checks and cachedobjs and all(os.path.exists(store.object_path(obj))
                                                     for obj in cachedobjs):
                    # Use existing objects instead of rebuilding
                    package.save_cached_df(cachedobjs, node_path, target, rel_path,
                                           transform, metadata)
                else:
                    # read source file into DataFrame
                    print("Serializing %s..." % path)
                    if _have_pyspark():
                        dataframe = _file_to_spark_data_frame(transform, path, handler_args)
                    else:
                        dataframe = _file_to_data_frame(transform, path, handler_args)

                    if checks:
                        # TODO: test that design works for internal nodes... e.g. iterating
                        # over the children and getting/checking the data, err msgs, etc.
                        _run_checks(dataframe, checks, checks_contents, node_path, rel_path,
                                    target, env=env)

                    # serialize DataFrame to file(s)
                    if not dry_run:
                        print("Saving as binary dataframe...")
                        obj_hashes = package.save_df(dataframe, node_path, target, rel_path,
                                                     transform, metadata)

                        # Add to cache
                        cache_entry = dict(source_hash=source_hash, obj_hashes=obj_hashes)
                        with open(store.cache_path(path_hash), 'w') as entry:
                            json.dump(cache_entry, entry)
        else:  # rel_path and package are both None
            raise BuildException(
                "Leaf nodes must define either a %s or %s key" %
                (RESERVED['file'], RESERVED['package']))
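# Sketch of the source-hash cache lookup used in _build_node above, with
# stand-in helpers; digest_file here is an assumption that mirrors the name
# used in the function, not the project's actual implementation.
import hashlib
import json
import os

def digest_file(path, chunk_size=1 << 20):
    # Hash the raw file contents in chunks so large inputs stay memory-friendly.
    digest = hashlib.sha256()
    with open(path, 'rb') as fd:
        for chunk in iter(lambda: fd.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def cached_obj_hashes(cache_file, source_hash):
    # Reuse previously built object hashes only when the source file is unchanged.
    if not os.path.exists(cache_file):
        return []
    with open(cache_file, 'r') as entry:
        cache_entry = json.load(entry)
    if cache_entry['source_hash'] == source_hash:
        return cache_entry['obj_hashes']
    return []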