Example #1
        def read_parquet(self, path, columns=None, metadata=None, schema=None,
                         use_threads=True, use_pandas_metadata=False):
            """
            Read Parquet data from path in file system. Can read from a single file
            or a directory of files

            Parameters
            ----------
            path : str
                Single file path or directory
            columns : List[str], optional
                Subset of columns to read
            metadata : pyarrow.parquet.FileMetaData
                Known metadata to validate files against
            schema : pyarrow.parquet.Schema
                Known schema to validate files against. Alternative to metadata
                argument
            use_threads : boolean, default True
                Perform multi-threaded column reads
            use_pandas_metadata : boolean, default False
                If True and file has custom pandas schema metadata, ensure that
                index columns are also loaded

            Returns
            -------
            table : pyarrow.Table
            """
            from pyarrow.parquet import ParquetDataset
            dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                                     filesystem=self)
            return dataset.read(columns=columns, use_threads=use_threads,
                                use_pandas_metadata=use_pandas_metadata)
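The method above is a thin wrapper around pyarrow.parquet.ParquetDataset. A minimal standalone sketch of the equivalent call is shown below; the directory path and column names are illustrative, not taken from the original code:

    import pyarrow.parquet as pq

    # Hypothetical directory containing one or more Parquet files.
    dataset = pq.ParquetDataset("data/events/")
    # Read a subset of columns with multi-threaded column reads (pyarrow >= 0.11).
    table = dataset.read(columns=["user_id", "ts"], use_threads=True)
    df = table.to_pandas()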
Example #2
    def read_parquet(self,
                     path,
                     columns=None,
                     metadata=None,
                     schema=None,
                     nthreads=1):
        """
        Read Parquet data from path in file system. Can read from a single file
        or a directory of files

        Parameters
        ----------
        path : str
            Single file path or directory
        columns : List[str], optional
            Subset of columns to read
        metadata : pyarrow.parquet.FileMetaData
            Known metadata to validate files against
        schema : pyarrow.parquet.Schema
            Known schema to validate files against. Alternative to metadata
            argument
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that the
            underlying file source is threadsafe

        Returns
        -------
        table : pyarrow.Table
        """
        from pyarrow.parquet import ParquetDataset
        dataset = ParquetDataset(path,
                                 schema=schema,
                                 metadata=metadata,
                                 filesystem=self)
        return dataset.read(columns=columns, nthreads=nthreads)
Example #3
    def read_parquet(self, path, columns=None, metadata=None, schema=None,
                     nthreads=1, use_pandas_metadata=False):
        """
        Read Parquet data from path in file system. Can read from a single file
        or a directory of files

        Parameters
        ----------
        path : str
            Single file path or directory
        columns : List[str], optional
            Subset of columns to read
        metadata : pyarrow.parquet.FileMetaData
            Known metadata to validate files against
        schema : pyarrow.parquet.Schema
            Known schema to validate files against. Alternative to metadata
            argument
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that the
            underlying file source is threadsafe
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded

        Returns
        -------
        table : pyarrow.Table
        """
        from pyarrow.parquet import ParquetDataset
        dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                                 filesystem=self)
        return dataset.read(columns=columns, nthreads=nthreads,
                            use_pandas_metadata=use_pandas_metadata)
Example #4
    def read_parquet(self, path, columns=None, metadata=None, schema=None,
                     use_threads=True, use_pandas_metadata=False):
        """
        Read Parquet data from path in file system. Can read from a single file
        or a directory of files

        Parameters
        ----------
        path : str
            Single file path or directory
        columns : List[str], optional
            Subset of columns to read
        metadata : pyarrow.parquet.FileMetaData
            Known metadata to validate files against
        schema : pyarrow.parquet.Schema
            Known schema to validate files against. Alternative to metadata
            argument
        use_threads : boolean, default True
            Perform multi-threaded column reads
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded

        Returns
        -------
        table : pyarrow.Table
        """
        from pyarrow.parquet import ParquetDataset
        dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                                 filesystem=self)
        return dataset.read(columns=columns, use_threads=use_threads,
                            use_pandas_metadata=use_pandas_metadata)
Example #5
    def _read_parquet_arrow(self, hash_list):
        from pyarrow.parquet import ParquetDataset

        objfiles = [self._store.object_path(h) for h in hash_list]
        dataset = ParquetDataset(objfiles)
        table = dataset.read(nthreads=4)
        df = table.to_pandas()
        return df
Example #6
    def _read_parquet_arrow(self, hash_list):
        from pyarrow.parquet import ParquetDataset

        objfiles = [self.object_path(h) for h in hash_list]
        dataset = ParquetDataset(objfiles)
        try:
            table = dataset.read(use_threads=True)  # pyarrow >= 0.11
        except TypeError:
            table = dataset.read(nthreads=4)  # pyarrow < 0.11
        try:
            dataframe = table.to_pandas()
        except Exception:
            # Try again to convert the table after removing
            # the possibly buggy Pandas-specific metadata.
            meta = table.schema.metadata.copy()
            meta.pop(b'pandas', None)
            newtable = table.replace_schema_metadata(meta)
            dataframe = newtable.to_pandas()

        return dataframe
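The try/except above bridges the pyarrow 0.11 API change that renamed nthreads to use_threads. The same compatibility shim, factored into a reusable helper, might look like the sketch below (the helper name is illustrative and not part of the original code):

    def _read_dataset_compat(dataset, **kwargs):
        """Call ParquetDataset.read() across pyarrow versions."""
        try:
            # pyarrow >= 0.11 accepts use_threads.
            return dataset.read(use_threads=True, **kwargs)
        except TypeError:
            # Older pyarrow releases expect the nthreads argument instead.
            return dataset.read(nthreads=4, **kwargs)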
Example #7
async def test_local_arrow_storage_provider(
        tmp_path: Path, test_values: dt_test_values) -> None:
    test_table, visit_ids = test_values
    structured_provider = LocalArrowProvider(tmp_path)
    await structured_provider.init()
    for table_name, test_data in test_table.items():
        await structured_provider.store_record(TableName(table_name),
                                               test_data["visit_id"],
                                               test_data)
    token_list = []
    for i in visit_ids:
        token_list.append(await structured_provider.finalize_visit_id(i))
    await structured_provider.flush_cache()
    await asyncio.gather(*token_list)
    for table_name, test_data in test_table.items():
        dataset = ParquetDataset(tmp_path / table_name)
        df: DataFrame = dataset.read().to_pandas()
        assert df.shape[0] == 1
        for row in df.itertuples(index=False):
            if test_data["visit_id"] == INVALID_VISIT_ID:
                del test_data["visit_id"]
            assert row._asdict() == test_data
Example #8
def _build_node(build_dir,
                package,
                node_path,
                node,
                checks_contents=None,
                dry_run=False,
                env='default',
                ancestor_args={}):
    """
    Parameters
    ----------
    ancestor_args : dict
      any transform inherited from an ancestor
      plus any inherited handler kwargs
      Users can thus define kwargs that affect entire subtrees
      (e.g. transform: csv for 500 .txt files)
      and overriding of ancestor or peer values.
      Child transform or kwargs override ancestor k:v pairs.
    """
    if _is_internal_node(node):
        if not dry_run:
            package.save_group(node_path, None)

        # Make a consumable copy.  This is to cover a quirk introduced by accepting nodes named
        # like RESERVED keys -- if a RESERVED key is actually matched, it should be removed from
        # the node, or it gets treated like a subnode (or like a node with invalid content)
        node = node.copy()

        # NOTE: YAML parsing does not guarantee key order
        # fetch local transform and kwargs values; we do it using ifs
        # to prevent `key: None` from polluting the update
        local_args = _get_local_args(
            node, [RESERVED['transform'], RESERVED['kwargs']])
        group_args = ancestor_args.copy()
        group_args.update(local_args)
        _consume(node, local_args)

        # if it's not a reserved word it's a group that we can descend
        groups = {k: v for k, v in iteritems(node) if _is_valid_group(v)}
        _consume(node, groups)

        if node:
            # Unused keys -- either keyword typos or node names with invalid values.
            #   For now, until build.yml has a schema, reporting one of them is enough.
            key, value = node.popitem()
            raise BuildException(
                "Invalid syntax: expected node data for {!r}, got {!r}".format(
                    key, value))
        for child_name, child_table in groups.items():
            if glob.has_magic(child_name):
                # child_name is a glob string, use it to generate multiple child nodes
                for gchild_name, gchild_table in _gen_glob_data(
                        build_dir, child_name, child_table):
                    _build_node(build_dir,
                                package,
                                node_path + [gchild_name],
                                gchild_table,
                                checks_contents=checks_contents,
                                dry_run=dry_run,
                                env=env,
                                ancestor_args=group_args)
            else:
                if not isinstance(child_name,
                                  str) or not is_nodename(child_name):
                    raise StoreException("Invalid node name: %r" % child_name)
                _build_node(build_dir,
                            package,
                            node_path + [child_name],
                            child_table,
                            checks_contents=checks_contents,
                            dry_run=dry_run,
                            env=env,
                            ancestor_args=group_args)
    else:  # leaf node
        # prevent overwriting existing node names
        if '/'.join(node_path) in package:
            raise BuildException(
                "Naming conflict: {!r} added to package more than once".format(
                    '/'.join(node_path)))
        # handle group leaf nodes (empty groups)
        if not node:
            if not dry_run:
                package.save_group(node_path, None)
            return

        include_package = node.get(RESERVED['package'])
        rel_path = node.get(RESERVED['file'])
        if rel_path and include_package:
            raise BuildException(
                "A node must define only one of {0} or {1}".format(
                    RESERVED['file'], RESERVED['package']))
        elif include_package:  # package composition
            team, user, pkgname, subpath = parse_package(include_package,
                                                         allow_subpath=True)
            existing_pkg = PackageStore.find_package(team, user, pkgname)
            if existing_pkg is None:
                raise BuildException("Package not found: %s" % include_package)

            if subpath:
                try:
                    node = existing_pkg["/".join(subpath)]
                except KeyError:
                    msg = "Package {team}:{owner}/{pkg} has no subpackage: {subpath}"
                    raise BuildException(
                        msg.format(team=team,
                                   owner=user,
                                   pkg=pkgname,
                                   subpath=subpath))
            else:
                node = GroupNode(existing_pkg.get_contents().children)
            package.save_package_tree(node_path, node)
        elif rel_path:  # handle nodes built from input files
            path = os.path.join(build_dir, rel_path)

            rel_meta_path = node.get(RESERVED['meta'])
            if rel_meta_path:
                with open(os.path.join(build_dir, rel_meta_path)) as fd:
                    try:
                        metadata = json.load(fd)
                    except ValueError as ex:
                        raise BuildException("Failed to parse %r as JSON: %s" %
                                             (rel_meta_path, ex))
                    if SYSTEM_METADATA in metadata:
                        raise BuildException(
                            "Invalid metadata in %r: not allowed to use key %r"
                            % (rel_meta_path, SYSTEM_METADATA))
            else:
                metadata = None

            # get either the locally defined transform and target or inherit from an ancestor
            transform = node.get(RESERVED['transform']) or ancestor_args.get(
                RESERVED['transform'])

            ID = 'id'  # pylint:disable=C0103
            PARQUET = 'parquet'  # pylint:disable=C0103
            if transform:
                transform = transform.lower()
                if transform in PANDAS_PARSERS:
                    target = TargetType.PANDAS
                elif transform == PARQUET:
                    target = TargetType.PANDAS
                elif transform == ID:
                    target = TargetType.FILE
                else:
                    raise BuildException("Unknown transform '%s' for %s" %
                                         (transform, rel_path))
            else:
                # Guess transform and target based on file extension if not provided
                _, ext = splitext_no_dot(rel_path)

                if ext in PANDAS_PARSERS:
                    transform = ext
                    target = TargetType.PANDAS
                elif ext == PARQUET:
                    transform = ext
                    target = TargetType.PANDAS
                else:
                    transform = ID
                    target = TargetType.FILE
                print("Inferring 'transform: %s' for %s" %
                      (transform, rel_path))

            # TODO: parse/check environments:
            # environments = node.get(RESERVED['environments'])
            checks = node.get(RESERVED['checks'])
            if transform == ID:
                #TODO move this to a separate function
                if checks:
                    with open(path, 'r') as fd:
                        data = fd.read()
                        _run_checks(data,
                                    checks,
                                    checks_contents,
                                    node_path,
                                    rel_path,
                                    target,
                                    env=env)
                if not dry_run:
                    print("Registering %s..." % path)
                    package.save_file(path, node_path, target, rel_path,
                                      transform, metadata)
            elif transform == PARQUET:
                if checks:
                    from pyarrow.parquet import ParquetDataset
                    dataset = ParquetDataset(path)
                    table = dataset.read(nthreads=4)
                    dataframe = table.to_pandas()
                    _run_checks(dataframe,
                                checks,
                                checks_contents,
                                node_path,
                                rel_path,
                                target,
                                env=env)
                if not dry_run:
                    print("Registering %s..." % path)
                    package.save_file(path, node_path, target, rel_path,
                                      transform, metadata)
            else:
                # copy so we don't modify shared ancestor_args
                handler_args = dict(ancestor_args.get(RESERVED['kwargs'], {}))
                # local kwargs win the update
                handler_args.update(node.get(RESERVED['kwargs'], {}))
                # Check Cache
                store = PackageStore()
                path_hash = _path_hash(path, transform, handler_args)
                source_hash = digest_file(path)

                cachedobjs = []
                if os.path.exists(store.cache_path(path_hash)):
                    with open(store.cache_path(path_hash), 'r') as entry:
                        cache_entry = json.load(entry)
                        if cache_entry['source_hash'] == source_hash:
                            cachedobjs = cache_entry['obj_hashes']
                            assert isinstance(cachedobjs, list)

                # TODO: check for changes in checks else use cache
                # below is a heavy-handed fix but it's OK for check builds to be slow
                if not checks and cachedobjs and all(
                        os.path.exists(store.object_path(obj))
                        for obj in cachedobjs):
                    # Use existing objects instead of rebuilding
                    package.save_cached_df(cachedobjs, node_path, target,
                                           rel_path, transform, metadata)
                else:
                    # read source file into DataFrame
                    print("Serializing %s..." % path)
                    if _have_pyspark():
                        dataframe = _file_to_spark_data_frame(
                            transform, path, handler_args)
                    else:
                        dataframe = _file_to_data_frame(
                            transform, path, handler_args)

                    if checks:
                        # TODO: test that design works for internal nodes... e.g. iterating
                        # over the children and getting/checking the data, err msgs, etc.
                        _run_checks(dataframe,
                                    checks,
                                    checks_contents,
                                    node_path,
                                    rel_path,
                                    target,
                                    env=env)

                    # serialize DataFrame to file(s)
                    if not dry_run:
                        print("Saving as binary dataframe...")
                        obj_hashes = package.save_df(dataframe, node_path,
                                                     target, rel_path,
                                                     transform, metadata)

                        # Add to cache
                        cache_entry = dict(source_hash=source_hash,
                                           obj_hashes=obj_hashes)
                        with open(store.cache_path(path_hash), 'w') as entry:
                            json.dump(cache_entry, entry)
        else:  # rel_path and package are both None
            raise BuildException(
                "Leaf nodes must define either a %s or %s key" %
                (RESERVED['file'], RESERVED['package']))
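To make the ancestor_args behaviour described in the docstring concrete: the group_args step above is a plain dict copy-and-update, so child values replace ancestor values key by key. A small illustrative sketch (the key names and values are made up for illustration):

    # Values inherited from an ancestor group node.
    ancestor_args = {"transform": "csv", "kwargs": {"sep": ","}}
    # Values defined locally on the current node.
    local_args = {"kwargs": {"sep": "\t"}}

    group_args = ancestor_args.copy()
    group_args.update(local_args)  # child values override ancestor values
    # group_args is now {"transform": "csv", "kwargs": {"sep": "\t"}}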