Example #1
    def read_parquet(self, path, columns=None, metadata=None, schema=None,
                     nthreads=1, use_pandas_metadata=False):
        """
        Read Parquet data from path in file system. Can read from a single file
        or a directory of files

        Parameters
        ----------
        path : str
            Single file path or directory
        columns : List[str], optional
            Subset of columns to read
        metadata : pyarrow.parquet.FileMetaData
            Known metadata to validate files against
        schema : pyarrow.parquet.Schema
            Known schema to validate files against. Alternative to metadata
            argument
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that the
            underlying file source is threadsafe
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded

        Returns
        -------
        table : pyarrow.Table
        """
        from pyarrow.parquet import ParquetDataset
        dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                                 filesystem=self)
        return dataset.read(columns=columns, nthreads=nthreads,
                            use_pandas_metadata=use_pandas_metadata)
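The method above is a thin wrapper over ParquetDataset. A minimal sketch of the equivalent direct call, using only the public pyarrow.parquet API (the directory path and column names are hypothetical):

from pyarrow.parquet import ParquetDataset

# Hypothetical directory of Parquet files; columns=... limits the read to a
# subset, and use_pandas_metadata=True also loads any stored index columns.
dataset = ParquetDataset("/tmp/example_parquet_dir")
table = dataset.read(columns=["a", "b"], use_pandas_metadata=True)
print(table.num_rows, table.schema.names)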
Example #2
File: filesystem.py Project: rok/arrow
    def read_parquet(self, path, columns=None, metadata=None, schema=None,
                     use_threads=True, use_pandas_metadata=False):
        """
        Read Parquet data from path in file system. Can read from a single file
        or a directory of files

        Parameters
        ----------
        path : str
            Single file path or directory
        columns : List[str], optional
            Subset of columns to read
        metadata : pyarrow.parquet.FileMetaData
            Known metadata to validate files against
        schema : pyarrow.parquet.Schema
            Known schema to validate files against. Alternative to metadata
            argument
        use_threads : boolean, default True
            Perform multi-threaded column reads
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded

        Returns
        -------
        table : pyarrow.Table
        """
        from pyarrow.parquet import ParquetDataset
        dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                                 filesystem=self)
        return dataset.read(columns=columns, use_threads=use_threads,
                            use_pandas_metadata=use_pandas_metadata)
Example #3
File: filesystem.py Project: quinnj/arrow
    def read_parquet(self,
                     path,
                     columns=None,
                     metadata=None,
                     schema=None,
                     use_threads=True,
                     use_pandas_metadata=False):
        """
        Read Parquet data from path in file system. Can read from a single file
        or a directory of files.

        Parameters
        ----------
        path : str
            Single file path or directory
        columns : List[str], optional
            Subset of columns to read.
        metadata : pyarrow.parquet.FileMetaData
            Known metadata to validate files against.
        schema : pyarrow.parquet.Schema
            Known schema to validate files against. Alternative to metadata
            argument.
        use_threads : bool, default True
            Perform multi-threaded column reads.
        use_pandas_metadata : bool, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded.

        Returns
        -------
        table : pyarrow.Table
        """
        from pyarrow.parquet import ParquetDataset
        dataset = ParquetDataset(path,
                                 schema=schema,
                                 metadata=metadata,
                                 filesystem=self)
        return dataset.read(columns=columns,
                            use_threads=use_threads,
                            use_pandas_metadata=use_pandas_metadata)
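A hedged usage sketch of the wrapper itself, assuming an older pyarrow release that still ships the legacy pyarrow.filesystem module these examples come from (the path and column names are hypothetical):

# Sketch only: the legacy pyarrow.filesystem module is deprecated and absent
# from recent pyarrow releases; the path and columns below are hypothetical.
from pyarrow.filesystem import LocalFileSystem

fs = LocalFileSystem()
table = fs.read_parquet("/tmp/example_parquet_dir",
                        columns=["a", "b"],
                        use_threads=True,
                        use_pandas_metadata=True)
df = table.to_pandas()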
Example #4
async def test_local_arrow_storage_provider(
        tmp_path: Path, test_values: dt_test_values) -> None:
    test_table, visit_ids = test_values
    structured_provider = LocalArrowProvider(tmp_path)
    await structured_provider.init()
    for table_name, test_data in test_table.items():
        await structured_provider.store_record(TableName(table_name),
                                               test_data["visit_id"],
                                               test_data)
    token_list = []
    for i in visit_ids:
        token_list.append(await structured_provider.finalize_visit_id(i))
    await structured_provider.flush_cache()
    await asyncio.gather(*token_list)
    for table_name, test_data in test_table.items():
        dataset = ParquetDataset(tmp_path / table_name)
        df: DataFrame = dataset.read().to_pandas()
        assert df.shape[0] == 1
        for row in df.itertuples(index=False):
            if test_data["visit_id"] == INVALID_VISIT_ID:
                del test_data["visit_id"]
            assert row._asdict() == test_data
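The assertions above rely on a simple round trip: records are flushed to one Parquet directory per table and read back through ParquetDataset. A self-contained sketch of that round trip, with a made-up table name and record:

import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.parquet import ParquetDataset

with tempfile.TemporaryDirectory() as tmp:
    table_dir = os.path.join(tmp, "site_visits")  # hypothetical table name
    os.makedirs(table_dir)
    record = pa.table({"visit_id": [1], "site_url": ["http://example.com"]})
    pq.write_table(record, os.path.join(table_dir, "part-0.parquet"))

    df = ParquetDataset(table_dir).read().to_pandas()
    assert df.shape[0] == 1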
Example #5
    def read(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a DataFrame.
           Ray DataFrame only supports pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            engine: Ray only support pyarrow reader.
                    This argument doesn't do anything for now.
            kwargs: Pass into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetFile, ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if os.path.isdir(path):
            partitioned_columns = set()
            directory = True
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas("Mixed Partitioning Columns in Parquet")
                return cls.single_worker_read(
                    path, engine=engine, columns=columns, **kwargs
                )
        else:
            directory = False
        if not columns:
            if directory:
                # Path of the sample file that we will read to get the remaining columns
                pd = ParquetDataset(path)
                column_names = pd.schema.names
            else:
                pf = ParquetFile(path)
                column_names = pf.metadata.schema.names
            columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
        return cls.build_query_compiler(path, columns, **kwargs)
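The directory walk described in the comments above is easy to lift out on its own. A minimal sketch (the function name is mine, not Modin's): each level of a hive-partitioned directory contributes one "column=value" path component, so collecting the text before "=" yields the partition column names.

import os

def detect_partition_columns(path):
    # Walk the directory tree; partitioned parquet layouts add one
    # "column=value" directory per level, so the first child directory at
    # each level tells us the partition column for that level.
    partitioned_columns = set()
    for root, dir_names, files in os.walk(path):
        if dir_names:
            partitioned_columns.add(dir_names[0].split("=")[0])
        if files:
            if files[0][0] == ".":  # skip metadata/hidden files such as .DS_Store
                continue
            break
    return list(partitioned_columns)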
Example #6
    def preprocess_data(self):
        from table_utils import filter_outliers, sample
        self.shard = None
        with S3() as s3:
            from pyarrow.parquet import ParquetDataset
            if self.input:
                objs = s3.get_many(self.input)
                table = ParquetDataset([obj.path for obj in objs]).read()
                table = sample(filter_outliers(table, FIELDS), self.sample)
                self.shard = {
                    field: table[field].to_numpy()
                    for field in FIELDS
                }
        self.next(self.join)
Example #7
    def preprocess_data(self):
        with S3() as s3:
            from pyarrow.parquet import ParquetDataset
            if self.input:
                objs = s3.get_many(self.input)
                orig_table = ParquetDataset([obj.path for obj in objs]).read()
                self.num_rows_before = orig_table.num_rows
                table = process_data(orig_table)
                self.num_rows_after = table.num_rows
                print('selected %d/%d rows'
                      % (self.num_rows_after, self.num_rows_before))
                self.lat = table['pickup_latitude'].to_numpy()
                self.lon = table['pickup_longitude'].to_numpy()
        self.next(self.join)
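Both Metaflow steps above fetch objects from S3 and then hand a list of local file paths to ParquetDataset, which reads them as a single table. A local stand-in with hypothetical shard paths:

from pyarrow.parquet import ParquetDataset

# Hypothetical shard files downloaded earlier; ParquetDataset accepts a list
# of paths and concatenates them into one pyarrow.Table on read().
paths = ["/tmp/shard-0.parquet", "/tmp/shard-1.parquet"]
table = ParquetDataset(paths).read()
print("combined rows:", table.num_rows)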
Example #8
def _build_node(build_dir,
                package,
                node_path,
                node,
                checks_contents=None,
                dry_run=False,
                env='default',
                ancestor_args={}):
    """
    Parameters
    ----------
    ancestor_args : dict
      Any transform inherited from an ancestor, plus any inherited handler
      kwargs. Users can thus define kwargs that affect entire subtrees
      (e.g. transform: csv for 500 .txt files) and can override ancestor or
      peer values; a child's transform or kwargs override ancestor k:v pairs.
    """
    if _is_internal_node(node):
        if not dry_run:
            package.save_group(node_path, None)

        # Make a consumable copy.  This is to cover a quirk introduced by accepting nodes named
        # like RESERVED keys -- if a RESERVED key is actually matched, it should be removed from
        # the node, or it gets treated like a subnode (or like a node with invalid content)
        node = node.copy()

        # NOTE: YAML parsing does not guarantee key order
        # fetch local transform and kwargs values; we do it using ifs
        # to prevent `key: None` from polluting the update
        local_args = _get_local_args(
            node, [RESERVED['transform'], RESERVED['kwargs']])
        group_args = ancestor_args.copy()
        group_args.update(local_args)
        _consume(node, local_args)

        # if it's not a reserved word it's a group that we can descend
        groups = {k: v for k, v in iteritems(node) if _is_valid_group(v)}
        _consume(node, groups)

        if node:
            # Unused keys -- either keyword typos or node names with invalid values.
            #   For now, until build.yml schemas, pointing out one should do.
            key, value = node.popitem()
            raise BuildException(
                "Invalid syntax: expected node data for {!r}, got {!r}".format(
                    key, value))
        for child_name, child_table in groups.items():
            if glob.has_magic(child_name):
                # child_name is a glob string, use it to generate multiple child nodes
                for gchild_name, gchild_table in _gen_glob_data(
                        build_dir, child_name, child_table):
                    _build_node(build_dir,
                                package,
                                node_path + [gchild_name],
                                gchild_table,
                                checks_contents=checks_contents,
                                dry_run=dry_run,
                                env=env,
                                ancestor_args=group_args)
            else:
                if not isinstance(child_name,
                                  str) or not is_nodename(child_name):
                    raise StoreException("Invalid node name: %r" % child_name)
                _build_node(build_dir,
                            package,
                            node_path + [child_name],
                            child_table,
                            checks_contents=checks_contents,
                            dry_run=dry_run,
                            env=env,
                            ancestor_args=group_args)
    else:  # leaf node
        # prevent overwriting existing node names
        if '/'.join(node_path) in package:
            raise BuildException(
                "Naming conflict: {!r} added to package more than once".format(
                    '/'.join(node_path)))
        # handle group leaf nodes (empty groups)
        if not node:
            if not dry_run:
                package.save_group(node_path, None)
            return

        include_package = node.get(RESERVED['package'])
        rel_path = node.get(RESERVED['file'])
        if rel_path and include_package:
            raise BuildException(
                "A node must define only one of {0} or {1}".format(
                    RESERVED['file'], RESERVED['package']))
        elif include_package:  # package composition
            team, user, pkgname, subpath = parse_package(include_package,
                                                         allow_subpath=True)
            existing_pkg = PackageStore.find_package(team, user, pkgname)
            if existing_pkg is None:
                raise BuildException("Package not found: %s" % include_package)

            if subpath:
                try:
                    node = existing_pkg["/".join(subpath)]
                except KeyError:
                    msg = "Package {team}:{owner}/{pkg} has no subpackage: {subpath}"
                    raise BuildException(
                        msg.format(team=team,
                                   owner=user,
                                   pkg=pkgname,
                                   subpath=subpath))
            else:
                node = GroupNode(existing_pkg.get_contents().children)
            package.save_package_tree(node_path, node)
        elif rel_path:  # handle nodes built from input files
            path = os.path.join(build_dir, rel_path)

            rel_meta_path = node.get(RESERVED['meta'])
            if rel_meta_path:
                with open(os.path.join(build_dir, rel_meta_path)) as fd:
                    try:
                        metadata = json.load(fd)
                    except ValueError as ex:
                        raise BuildException("Failed to parse %r as JSON: %s" %
                                             (rel_meta_path, ex))
                    if SYSTEM_METADATA in metadata:
                        raise BuildException(
                            "Invalid metadata in %r: not allowed to use key %r"
                            % (rel_meta_path, SYSTEM_METADATA))
            else:
                metadata = None

            # get either the locally defined transform and target or inherit from an ancestor
            transform = node.get(RESERVED['transform']) or ancestor_args.get(
                RESERVED['transform'])

            ID = 'id'  # pylint:disable=C0103
            PARQUET = 'parquet'  # pylint:disable=C0103
            if transform:
                transform = transform.lower()
                if transform in PANDAS_PARSERS:
                    target = TargetType.PANDAS
                elif transform == PARQUET:
                    target = TargetType.PANDAS
                elif transform == ID:
                    target = TargetType.FILE
                else:
                    raise BuildException("Unknown transform '%s' for %s" %
                                         (transform, rel_path))
            else:
                # Guess transform and target based on file extension if not provided
                _, ext = splitext_no_dot(rel_path)

                if ext in PANDAS_PARSERS:
                    transform = ext
                    target = TargetType.PANDAS
                elif ext == PARQUET:
                    transform = ext
                    target = TargetType.PANDAS
                else:
                    transform = ID
                    target = TargetType.FILE
                print("Inferring 'transform: %s' for %s" %
                      (transform, rel_path))

            # TODO: parse/check environments:
            # environments = node.get(RESERVED['environments'])
            checks = node.get(RESERVED['checks'])
            if transform == ID:
                #TODO move this to a separate function
                if checks:
                    with open(path, 'r') as fd:
                        data = fd.read()
                        _run_checks(data,
                                    checks,
                                    checks_contents,
                                    node_path,
                                    rel_path,
                                    target,
                                    env=env)
                if not dry_run:
                    print("Registering %s..." % path)
                    package.save_file(path, node_path, target, rel_path,
                                      transform, metadata)
            elif transform == PARQUET:
                if checks:
                    from pyarrow.parquet import ParquetDataset
                    dataset = ParquetDataset(path)
                    table = dataset.read(nthreads=4)
                    dataframe = table.to_pandas()
                    _run_checks(dataframe,
                                checks,
                                checks_contents,
                                node_path,
                                rel_path,
                                target,
                                env=env)
                if not dry_run:
                    print("Registering %s..." % path)
                    package.save_file(path, node_path, target, rel_path,
                                      transform, metadata)
            else:
                # copy so we don't modify shared ancestor_args
                handler_args = dict(ancestor_args.get(RESERVED['kwargs'], {}))
                # local kwargs win the update
                handler_args.update(node.get(RESERVED['kwargs'], {}))
                # Check Cache
                store = PackageStore()
                path_hash = _path_hash(path, transform, handler_args)
                source_hash = digest_file(path)

                cachedobjs = []
                if os.path.exists(store.cache_path(path_hash)):
                    with open(store.cache_path(path_hash), 'r') as entry:
                        cache_entry = json.load(entry)
                        if cache_entry['source_hash'] == source_hash:
                            cachedobjs = cache_entry['obj_hashes']
                            assert isinstance(cachedobjs, list)

                # TODO: check for changes in checks else use cache
                # below is a heavy-handed fix but it's OK for check builds to be slow
                if not checks and cachedobjs and all(
                        os.path.exists(store.object_path(obj))
                        for obj in cachedobjs):
                    # Use existing objects instead of rebuilding
                    package.save_cached_df(cachedobjs, node_path, target,
                                           rel_path, transform, metadata)
                else:
                    # read source file into DataFrame
                    print("Serializing %s..." % path)
                    if _have_pyspark():
                        dataframe = _file_to_spark_data_frame(
                            transform, path, handler_args)
                    else:
                        dataframe = _file_to_data_frame(
                            transform, path, handler_args)

                    if checks:
                        # TODO: test that design works for internal nodes... e.g. iterating
                        # over the children and getting/checking the data, err msgs, etc.
                        _run_checks(dataframe,
                                    checks,
                                    checks_contents,
                                    node_path,
                                    rel_path,
                                    target,
                                    env=env)

                    # serialize DataFrame to file(s)
                    if not dry_run:
                        print("Saving as binary dataframe...")
                        obj_hashes = package.save_df(dataframe, node_path,
                                                     target, rel_path,
                                                     transform, metadata)

                        # Add to cache
                        cache_entry = dict(source_hash=source_hash,
                                           obj_hashes=obj_hashes)
                        with open(store.cache_path(path_hash), 'w') as entry:
                            json.dump(cache_entry, entry)
        else:  # rel_path and package are both None
            raise BuildException(
                "Leaf nodes must define either a %s or %s key" %
                (RESERVED['file'], RESERVED['package']))
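The build cache near the end of the example keys cached object hashes on a hash of the source file. A minimal standalone sketch of that check (function and file names are mine, not Quilt's):

import hashlib
import json
import os

def digest_file(path, chunk_size=1 << 20):
    # Hash the source file contents in chunks, as the source_hash above does.
    h = hashlib.sha256()
    with open(path, "rb") as fd:
        for chunk in iter(lambda: fd.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

def load_cached_objects(cache_path, source_hash):
    # Reuse previously built objects only if the stored source hash matches.
    if not os.path.exists(cache_path):
        return []
    with open(cache_path) as entry:
        cache_entry = json.load(entry)
    if cache_entry["source_hash"] == source_hash:
        return cache_entry["obj_hashes"]
    return []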
Example #9
File: io.py Project: zuoxiaolei/modin
    def read_parquet(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a DataFrame.
           Ray DataFrame only supports the pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            engine: Ray only supports the pyarrow reader.
                    This argument doesn't do anything for now.
            columns: The columns to read from the file(s).
            kwargs: Passed into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """

        from pyarrow.parquet import ParquetFile, ParquetDataset

        if cls.read_parquet_remote_task is None:
            return super(RayIO, cls).read_parquet(path, engine, columns,
                                                  **kwargs)

        file_path = path
        if os.path.isdir(path):
            directory = True
            partitioned_columns = set()
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    file_path = os.path.join(root, files[0])
                    break
            partitioned_columns = list(partitioned_columns)
        else:
            directory = False

        if not columns:
            if directory:
                # Path of the sample file that we will read to get the remaining
                # columns.
                from pyarrow import ArrowIOError

                try:
                    pd = ParquetDataset(file_path)
                except ArrowIOError:
                    pd = ParquetDataset(path)
                column_names = pd.schema.names
            else:
                pf = ParquetFile(path)
                column_names = pf.metadata.schema.names
            columns = [
                name for name in column_names if not PQ_INDEX_REGEX.match(name)
            ]

        # Cannot read in parquet file by only reading in the partitioned column.
        # Thus, we have to remove the partition columns from the columns to
        # ensure that when we do the math for the blocks, the partition column
        # will be read in along with a non partition column.
        if columns and directory and any(col in partitioned_columns
                                         for col in columns):
            columns = [
                col for col in columns if col not in partitioned_columns
            ]
            # If all of the columns wanted are partition columns, return an
            # empty dataframe with the desired columns.
            if len(columns) == 0:
                return cls.query_compiler_cls.from_pandas(
                    pandas.DataFrame(columns=partitioned_columns),
                    block_partitions_cls=cls.frame_mgr_cls,
                )

        num_partitions = cls.frame_mgr_cls._compute_num_partitions()
        num_splits = min(len(columns), num_partitions)
        # Each item in this list will be a list of column names of the original df
        column_splits = (len(columns) // num_partitions if len(columns) %
                         num_partitions == 0 else
                         len(columns) // num_partitions + 1)
        col_partitions = [
            columns[i:i + column_splits]
            for i in range(0, len(columns), column_splits)
        ]
        column_widths = [len(c) for c in col_partitions]
        # Each item in this list will be a list of columns of original df
        # partitioned to smaller pieces along rows.
        # We need to transpose the oids array to fit our schema.
        # TODO (williamma12): This part can be parallelized even more if we
        # separate the partitioned parquet file code path from the default one.
        # The workers return multiple objects for each part of the file read:
        # - The first n - 2 objects are partitions of data
        # - The n - 1 object is the length of the partition.
        # - The nth object is the dtypes of the partition. We combine these to
        #   form the final dtypes below.
        blk_partitions = np.array([
            cls.read_parquet_remote_task._remote(
                args=(path, cols + partitioned_columns, num_splits, kwargs),
                num_return_vals=num_splits + 2,
            ) if directory and cols == col_partitions[len(col_partitions) - 1]
            else cls.read_parquet_remote_task._remote(
                args=(path, cols, num_splits, kwargs),
                num_return_vals=num_splits + 2,
            ) for cols in col_partitions
        ]).T
        # Metadata
        index_len = ray.get(blk_partitions[-2][0])
        index = pandas.RangeIndex(index_len)
        index_chunksize = compute_chunksize(pandas.DataFrame(index=index),
                                            num_splits,
                                            axis=0)
        if index_chunksize > index_len:
            row_lengths = [index_len] + [0 for _ in range(num_splits - 1)]
        else:
            row_lengths = [
                index_chunksize if i != num_splits - 1 else index_len -
                (index_chunksize * (num_splits - 1)) for i in range(num_splits)
            ]
        # Compute dtypes concatenating the results from each of the columns splits
        # determined above. This creates a pandas Series that contains a dtype for every
        # column.
        dtypes_ids = list(blk_partitions[-1])
        dtypes = pandas.concat(ray.get(dtypes_ids), axis=0)

        blk_partitions = blk_partitions[:-2]
        remote_partitions = np.array([[
            cls.frame_partition_cls(
                blk_partitions[i][j],
                length=row_lengths[i],
                width=column_widths[j],
            ) for j in range(len(blk_partitions[i]))
        ] for i in range(len(blk_partitions))])
        if directory:
            columns += partitioned_columns
        dtypes.index = columns
        new_query_compiler = cls.query_compiler_cls(
            cls.frame_mgr_cls(remote_partitions),
            index,
            columns,
            dtypes=dtypes)

        return new_query_compiler
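The column-chunking arithmetic above, isolated into a runnable snippet (the column list and partition count are illustrative):

# Split a list of columns into roughly equal slices, one slice per partition.
columns = ["col%d" % i for i in range(10)]
num_partitions = 4
column_splits = (len(columns) // num_partitions
                 if len(columns) % num_partitions == 0
                 else len(columns) // num_partitions + 1)
col_partitions = [columns[i:i + column_splits]
                  for i in range(0, len(columns), column_splits)]
print(col_partitions)   # 4 slices of sizes 3, 3, 3 and 1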
Example #10
    def _read(cls, path, engine, columns, **kwargs):
        """
        Load a parquet object from the file path, returning a query compiler.

        Parameters
        ----------
        path : str, path object or file-like object
            The filepath of the parquet file in local filesystem or hdfs.
        engine : str
            Parquet library to use (only 'PyArrow' is supported for now).
        columns : list
            If not None, only these columns will be read from the file.
        **kwargs : dict
            Keyword arguments.

        Returns
        -------
        BaseQueryCompiler
            A new Query Compiler.

        Notes
        -----
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if isinstance(path, str) and os.path.isdir(path):
            partitioned_columns = set()
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas("Mixed Partitioning Columns in Parquet")
                return cls.single_worker_read(
                    path, engine=engine, columns=columns, **kwargs
                )

        if not columns:
            import fsspec.core
            from pandas.io.common import is_fsspec_url

            fs, path_ = (
                fsspec.core.url_to_fs(path, **(kwargs.get("storage_options") or {}))
                if is_fsspec_url(path)
                else (None, path)
            )

            dataset = ParquetDataset(path_, filesystem=fs, use_legacy_dataset=False)
            column_names = dataset.schema.names

            if dataset.schema.pandas_metadata is not None:
                index_columns = dataset.schema.pandas_metadata.get("index_columns", [])
                column_names = [c for c in column_names if c not in index_columns]
            columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
        return cls.build_query_compiler(path, columns, **kwargs)
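A hedged sketch of the schema inspection above, assuming a recent pyarrow where ParquetDataset.schema is a pyarrow.Schema exposing pandas_metadata (the path is hypothetical):

from pyarrow.parquet import ParquetDataset

dataset = ParquetDataset("/tmp/example_parquet_dir")   # hypothetical path
column_names = dataset.schema.names
pandas_meta = dataset.schema.pandas_metadata           # None unless written by pandas
if pandas_meta is not None:
    index_columns = pandas_meta.get("index_columns", [])
    column_names = [c for c in column_names if c not in index_columns]
print(column_names)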
Example #11
    def _read(cls, path, engine, columns, **kwargs):
        """
        Load a parquet object from the file path, returning a query compiler.

        Parameters
        ----------
        path : str, path object or file-like object
            The filepath of the parquet file in local filesystem or hdfs.
        engine : str
            Parquet library to use (only 'PyArrow' is supported for now).
        columns : list
            If not None, only these columns will be read from the file.
        **kwargs : dict
            Keyword arguments.

        Returns
        -------
        BaseQueryCompiler
            A new Query Compiler.

        Notes
        -----
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetFile, ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if isinstance(path, str) and os.path.isdir(path):
            partitioned_columns = set()
            directory = True
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas(
                    "Mixed Partitioning Columns in Parquet")
                return cls.single_worker_read(path,
                                              engine=engine,
                                              columns=columns,
                                              **kwargs)
        else:
            directory = False
        if not columns:
            import s3fs

            if directory:
                # Path of the sample file that we will read to get the remaining columns
                pd = ParquetDataset(path)
                meta = pd.metadata
                column_names = pd.schema.to_arrow_schema().names
            elif isinstance(path, str) and path.startswith("hdfs://"):
                import fsspec.core

                fs, path = fsspec.core.url_to_fs(path)
                pd = ParquetDataset(path, filesystem=fs)
                meta = pd.metadata
                column_names = pd.schema.to_arrow_schema().names
            elif isinstance(path,
                            s3fs.S3File) or (isinstance(path, str)
                                             and path.startswith("s3://")):
                from botocore.exceptions import NoCredentialsError

                if isinstance(path, s3fs.S3File):
                    bucket_path = path.url().split(".s3.amazonaws.com")
                    path = "s3://" + bucket_path[0].split(
                        "://")[1] + bucket_path[1]
                try:
                    fs = s3fs.S3FileSystem()
                    pd = ParquetDataset(path, filesystem=fs)
                except NoCredentialsError:
                    fs = s3fs.S3FileSystem(anon=True)
                    pd = ParquetDataset(path, filesystem=fs)
                meta = pd.metadata
                column_names = pd.schema.to_arrow_schema().names
            else:
                meta = ParquetFile(path).metadata
                column_names = meta.schema.to_arrow_schema().names

            if meta is not None and meta.metadata is not None:
                pandas_metadata = meta.metadata.get(b"pandas", None)
                if pandas_metadata is not None:
                    import json

                    # This is how we convert the metadata from pyarrow to a python
                    # dictionary, from which we then get the index columns.
                    # We use these to filter out from the columns in the metadata since
                    # the pyarrow storage has no concept of row labels/index.
                    # This ensures that our metadata lines up with the partitions without
                    # extra communication steps once we have done all the remote
                    # computation.
                    index_columns = json.loads(
                        pandas_metadata.decode("utf8")).get(
                            "index_columns", [])
                    column_names = [
                        c for c in column_names if c not in index_columns
                    ]
            columns = [
                name for name in column_names if not PQ_INDEX_REGEX.match(name)
            ]
        return cls.build_query_compiler(path, columns, **kwargs)
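The S3 branch above falls back to anonymous access when no credentials are configured. The same pattern isolated (requires s3fs and botocore; the bucket URL is hypothetical):

import s3fs
from botocore.exceptions import NoCredentialsError
from pyarrow.parquet import ParquetDataset

def open_s3_dataset(url):
    # Try an authenticated S3 filesystem first; retry anonymously if the
    # environment has no AWS credentials.
    try:
        fs = s3fs.S3FileSystem()
        return ParquetDataset(url, filesystem=fs)
    except NoCredentialsError:
        fs = s3fs.S3FileSystem(anon=True)
        return ParquetDataset(url, filesystem=fs)

# dataset = open_s3_dataset("s3://example-bucket/path/to/data")  # hypothetical URL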
Example #12
    def read(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a Modin DataFrame.
           Modin only supports the pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            engine: Modin only supports the pyarrow reader.
                    This argument doesn't do anything for now.
            columns: The columns to read from the file(s).
            kwargs: Passed into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetFile, ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if os.path.isdir(path):
            partitioned_columns = set()
            directory = True
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas(
                    "Mixed Partitioning Columns in Parquet")
                return cls.single_worker_read(path,
                                              engine=engine,
                                              columns=columns,
                                              **kwargs)
        else:
            directory = False
        if not columns:
            if directory:
                # Path of the sample file that we will read to get the remaining columns
                pd = ParquetDataset(path)
                meta = pd.metadata
                column_names = pd.schema.names
            else:
                meta = ParquetFile(path).metadata
                column_names = meta.schema.names
            if meta is not None:
                # This is how we convert the metadata from pyarrow to a python
                # dictionary, from which we then get the index columns.
                # We use these to filter out from the columns in the metadata since
                # the pyarrow storage has no concept of row labels/index.
                # This ensures that our metadata lines up with the partitions without
                # extra communication steps once we have done all the remote
                # computation.
                index_columns = eval(meta.metadata[b"pandas"].replace(
                    b"null", b"None")).get("index_columns", [])
                column_names = [
                    c for c in column_names if c not in index_columns
                ]
            columns = [
                name for name in column_names if not PQ_INDEX_REGEX.match(name)
            ]
        return cls.build_query_compiler(path, columns, **kwargs)
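The eval() on the b"pandas" metadata above is fragile; the value is JSON, so it can be decoded with json.loads as the other Modin examples do. A small sketch with a made-up metadata dict standing in for meta.metadata:

import json

# Illustrative stand-in for FileMetaData.metadata; the value under b"pandas"
# is a JSON document describing columns and index columns.
metadata = {b"pandas": b'{"index_columns": ["__index_level_0__"], "columns": []}'}
pandas_meta = json.loads(metadata[b"pandas"].decode("utf8"))
index_columns = pandas_meta.get("index_columns", [])
print(index_columns)   # ['__index_level_0__']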
Example #13
def read_parquet(path, columns, **kwargs):
    from legate.core import Rect

    from .runtime import _runtime as rt

    path = util.to_list_if_scalar(path)

    if len(path) == 1 and os.path.isdir(path[0]):
        from pyarrow.parquet import ParquetDataset

        ds = ParquetDataset(path)
        path = [piece.path for piece in ds.pieces]
    else:
        from pyarrow.parquet import ParquetFile

        ds = ParquetFile(path[0])
        if rt.debug:
            assert all(ParquetFile(p).schema == ds.schema for p in path)

    dedup_names = set()
    for name in ds.schema.names:
        if name in dedup_names:
            raise ValueError(
                "Duplicate column names in schema are not supported.")
        dedup_names.add(name)

    schema = ds.schema.to_arrow_schema()
    index_descs = []
    index_materialized = False
    if str.encode("pandas") in ds.metadata.metadata:
        import json

        pandas_metadata = json.loads(
            ds.metadata.metadata[str.encode("pandas")])
        index_descs = pandas_metadata["index_columns"]
        index_materialized = len(index_descs) > 0 and all(
            isinstance(desc, str) for desc in index_descs)

    if columns is None:
        column_names = schema.names
    elif index_materialized:
        column_names = columns + index_descs
    else:
        column_names = columns

    for name in column_names:
        if name not in dedup_names:
            raise ValueError("Field named %s not found in the schema." % name)
    schema = [schema.field(name) for name in column_names]
    del columns

    storage = rt.create_output_storage()
    offsets_storage = None

    columns = []
    for column_info in schema:
        dtype = ty.to_legate_dtype(column_info.type)
        column = storage.create_column(dtype)
        if ty.is_string_dtype(dtype):
            if offsets_storage is None:
                offsets_storage = rt.create_output_storage()
            offsets_column = offsets_storage.create_column(ty.int32,
                                                           nullable=False)
            chars_storage = rt.create_output_storage()
            char_column = chars_storage.create_column(ty.int8, nullable=False)
            column.add_child(offsets_column)
            column.add_child(char_column)
            column = column.as_string_column()
        columns.append(column)

    plan = Map(rt, OpCode.READ_PARQUET)
    plan.add_scalar_arg(len(path), ty.uint32)
    for f in path:
        plan.add_scalar_arg(f, ty.string)
    plan.add_scalar_arg(len(column_names), ty.uint32)
    for name in column_names:
        plan.add_scalar_arg(name, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column in columns:
        column.add_to_plan_output_only(plan)
    counts = plan.execute(Rect([rt.num_pieces]))
    storage = plan.promote_output_storage(storage)
    rt.register_external_weighted_partition(storage.default_ipart, counts)
    del plan

    size = counts.cast(ty.int64).sum()

    if index_materialized:
        to_filter = set(index_descs)

        index_columns = []
        value_columns = []
        value_column_names = []
        for idx, name in enumerate(column_names):
            if name in to_filter:
                index_columns.append(columns[idx])
            else:
                value_columns.append(columns[idx])
                value_column_names.append(column_names[idx])

        sanitized_names = [
            None if name == f"__index_level_{level}__" else name
            for level, name in enumerate(index_descs)
        ]
        index = create_index_from_columns(index_columns, size, sanitized_names)
    else:
        value_columns = columns
        value_column_names = column_names
        if len(index_descs) > 0:
            assert len(index_descs) == 1
            index_desc = index_descs[0]
            name = index_desc["name"]
            start = rt.create_future(index_desc["start"], ty.int64)
            stop = rt.create_future(index_desc["stop"], ty.int64)
            step = rt.create_future(index_desc["step"], ty.int64)
            index = create_range_index(storage, size, name, start, stop, step)
        else:
            index = create_range_index(storage, size)

    from pandas import Index

    return {
        "frame": Table(rt, index, value_columns),
        "columns": Index(value_column_names),
    }
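The index_materialized branching above hinges on how pandas records its index in the Parquet metadata: a materialized index is listed as column names (strings), while a RangeIndex is stored as a descriptor dict. A small illustrative sketch (both metadata shapes below are made up for the example):

def is_materialized(index_descs):
    # Mirrors the check above: an index is materialized only if every
    # descriptor is a plain column name.
    return len(index_descs) > 0 and all(isinstance(d, str) for d in index_descs)

materialized = ["__index_level_0__"]
range_descriptor = [{"kind": "range", "name": None,
                     "start": 0, "stop": 100, "step": 1}]

print(is_materialized(materialized))      # True  -> rebuild index from columns
print(is_materialized(range_descriptor))  # False -> recreate a RangeIndex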