Example #1
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       pf=None,
                       **kwargs):
        if isinstance(index, list):
            columns += index

        if pf is None:
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = _paths_to_cats(fns, scheme)
            pf.fn = base
            return pf.to_pandas(columns, categories, index=index)
        else:
            if isinstance(pf, tuple):
                pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
                pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
                pf.fmd.row_groups = None
            rg_piece = pf.row_groups[piece]
            pf.fmd.key_value_metadata = None
            return pf.read_row_group_file(rg_piece,
                                          columns,
                                          categories,
                                          index=index,
                                          **kwargs.get("read", {}))
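
A usage sketch (not part of the example above): read_partition is an engine hook that dask calls internally, so in practice it is exercised through dask.dataframe.read_parquet with the fastparquet engine. A minimal sketch, assuming dask and fastparquet are installed and that the dask version in use still ships the fastparquet engine; "example.parquet" is a hypothetical path.

    import pandas as pd
    import dask.dataframe as dd

    # Write a small parquet file with the fastparquet engine ...
    pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}).to_parquet(
        "example.parquet", engine="fastparquet"
    )

    # ... and read it back through dask, which dispatches to read_partition
    # for each partition under the hood.
    ddf = dd.read_parquet("example.parquet", engine="fastparquet")
    print(ddf.compute())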
Example #2
    def read_partition(
        cls, fs, piece, columns, index, categories=(), pf=None, **kwargs
    ):

        null_index_name = False
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        if pf is None:
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            pf.fn = base
            if null_index_name and "__index_level_0__" in pf.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
            return pf.to_pandas(columns, categories, index=index)
        else:
            if isinstance(pf, tuple):
                if isinstance(pf[0], list):
                    pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
                else:
                    pf = ParquetFile(
                        pf[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
                    )
                pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
                pf.fmd.row_groups = None
            rg_piece = pf.row_groups[piece]
            if null_index_name:
                if "__index_level_0__" in pf.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index
                    pf.fmd.key_value_metadata = None
            else:
                pf.fmd.key_value_metadata = None
            return pf.read_row_group_file(
                rg_piece, columns, categories, index=index, **kwargs.get("read", {})
            )
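
The None-labeled-index branch above looks for a column named "__index_level_0__". A small sketch to inspect that case directly (illustrative, not part of the dask engine); depending on the fastparquet version, an unnamed non-range index may or may not be serialized under that name, which is why the code checks pf.columns first.

    import pandas as pd
    from fastparquet import ParquetFile

    # Unnamed, non-default index; the pandas metadata labels it None.
    df = pd.DataFrame({"a": [1, 2, 3]}, index=[10, 20, 30])
    df.to_parquet("unnamed_index.parquet", engine="fastparquet")

    pf = ParquetFile("unnamed_index.parquet")
    print(pf.columns)                 # may include "__index_level_0__"
    print(pf.fmd.key_value_metadata)  # the "pandas" entry records index columns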
Example #3
    def read_partition(cls, fs, piece, columns, index, categories=(),
                       **kwargs):

        null_index_name = False
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        # Use global `parquet_file` object.  Need to reattach
        # the desired row_group
        parquet_file = kwargs.pop("parquet_file", None)

        if isinstance(piece, tuple):
            if isinstance(piece[0], str):
                # We have a path to read from
                assert parquet_file is None
                parquet_file = ParquetFile(piece[0],
                                           open_with=fs.open,
                                           sep=fs.sep,
                                           **kwargs.get("file", {}))
                rg_indices = piece[1] or list(
                    range(len(parquet_file.row_groups)))

                # `piece[1]` will contain row-group indices
                row_groups = [parquet_file.row_groups[rg] for rg in rg_indices]
            elif parquet_file:
                # `piece[1]` will contain actual row-group objects,
                # but they may be pickled
                row_groups = piece[0]
                if isinstance(row_groups, bytes):
                    row_groups = pickle.loads(row_groups)
                parquet_file.fmd.row_groups = row_groups
                # NOTE: May lose cats after `_set_attrs` call
                save_cats = parquet_file.cats
                parquet_file._set_attrs()
                parquet_file.cats = save_cats
            else:
                raise ValueError("Neither path nor ParquetFile detected!")

            if null_index_name:
                if "__index_level_0__" in parquet_file.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index

            parquet_file._dtypes = lambda *args: parquet_file.dtypes  # ugly patch, could be fixed

            # Read necessary row-groups and concatenate
            dfs = []
            for row_group in row_groups:
                dfs.append(
                    parquet_file.read_row_group_file(
                        row_group,
                        columns,
                        categories,
                        index=index,
                        **kwargs.get("read", {}),
                    ))
            return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

        else:
            # `piece` is NOT a tuple
            raise ValueError(f"Expected tuple, got {type(piece)}")
Example #4
    def read_metadata(fs,
                      paths,
                      categories=None,
                      index=None,
                      gather_statistics=None,
                      filters=None,
                      **kwargs):
        if len(paths) > 1:
            if gather_statistics is not False:
                # this scans all the files, allowing index/divisions
                # and filtering
                pf = fastparquet.ParquetFile(paths,
                                             open_with=fs.open,
                                             sep=fs.sep,
                                             **kwargs.get("file", {}))
            else:
                base, fns = analyse_paths(paths)
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                # pf was opened from paths[0]; record its path relative to base
                relpath = paths[0].replace(base, "").lstrip("/")
                for rg in pf.row_groups:
                    rg.cats = pf.cats
                    rg.schema = pf.schema
                    for ch in rg.columns:
                        ch.file_path = relpath
        else:
            try:
                pf = fastparquet.ParquetFile(paths[0] + fs.sep + "_metadata",
                                             open_with=fs.open,
                                             sep=fs.sep,
                                             **kwargs.get("file", {}))
                if gather_statistics is None:
                    gather_statistics = True
            except Exception:
                pf = fastparquet.ParquetFile(paths[0],
                                             open_with=fs.open,
                                             sep=fs.sep,
                                             **kwargs.get("file", {}))

        columns = None
        if pf.fmd.key_value_metadata:
            pandas_md = [
                x.value for x in pf.fmd.key_value_metadata if x.key == "pandas"
            ]
        else:
            pandas_md = []

        if len(pandas_md) == 0:
            index_names = []
            column_names = pf.columns + list(pf.cats)
            storage_name_mapping = {k: k for k in column_names}
            column_index_names = [None]

        elif len(pandas_md) == 1:
            (
                index_names,
                column_names,
                storage_name_mapping,
                column_index_names,
            ) = _parse_pandas_metadata(json.loads(pandas_md[0]))
            # Drop `None` index names (auto-range indexes are not created by fastparquet)
            index_names = [n for n in index_names if n is not None]
            column_names.extend(pf.cats)

        else:
            raise ValueError("File has multiple entries for 'pandas' metadata")

        if index is None and len(index_names) > 0:
            if len(index_names) == 1:
                index = index_names[0]
            else:
                index = index_names

        # Normalize user inputs
        column_names, index_names = _normalize_index_columns(
            columns, column_names, index, index_names)

        all_columns = index_names + column_names

        categories_dict = None
        if isinstance(categories, dict):
            categories_dict = categories

        if categories is None:
            categories = pf.categories
        elif isinstance(categories, string_types):
            categories = [categories]
        else:
            categories = list(categories)

        # Check that categories are included in columns
        if categories and not set(categories).intersection(all_columns):
            raise ValueError("categories not in available columns.\n"
                             "categories: {} | columns: {}".format(
                                 categories, list(all_columns)))

        dtypes = pf._dtypes(categories)
        dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

        index_cols = index or ()
        meta = _meta_from_dtypes(all_columns, dtypes, index_cols,
                                 column_index_names)

        # fastparquet doesn't handle multiindex
        if len(index_names) > 1:
            raise ValueError("Cannot read DataFrame with MultiIndex.")

        for cat in categories:
            if cat in meta:
                meta[cat] = pd.Series(
                    pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                    index=meta.index,
                )

        for catcol in pf.cats:
            if catcol in meta.columns:
                meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
            elif meta.index.name == catcol:
                meta.index = meta.index.set_categories(pf.cats[catcol])

        if gather_statistics and pf.row_groups:
            stats = []
            if filters is None:
                filters = []
            # make statistics conform in layout
            for (i, row_group) in enumerate(pf.row_groups):
                s = {"num-rows": row_group.num_rows, "columns": []}
                for col in pf.columns:
                    d = {"name": col}
                    if pf.statistics["min"][col][0] is not None:
                        cs_min = pf.statistics["min"][col][i]
                        cs_max = pf.statistics["max"][col][i]
                        if isinstance(cs_min, np.datetime64):
                            cs_min = pd.Timestamp(cs_min)
                            cs_max = pd.Timestamp(cs_max)
                        d.update({
                            "min": cs_min,
                            "max": cs_max,
                            "null_count": pf.statistics["null_count"][col][i],
                        })
                    s["columns"].append(d)
                # Need this to filter out partitioned-on categorical columns
                s["filter"] = fastparquet.api.filter_out_cats(
                    row_group, filters)
                stats.append(s)

        else:
            stats = None

        pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
        pf.fmd.row_groups = None

        # Create `parts` (list of row-group-descriptor dicts)
        parts = [{
            "piece": rg,
            "kwargs": {
                "pf": pf,
                "categories": categories_dict or categories
            },
        } for rg in pf.row_groups]

        return (meta, stats, parts)
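
The statistics block above relies on fastparquet's per-row-group statistics layout, pf.statistics[stat][column][row_group_index]. A short sketch to inspect that layout directly (illustrative only; file name and sizes are assumptions).

    import pandas as pd
    import fastparquet
    from fastparquet import ParquetFile

    df = pd.DataFrame({"a": range(10), "b": list("abcdefghij")})
    fastparquet.write("stats_demo.parquet", df, row_group_offsets=5)

    pf = ParquetFile("stats_demo.parquet")
    for i, rg in enumerate(pf.row_groups):
        for col in pf.columns:
            # Same lookups the read_metadata example performs per row group.
            print(i, col,
                  pf.statistics["min"][col][i],
                  pf.statistics["max"][col][i],
                  pf.statistics["null_count"][col][i])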
Example #5
    def read_partition(
        cls,
        fs,
        pieces,
        columns,
        index,
        categories=(),
        root_cats=None,
        root_file_scheme=None,
        base_path=None,
        **kwargs,
    ):

        null_index_name = False
        base_path = False if not root_cats else base_path
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        # Use global `parquet_file` object.  Need to reattach
        # the desired row_group
        parquet_file = kwargs.pop("parquet_file", None)

        # Always convert pieces to list
        if not isinstance(pieces, list):
            pieces = [pieces]

        sample = pieces[0]
        if isinstance(sample, tuple):
            if isinstance(sample[0], str):
                # We have paths to read from
                assert parquet_file is None

                row_groups = []
                rg_offset = 0
                parquet_file = ParquetFile(
                    [p[0] for p in pieces],
                    open_with=fs.open,
                    root=base_path or False,
                    **kwargs.get("file", {}),
                )
                for piece in pieces:
                    _pf = (
                        parquet_file
                        if len(pieces) == 1
                        else ParquetFile(
                            piece[0],
                            open_with=fs.open,
                            root=base_path or False,
                            **kwargs.get("file", {}),
                        )
                    )
                    n_local_row_groups = len(_pf.row_groups)
                    local_rg_indices = piece[1] or list(
                        range(n_local_row_groups))
                    row_groups += [
                        parquet_file.row_groups[rg + rg_offset]
                        for rg in local_rg_indices
                    ]
                    rg_offset += n_local_row_groups
                update_parquet_file = len(row_groups) < len(
                    parquet_file.row_groups)

            elif parquet_file:

                row_groups = []
                for piece in pieces:
                    # `piece[1]` will contain actual row-group objects,
                    # but they may be pickled
                    rgs = piece[0]
                    if isinstance(rgs, bytes):
                        rgs = pickle.loads(rgs)
                    row_groups += rgs
                update_parquet_file = True

            else:
                raise ValueError("Neither path nor ParquetFile detected!")

            if update_parquet_file:
                with _FP_FILE_LOCK:
                    parquet_file.fmd.row_groups = row_groups
                    # NOTE: May lose cats after `_set_attrs` call
                    save_cats = parquet_file.cats
                    parquet_file._set_attrs()
                    parquet_file.cats = save_cats

            if null_index_name:
                if "__index_level_0__" in parquet_file.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index

            # Update hive-partitioning information if necessary
            parquet_file.cats = root_cats or {}
            if root_cats:
                parquet_file.file_scheme = root_file_scheme

            parquet_file._dtypes = lambda *args: parquet_file.dtypes  # ugly patch, could be fixed

            if set(columns).issubset(parquet_file.columns +
                                     list(parquet_file.cats.keys())):
                # Convert ParquetFile to pandas
                return parquet_file.to_pandas(
                    columns=columns,
                    categories=categories,
                    index=index,
                )
            else:
                # Read necessary row-groups and concatenate
                dfs = []
                for row_group in row_groups:
                    dfs.append(
                        parquet_file.read_row_group_file(
                            row_group,
                            columns,
                            categories,
                            index=index,
                            **kwargs.get("read", {}),
                        ))
                return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

        else:
            # `sample` is NOT a tuple
            raise ValueError(f"Expected tuple, got {type(sample)}")