Example No. 1
def _read_pf_simple(fs, path, base, index_names, all_columns, is_series,
                    categories, cats, scheme, storage_name_mapping):
    """Read dataset with fastparquet using ParquetFile machinery"""
    from fastparquet import ParquetFile
    pf = ParquetFile(path, open_with=fs.open)
    relpath = path.replace(base, '').lstrip('/')
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = relpath
    pf.file_scheme = scheme
    pf.cats = cats
    pf.fn = base
    df = pf.to_pandas(all_columns, categories, index=index_names)
    if df.index.nlevels == 1:
        if index_names:
            df.index.name = storage_name_mapping.get(index_names[0],
                                                     index_names[0])
    else:
        if index_names:
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index_names]
    df.columns = [storage_name_mapping.get(col, col)
                  for col in all_columns
                  if col not in (index_names or [])]

    if is_series:
        return df[df.columns[0]]
    else:
        return df
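
The helper above mostly patches up a fastparquet ParquetFile (chunk file paths, file scheme, partition cats) before delegating to to_pandas. Below is a minimal standalone sketch of that underlying machinery, assuming a local filesystem; the file path and column names are illustrative only.

import pandas as pd
import fsspec
from fastparquet import ParquetFile, write

# Write a tiny single-file ("simple" scheme) dataset to read back.
df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
write("/tmp/example.parquet", df)

# Open it through an fsspec filesystem, as the helper does via fs.open.
fs = fsspec.filesystem("file")
pf = ParquetFile("/tmp/example.parquet", open_with=fs.open)

# Same call shape as the helper: positional columns and categories, keyword index.
out = pf.to_pandas(["a"], None, index=None)
print(out)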
Example No. 2
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       pf=None,
                       **kwargs):
        if isinstance(index, list):
            columns += index

        if pf:
            df = pf.read_row_group_file(piece,
                                        columns,
                                        categories,
                                        index=index,
                                        **kwargs.get("read", {}))
        else:
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = _paths_to_cats(fns, scheme)
            pf.fn = base
            df = pf.to_pandas(columns, categories, index=index)

        return df
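
When read_partition is handed a pre-built ParquetFile, the work reduces to read_row_group_file on a single row group. A hypothetical sketch of that call, with an illustrative path and column name:

import fsspec
from fastparquet import ParquetFile

fs = fsspec.filesystem("file")
pf = ParquetFile("/tmp/example.parquet", open_with=fs.open)  # hypothetical file

# Read just the first row group, mirroring the pf.read_row_group_file call above.
rg = pf.row_groups[0]
part = pf.read_row_group_file(rg, ["a"], [], index=None)
print(len(part), "rows in the first row group")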
Example No. 3
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       pf=None,
                       **kwargs):
        if isinstance(index, list):
            columns += index

        if pf is None:
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = _paths_to_cats(fns, scheme)
            pf.fn = base
            return pf.to_pandas(columns, categories, index=index)
        else:
            if isinstance(pf, tuple):
                pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
                pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
                pf.fmd.row_groups = None
            rg_piece = pf.row_groups[piece]
            pf.fmd.key_value_metadata = None
            return pf.read_row_group_file(rg_piece,
                                          columns,
                                          categories,
                                          index=index,
                                          **kwargs.get("read", {}))
Example No. 4
    def read_partition(
        cls, fs, piece, columns, index, categories=(), pf=None, **kwargs
    ):

        null_index_name = False
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        if pf is None:
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            pf.fn = base
            if null_index_name and "__index_level_0__" in pf.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
            return pf.to_pandas(columns, categories, index=index)
        else:
            if isinstance(pf, tuple):
                if isinstance(pf[0], list):
                    pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
                else:
                    pf = ParquetFile(
                        pf[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
                    )
                pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
                pf.fmd.row_groups = None
            rg_piece = pf.row_groups[piece]
            if null_index_name:
                if "__index_level_0__" in pf.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index
                    pf.fmd.key_value_metadata = None
            else:
                pf.fmd.key_value_metadata = None
            return pf.read_row_group_file(
                rg_piece, columns, categories, index=index, **kwargs.get("read", {})
            )
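
The None-labeled-index branch hinges on the "pandas" entry in the file's key-value metadata, which says whether the index is a RangeIndex or a stored column such as "__index_level_0__". A small sketch of inspecting that metadata (the path is hypothetical, e.g. a file written by pyarrow):

import json
from fastparquet import ParquetFile

pf = ParquetFile("/tmp/pyarrow_written.parquet")  # hypothetical file
pandas_md = [kv.value for kv in (pf.fmd.key_value_metadata or []) if kv.key == "pandas"]
if pandas_md:
    # Either a list like ['__index_level_0__'] or a RangeIndex descriptor dict.
    print(json.loads(pandas_md[0])["index_columns"])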
Example No. 5
def _read_fp_multifile(fs,
                       fs_token,
                       paths,
                       columns=None,
                       categories=None,
                       index=None):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme, join_path

    base, fns = analyse_paths(paths)
    parsed_paths = [join_path(p) for p in paths]
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (
        meta,
        _,
        index_name,
        out_type,
        all_columns,
        index_names,
        storage_name_mapping,
    ) = _pf_validation(pf, columns, index, categories, [])
    name = "read-parquet-" + tokenize(fs_token, paths, all_columns, categories)
    dsk = {(name, i): (
        _read_pf_simple,
        fs,
        path,
        base,
        index_names,
        all_columns,
        out_type == Series,
        categories,
        pf.cats,
        pf.file_scheme,
        storage_name_mapping,
    )
           for i, path in enumerate(parsed_paths)}
    divisions = (None, ) * (len(paths) + 1)
    return out_type(dsk, name, meta, divisions)
Example No. 6
def _read_fp_multifile(fs, fs_token, paths, columns=None,
                       categories=None, index=None):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme
    base, fns = analyse_paths(paths)
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (meta, _, index_name, out_type, all_columns, index_names,
     storage_name_mapping) = _pf_validation(
        pf, columns, index, categories, [])
    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns,
                                      categories)
    dsk = {(name, i): (_read_pf_simple, fs, path, base,
                       index_names, all_columns, out_type == Series,
                       categories, pf.cats,
                       pf.file_scheme, storage_name_mapping)
           for i, path in enumerate(paths)}
    divisions = (None, ) * (len(paths) + 1)
    return out_type(dsk, name, meta, divisions)
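
At the user level, this multi-file path is what backs dask.dataframe.read_parquet with the fastparquet engine. A sketch of the equivalent call, assuming a dask version where engine="fastparquet" is still selectable and an illustrative dataset path:

import dask.dataframe as dd

ddf = dd.read_parquet("/tmp/dataset/*.parquet", engine="fastparquet")
print(ddf.npartitions)  # typically one partition per data file on this code path
print(ddf.divisions)    # all None here, since no statistics are gathered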
Example No. 7
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """ Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).
    """
    parts = []
    if len(paths) > 1:
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            pf = ParquetFile(paths,
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))
        else:
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # We have a _metadata file, lets use it
                pf = ParquetFile(base + fs.sep + "_metadata",
                                 open_with=fs.open,
                                 sep=fs.sep,
                                 **kwargs.get("file", {}))
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                parts = paths.copy()
    else:
        if fs.isdir(paths[0]):
            # This is a directory, check for _metadata, then _common_metadata
            paths = fs.glob(paths[0] + fs.sep + "*")
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # Using _metadata file (best-case scenario)
                pf = ParquetFile(base + fs.sep + "_metadata",
                                 open_with=fs.open,
                                 sep=fs.sep,
                                 **kwargs.get("file", {}))
                if gather_statistics is None:
                    gather_statistics = True

            elif gather_statistics is not False:
                # Scan every file
                pf = ParquetFile(paths,
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
            else:
                # Use _common_metadata file if it is available.
                # Otherwise, just use 0th file
                if "_common_metadata" in relpaths:
                    pf = ParquetFile(base + fs.sep + "_common_metadata",
                                     open_with=fs.open,
                                     **kwargs.get("file", {}))
                else:
                    pf = ParquetFile(paths[0],
                                     open_with=fs.open,
                                     **kwargs.get("file", {}))
                scheme = get_file_scheme(fns)
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                parts = paths.copy()
        else:
            # There is only one file to read
            pf = ParquetFile(paths[0],
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))

    return parts, pf, gather_statistics
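
The branches above mostly check for a _metadata sidecar. With fastparquet's "hive" file scheme such a sidecar is written alongside the partition directories, which is the best-case scenario mentioned in the comments. A sketch, with illustrative paths:

import pandas as pd
import fsspec
from fastparquet import ParquetFile, write

# Write a small hive-partitioned dataset; this also writes _metadata
# and _common_metadata at the dataset root.
df = pd.DataFrame({"key": [0, 0, 1, 1], "val": range(4)})
write("/tmp/dataset", df, file_scheme="hive", partition_on=["key"])

fs = fsspec.filesystem("file")
relpaths = [p.split("/tmp/dataset/")[-1] for p in fs.glob("/tmp/dataset/*")]
print("_metadata" in relpaths)  # True

# Opening the sidecar gives metadata for every file, plus the partition values.
pf = ParquetFile("/tmp/dataset/_metadata", open_with=fs.open)
print(pf.cats)  # e.g. {'key': [0, 1]}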
Example No. 8
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """ Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).

    The `fast_metadata` output specifies that ParquetFile metadata parsing
    is fast enough for each worker to perform during `read_partition`. The
    value will be set to True if: (1) The path is a directory containing
    _metadata, (2) the path is a list of files containing _metadata, (3)
    there is only one file to read, or (4) `gather_statistics` is False.
    In other cases, the ParquetFile object will need to be stored in the
    task graph, because metadata parsing is too expensive.
    """
    parts = []
    fast_metadata = True
    if len(paths) > 1:
        base, fns = _analyze_paths(paths, fs)
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            if "_metadata" not in fns:
                paths_use = paths
                fast_metadata = False
            else:
                paths_use = base + fs.sep + "_metadata"
            pf = ParquetFile(paths_use,
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))
        else:
            if "_metadata" in fns:
                # We have a _metadata file, lets use it
                pf = ParquetFile(base + fs.sep + "_metadata",
                                 open_with=fs.open,
                                 sep=fs.sep,
                                 **kwargs.get("file", {}))
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                parts = paths.copy()
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        paths = fs.glob(paths[0] + fs.sep + "*")
        base, fns = _analyze_paths(paths, fs)
        if "_metadata" in fns:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(base + fs.sep + "_metadata",
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))
            if gather_statistics is None:
                gather_statistics = True

        elif gather_statistics is not False:
            # Scan every file
            pf = ParquetFile(paths,
                             open_with=fs.open,
                             **kwargs.get("file", {}))
            fast_metadata = False
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                pf = ParquetFile(base + fs.sep + "_common_metadata",
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
            else:
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            parts = paths.copy()
    else:
        # There is only one file to read
        base = None
        pf = ParquetFile(paths[0],
                         open_with=fs.open,
                         sep=fs.sep,
                         **kwargs.get("file", {}))

    return parts, pf, gather_statistics, fast_metadata, base
Example No. 9
    def read_partition(cls, fs, piece, columns, index, categories=(),
                       **kwargs):

        null_index_name = False
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        # Use global `parquet_file` object.  Need to reattach
        # the desired row_group
        parquet_file = kwargs.pop("parquet_file", None)

        if isinstance(piece, tuple):
            if isinstance(piece[0], str):
                # We have a path to read from
                assert parquet_file is None
                parquet_file = ParquetFile(piece[0],
                                           open_with=fs.open,
                                           sep=fs.sep,
                                           **kwargs.get("file", {}))
                rg_indices = piece[1] or list(
                    range(len(parquet_file.row_groups)))

                # `piece[1]` will contain row-group indices
                row_groups = [parquet_file.row_groups[rg] for rg in rg_indices]
            elif parquet_file:
                # `piece[1]` will contain actual row-group objects,
                # but they may be pickled
                row_groups = piece[0]
                if isinstance(row_groups, bytes):
                    row_groups = pickle.loads(row_groups)
                parquet_file.fmd.row_groups = row_groups
                # NOTE: May lose cats after `_set_attrs` call
                save_cats = parquet_file.cats
                parquet_file._set_attrs()
                parquet_file.cats = save_cats
            else:
                raise ValueError("Neither path nor ParquetFile detected!")

            if null_index_name:
                if "__index_level_0__" in parquet_file.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index

            parquet_file._dtypes = lambda *args: parquet_file.dtypes  # ugly patch, could be fixed

            # Read necessary row-groups and concatenate
            dfs = []
            for row_group in row_groups:
                dfs.append(
                    parquet_file.read_row_group_file(
                        row_group,
                        columns,
                        categories,
                        index=index,
                        **kwargs.get("read", {}),
                    ))
            return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

        else:
            # `piece` is NOT a tuple
            raise ValueError(f"Expected tuple, got {type(piece)}")
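
The per-row-group loop at the end boils down to reading each row group into a frame and concatenating. A stripped-down sketch of that pattern, with plain pandas standing in for dask's concat (illustrative path, all columns):

import pandas as pd
from fastparquet import ParquetFile

pf = ParquetFile("/tmp/example.parquet")  # hypothetical multi-row-group file
dfs = [
    pf.read_row_group_file(rg, pf.columns, [], index=None)
    for rg in pf.row_groups
]
out = pd.concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]
print(len(out))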
Example No. 10
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).
    """
    parts = []
    if len(paths) > 1:
        paths, base, fns = _sort_and_analyze_paths(paths, fs)
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            if "_metadata" not in fns:
                paths_use = paths
            else:
                paths_use = base + fs.sep + "_metadata"
            pf = ParquetFile(paths_use,
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))
        else:
            if "_metadata" in fns:
                # We have a _metadata file, lets use it
                pf = ParquetFile(
                    base + fs.sep + "_metadata",
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {}),
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                parts = paths.copy()
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        paths = fs.glob(paths[0] + fs.sep + "*")
        paths, base, fns = _sort_and_analyze_paths(paths, fs)
        if "_metadata" in fns:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(
                base + fs.sep + "_metadata",
                open_with=fs.open,
                sep=fs.sep,
                **kwargs.get("file", {}),
            )
            if gather_statistics is None:
                gather_statistics = True

        elif gather_statistics is not False:
            # Scan every file
            pf = ParquetFile(paths,
                             open_with=fs.open,
                             **kwargs.get("file", {}))
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                pf = ParquetFile(
                    base + fs.sep + "_common_metadata",
                    open_with=fs.open,
                    **kwargs.get("file", {}),
                )
                fns.remove("_common_metadata")
            else:
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            parts = [fs.sep.join([base, fn]) for fn in fns]
    else:
        # There is only one file to read
        base = None
        pf = ParquetFile(paths[0],
                         open_with=fs.open,
                         sep=fs.sep,
                         **kwargs.get("file", {}))

    # Ensure that there is no overlap between partition columns
    # and explicit columns in `pf`
    if pf.cats:
        _partitions = [p for p in pf.cats if p not in pf.columns]
        if not _partitions:
            pf.cats = {}
        elif len(_partitions) != len(pf.cats):
            raise ValueError("No partition-columns should be written in the \n"
                             "file unless they are ALL written in the file.\n"
                             "columns: {} | partitions: {}".format(
                                 pf.columns, pf.cats.keys()))

    return parts, pf, gather_statistics, base
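
The final all-or-none check on partition columns can be exercised in isolation with toy values, no file access needed:

# Toy values mimicking pf.columns and pf.cats above.
pf_columns = ["val"]
pf_cats = {"key": [0, 1]}

_partitions = [p for p in pf_cats if p not in pf_columns]
if not _partitions:
    # Every partition column is also written in the files: drop the cats.
    pf_cats = {}
elif len(_partitions) != len(pf_cats):
    # Only some partition columns are written in the files: ambiguous, refuse.
    raise ValueError("partition columns must be written in all files or none")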
Example No. 11
    def read_metadata(fs,
                      paths,
                      categories=None,
                      index=None,
                      gather_statistics=None,
                      filters=None,
                      **kwargs):
        if len(paths) > 1:
            if gather_statistics is not False:
                # this scans all the files, allowing index/divisions
                # and filtering
                pf = fastparquet.ParquetFile(paths,
                                             open_with=fs.open,
                                             sep=fs.sep,
                                             **kwargs.get("file", {}))
            else:
                base, fns = analyse_paths(paths)
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                relpath = paths[0].replace(base, "").lstrip("/")
                for rg in pf.row_groups:
                    rg.cats = pf.cats
                    rg.schema = pf.schema
                    for ch in rg.columns:
                        ch.file_path = relpath
        else:
            try:
                pf = fastparquet.ParquetFile(paths[0] + fs.sep + "_metadata",
                                             open_with=fs.open,
                                             sep=fs.sep,
                                             **kwargs.get("file", {}))
                if gather_statistics is None:
                    gather_statistics = True
            except Exception:
                pf = fastparquet.ParquetFile(paths[0],
                                             open_with=fs.open,
                                             sep=fs.sep,
                                             **kwargs.get("file", {}))

        columns = None
        if pf.fmd.key_value_metadata:
            pandas_md = [
                x.value for x in pf.fmd.key_value_metadata if x.key == "pandas"
            ]
        else:
            pandas_md = []

        if len(pandas_md) == 0:
            index_names = []
            column_names = pf.columns + list(pf.cats)
            storage_name_mapping = {k: k for k in column_names}
            column_index_names = [None]

        elif len(pandas_md) == 1:
            (
                index_names,
                column_names,
                storage_name_mapping,
                column_index_names,
            ) = _parse_pandas_metadata(json.loads(pandas_md[0]))
            #  auto-ranges should not be created by fastparquet
            index_names = [n for n in index_names if n is not None]
            column_names.extend(pf.cats)

        else:
            raise ValueError("File has multiple entries for 'pandas' metadata")

        if index is None and len(index_names) > 0:
            if len(index_names) == 1:
                index = index_names[0]
            else:
                index = index_names

        # Normalize user inputs
        column_names, index_names = _normalize_index_columns(
            columns, column_names, index, index_names)

        all_columns = index_names + column_names

        categories_dict = None
        if isinstance(categories, dict):
            categories_dict = categories

        if categories is None:
            categories = pf.categories
        elif isinstance(categories, string_types):
            categories = [categories]
        else:
            categories = list(categories)

        # Check that categories are included in columns
        if categories and not set(categories).intersection(all_columns):
            raise ValueError("categories not in available columns.\n"
                             "categories: {} | columns: {}".format(
                                 categories, list(all_columns)))

        dtypes = pf._dtypes(categories)
        dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

        index_cols = index or ()
        meta = _meta_from_dtypes(all_columns, dtypes, index_cols,
                                 column_index_names)

        # fastparquet doesn't handle multiindex
        if len(index_names) > 1:
            raise ValueError("Cannot read DataFrame with MultiIndex.")

        for cat in categories:
            if cat in meta:
                meta[cat] = pd.Series(
                    pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                    index=meta.index,
                )

        for catcol in pf.cats:
            if catcol in meta.columns:
                meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
            elif meta.index.name == catcol:
                meta.index = meta.index.set_categories(pf.cats[catcol])

        if gather_statistics and pf.row_groups:
            stats = []
            if filters is None:
                filters = []
            # make statistics conform in layout
            for (i, row_group) in enumerate(pf.row_groups):
                s = {"num-rows": row_group.num_rows, "columns": []}
                for col in pf.columns:
                    d = {"name": col}
                    if pf.statistics["min"][col][0] is not None:
                        cs_min = pf.statistics["min"][col][i]
                        cs_max = pf.statistics["max"][col][i]
                        if isinstance(cs_min, np.datetime64):
                            cs_min = pd.Timestamp(cs_min)
                            cs_max = pd.Timestamp(cs_max)
                        d.update({
                            "min": cs_min,
                            "max": cs_max,
                            "null_count": pf.statistics["null_count"][col][i],
                        })
                    s["columns"].append(d)
                # Need this to filter out partitioned-on categorical columns
                s["filter"] = fastparquet.api.filter_out_cats(
                    row_group, filters)
                stats.append(s)

        else:
            stats = None

        pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
        pf.fmd.row_groups = None

        # Create `parts` (list of row-group-descriptor dicts)
        parts = [{
            "piece": rg,
            "kwargs": {
                "pf": pf,
                "categories": categories_dict or categories
            },
        } for rg in pf.row_groups]

        return (meta, stats, parts)
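
The stats block above reshapes ParquetFile.statistics, which holds one value per row group for each column. A short sketch of the raw structure it reads from (hypothetical file with a column "a"):

from fastparquet import ParquetFile

pf = ParquetFile("/tmp/example.parquet")  # hypothetical file
print(pf.statistics["min"]["a"])          # one min per row group (None if absent)
print(pf.statistics["null_count"]["a"])   # one null count per row group
print(pf.dtypes)                          # column -> dtype mapping behind `meta`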
Example No. 12
    def read_partition(
        cls,
        fs,
        pieces,
        columns,
        index,
        categories=(),
        root_cats=None,
        root_file_scheme=None,
        base_path=None,
        **kwargs,
    ):

        null_index_name = False
        base_path = False if not root_cats else base_path
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        # Use global `parquet_file` object.  Need to reattach
        # the desired row_group
        parquet_file = kwargs.pop("parquet_file", None)

        # Always convert pieces to list
        if not isinstance(pieces, list):
            pieces = [pieces]

        sample = pieces[0]
        if isinstance(sample, tuple):
            if isinstance(sample[0], str):
                # We have paths to read from
                assert parquet_file is None

                row_groups = []
                rg_offset = 0
                parquet_file = ParquetFile(
                    [p[0] for p in pieces],
                    open_with=fs.open,
                    root=base_path or False,
                    **kwargs.get("file", {}),
                )
                for piece in pieces:
                    _pf = (parquet_file if len(pieces) == 1 else ParquetFile(
                        piece[0],
                        open_with=fs.open,
                        root=base_path or False,
                        **kwargs.get("file", {}),
                    ))
                    n_local_row_groups = len(_pf.row_groups)
                    local_rg_indices = piece[1] or list(
                        range(n_local_row_groups))
                    row_groups += [
                        parquet_file.row_groups[rg + rg_offset]
                        for rg in local_rg_indices
                    ]
                    rg_offset += n_local_row_groups
                update_parquet_file = len(row_groups) < len(
                    parquet_file.row_groups)

            elif parquet_file:

                row_groups = []
                for piece in pieces:
                    # `piece[1]` will contain actual row-group objects,
                    # but they may be pickled
                    rgs = piece[0]
                    if isinstance(rgs, bytes):
                        rgs = pickle.loads(rgs)
                    row_groups += rgs
                update_parquet_file = True

            else:
                raise ValueError("Neither path nor ParquetFile detected!")

            if update_parquet_file:
                with _FP_FILE_LOCK:
                    parquet_file.fmd.row_groups = row_groups
                    # NOTE: May lose cats after `_set_attrs` call
                    save_cats = parquet_file.cats
                    parquet_file._set_attrs()
                    parquet_file.cats = save_cats

            if null_index_name:
                if "__index_level_0__" in parquet_file.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index

            # Update hive-partitioning information if necessary
            parquet_file.cats = root_cats or {}
            if root_cats:
                parquet_file.file_scheme = root_file_scheme

            parquet_file._dtypes = lambda *args: parquet_file.dtypes  # ugly patch, could be fixed

            if set(columns).issubset(parquet_file.columns +
                                     list(parquet_file.cats.keys())):
                # Convert ParquetFile to pandas
                return parquet_file.to_pandas(
                    columns=columns,
                    categories=categories,
                    index=index,
                )
            else:
                # Read necessary row-groups and concatenate
                dfs = []
                for row_group in row_groups:
                    dfs.append(
                        parquet_file.read_row_group_file(
                            row_group,
                            columns,
                            categories,
                            index=index,
                            **kwargs.get("read", {}),
                        ))
                return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

        else:
            # `sample` is NOT a tuple
            raise ValueError(f"Expected tuple, got {type(sample)}")
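
The fast path above hands the whole job to ParquetFile.to_pandas once every requested column, including hive-partition columns, is available. A sketch against a hypothetical hive-partitioned directory (such as one written with file_scheme="hive"):

from fastparquet import ParquetFile

pf = ParquetFile("/tmp/dataset")  # hypothetical hive-partitioned directory
df = pf.to_pandas(columns=["val", "key"], index=None)
print(df.dtypes)  # the partition column "key" comes back as a categorical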
Example No. 13
    def _collect_file_parts(
        cls,
        pf_or_files,
        dataset_info_kwargs,
    ):

        # Collect necessary information from dataset_info
        fs = dataset_info_kwargs["fs"]
        split_row_groups = dataset_info_kwargs["split_row_groups"]
        gather_statistics = dataset_info_kwargs["gather_statistics"]
        stat_col_indices = dataset_info_kwargs["stat_col_indices"]
        filters = dataset_info_kwargs["filters"]
        dtypes = dataset_info_kwargs["dtypes"]
        chunksize = dataset_info_kwargs["chunksize"]
        aggregation_depth = dataset_info_kwargs["aggregation_depth"]
        base_path = dataset_info_kwargs.get("base_path", None)
        root_cats = dataset_info_kwargs.get("root_cats", None)
        root_file_scheme = dataset_info_kwargs.get("root_file_scheme", None)
        has_metadata_file = dataset_info_kwargs["has_metadata_file"]

        # Get ParquetFile
        if not isinstance(pf_or_files, fastparquet.api.ParquetFile):
            # Construct local `ParquetFile` object
            pf = ParquetFile(
                pf_or_files,
                open_with=fs.open,
                root=base_path,
            )
            # Update hive-partitioning to match global cats/scheme
            pf.cats = root_cats or {}
            if root_cats:
                pf.file_scheme = root_file_scheme
        else:
            # We already have a ParquetFile object to work with
            pf = pf_or_files

        # Organize row-groups by file
        (
            file_row_groups,
            file_row_group_stats,
            file_row_group_column_stats,
            gather_statistics,
            base_path,
        ) = cls._organize_row_groups(
            pf,
            split_row_groups,
            gather_statistics,
            stat_col_indices,
            filters,
            dtypes,
            base_path,
            has_metadata_file,
            chunksize,
            aggregation_depth,
        )

        # Convert organized row-groups to parts
        parts, stats = _row_groups_to_parts(
            gather_statistics,
            split_row_groups,
            aggregation_depth,
            file_row_groups,
            file_row_group_stats,
            file_row_group_column_stats,
            stat_col_indices,
            cls._make_part,
            make_part_kwargs={
                "fs": fs,
                "pf": pf,
                "base_path": base_path,
                "partitions": pf.info.get("partitions", None),
            },
        )

        return parts, stats
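
The make_part_kwargs above lean on ParquetFile.info for partition information and on the row-group list that gets split into parts. A quick inspection sketch (illustrative path):

from fastparquet import ParquetFile

pf = ParquetFile("/tmp/dataset")      # hypothetical dataset
print(pf.info.get("partitions"))      # partition columns, if any
print(len(pf.row_groups))             # row groups to be grouped into output parts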
Example No. 14
    def _collect_dataset_info(
            cls,
            paths,
            fs,
            categories,
            index,
            gather_statistics,
            filters,
            split_row_groups,
            chunksize,
            aggregate_files,
            ignore_metadata_file,
            metadata_task_size,
            require_extension=(".parq", ".parquet"),
            **kwargs,
    ):

        # Define the parquet-file (pf) object to use for metadata,
        # Also, initialize `parts`.  If `parts` is populated here,
        # then each part will correspond to a file.  Otherwise, each part will
        # correspond to a row group (populated later).
        #
        # This logic is mostly to handle `gather_statistics=False` cases,
        # because this also means we should avoid scanning every file in the
        # dataset.  If _metadata is available, set `gather_statistics=True`
        # (if `gather_statistics=None`).

        parts = []
        _metadata_exists = False
        if len(paths) == 1 and fs.isdir(paths[0]):

            # This is a directory.
            # Check if _metadata and/or _common_metadata files exists
            base = paths[0]
            _metadata_exists = True
            if not ignore_metadata_file:
                _metadata_exists = fs.isfile(fs.sep.join([base, "_metadata"]))

            # Find all files if we are not using a _metadata file
            if ignore_metadata_file or not _metadata_exists:
                # For now, we need to discover every file under paths[0]
                paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs)
                _update_paths = False
                for fn in ["_metadata", "_common_metadata"]:
                    try:
                        fns.remove(fn)
                        _update_paths = True
                    except ValueError:
                        pass
                if _update_paths:
                    paths = [fs.sep.join([base, fn]) for fn in fns]
                _metadata_exists = False
            if _metadata_exists:
                # Using _metadata file (best-case scenario)
                pf = ParquetFile(
                    fs.sep.join([base, "_metadata"]),
                    open_with=fs.open,
                    **kwargs,
                )
                if gather_statistics is None:
                    gather_statistics = True
            else:
                # Use 0th file
                # Note that "_common_metadata" can cause issues for
                # partitioned datasets.
                if require_extension:
                    # Raise error if all files have been filtered by extension
                    len0 = len(paths)
                    paths = [
                        path for path in paths
                        if path.endswith(require_extension)
                    ]
                    if len0 and paths == []:
                        raise ValueError(
                            "No files satisfy the `require_extension` criteria "
                            f"(files must end with {require_extension}).")
                pf = ParquetFile(paths[:1],
                                 open_with=fs.open,
                                 root=base,
                                 **kwargs)
                scheme = get_file_scheme(fns)
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                if not gather_statistics:
                    parts = [fs.sep.join([base, fn]) for fn in fns]
        else:
            # This is a list of files
            paths, base, fns = _sort_and_analyze_paths(paths, fs)

            # Check if _metadata is in paths, and
            # remove it if ignore_metadata_file=True
            _metadata_exists = "_metadata" in fns
            if _metadata_exists and ignore_metadata_file:
                fns.remove("_metadata")
                _metadata_exists = False
            paths = [fs.sep.join([base, fn]) for fn in fns]

            if _metadata_exists:
                # We have a _metadata file, lets use it
                pf = ParquetFile(
                    fs.sep.join([base, "_metadata"]),
                    open_with=fs.open,
                    **kwargs,
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[:1],
                                 open_with=fs.open,
                                 root=base,
                                 **kwargs)
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                if not gather_statistics:
                    parts = paths.copy()

        # Check the `aggregate_files` setting
        aggregation_depth = _get_aggregation_depth(
            aggregate_files,
            list(pf.cats),
        )

        # Ensure that there is no overlap between partition columns
        # and explicit columns in `pf`
        if pf.cats:
            _partitions = [p for p in pf.cats if p not in pf.columns]
            if not _partitions:
                pf.cats = {}
            elif len(_partitions) != len(pf.cats):
                raise ValueError(
                    "No partition-columns should be written in the \n"
                    "file unless they are ALL written in the file.\n"
                    "columns: {} | partitions: {}".format(
                        pf.columns, pf.cats.keys()))

        return {
            "pf": pf,
            "paths": paths,
            "has_metadata_file": _metadata_exists,
            "parts": parts,
            "base": base,
            "fs": fs,
            "gather_statistics": gather_statistics,
            "categories": categories,
            "index": index,
            "filters": filters,
            "split_row_groups": split_row_groups,
            "chunksize": chunksize,
            "aggregate_files": aggregate_files,
            "aggregation_depth": aggregation_depth,
            "metadata_task_size": metadata_task_size,
            "kwargs": kwargs,
        }
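
At the user level, the ignore_metadata_file handling above corresponds to a read_parquet keyword. A sketch, assuming a dask version that exposes ignore_metadata_file and an illustrative dataset path:

import dask.dataframe as dd

ddf = dd.read_parquet(
    "/tmp/dataset",               # hypothetical directory
    engine="fastparquet",
    ignore_metadata_file=True,    # skip _metadata and discover the data files directly
)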