Example #1
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       pf=None,
                       **kwargs):
        # Index columns must be read along with the data columns.
        if isinstance(index, list):
            columns += index

        if pf:
            # A ParquetFile was supplied up front: read a single row group.
            df = pf.read_row_group_file(piece,
                                        columns,
                                        categories,
                                        index=index,
                                        **kwargs.get("read", {}))
        else:
            # No ParquetFile yet: open the piece directly and rewrite each
            # column chunk's file_path to be relative to the dataset base.
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = _paths_to_cats(fns, scheme)
            pf.fn = base
            df = pf.to_pandas(columns, categories, index=index)

        return df
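
A minimal sketch of the fallback branch above, using only public fastparquet
and fsspec APIs; the file name "data.parquet" and the column list are
assumptions for illustration:

    import fsspec
    from fastparquet import ParquetFile

    fs = fsspec.filesystem("file")  # local filesystem stands in for `fs`
    pf = ParquetFile("data.parquet", open_with=fs.open)  # assumed local file
    df = pf.to_pandas(columns=["a", "b"], categories=[], index=None)
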
Example #2
def _read_pf_simple(fs, path, base, index_names, all_columns, is_series,
                    categories, cats, scheme, storage_name_mapping):
    """Read dataset with fastparquet using ParquetFile machinery"""
    from fastparquet import ParquetFile
    pf = ParquetFile(path, open_with=fs.open)
    # Rewrite each column chunk's file_path relative to the dataset base,
    # then point the ParquetFile at the base directory itself.
    relpath = path.replace(base, '').lstrip('/')
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = relpath
    pf.file_scheme = scheme
    pf.cats = cats
    pf.fn = base
    df = pf.to_pandas(all_columns, categories, index=index_names)
    # Restore user-facing index and column names from their storage names.
    if df.index.nlevels == 1:
        if index_names:
            df.index.name = storage_name_mapping.get(index_names[0],
                                                     index_names[0])
    else:
        if index_names:
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index_names]
    df.columns = [storage_name_mapping.get(col, col)
                  for col in all_columns
                  if col not in (index_names or [])]

    if is_series:
        return df[df.columns[0]]
    else:
        return df
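
The storage_name_mapping handling at the end is easier to see in isolation.
A hypothetical mapping, not taken from the example above:

    # Storage names are mapped back to user-facing names; anything
    # absent from the mapping passes through unchanged.
    storage_name_mapping = {"col_0": "price", "col_1": "qty"}
    stored_columns = ["col_0", "col_1", "extra"]
    restored = [storage_name_mapping.get(c, c) for c in stored_columns]
    assert restored == ["price", "qty", "extra"]
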
Example #3
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       pf=None,
                       **kwargs):
        if isinstance(index, list):
            columns += index

        if pf is None:
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = _paths_to_cats(fns, scheme)
            pf.fn = base
            return pf.to_pandas(columns, categories, index=index)
        else:
            if isinstance(pf, tuple):
                # `pf` arrived as a tuple of constructor arguments rather
                # than a ParquetFile; rebuild it before selecting the piece.
                pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
                pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
                pf.fmd.row_groups = None
            rg_piece = pf.row_groups[piece]
            # Drop the stored pandas metadata so it is not re-applied on read.
            pf.fmd.key_value_metadata = None
            return pf.read_row_group_file(rg_piece,
                                          columns,
                                          categories,
                                          index=index,
                                          **kwargs.get("read", {}))
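
For reference, reading a single row group through the public fastparquet API,
mirroring the read_row_group_file call above; "data.parquet" is an assumed
local file:

    from fastparquet import ParquetFile

    pf = ParquetFile("data.parquet")
    first_rg = pf.row_groups[0]
    # Read only the first row group, with all columns and no categoricals.
    df = pf.read_row_group_file(first_rg, pf.columns, [])
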
Example #4
    def read_partition(
        cls, fs, piece, columns, index, categories=(), pf=None, **kwargs
    ):

        null_index_name = False
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        if pf is None:
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            pf.fn = base
            if null_index_name and "__index_level_0__" in pf.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
            return pf.to_pandas(columns, categories, index=index)
        else:
            # `pf` may arrive as a tuple of raw arguments rather than a
            # ParquetFile; rebuild it before selecting a row group.
            if isinstance(pf, tuple):
                if isinstance(pf[0], list):
                    pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
                else:
                    pf = ParquetFile(
                        pf[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
                    )
                pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
                pf.fmd.row_groups = None
            rg_piece = pf.row_groups[piece]
            if null_index_name:
                if "__index_level_0__" in pf.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index
                    pf.fmd.key_value_metadata = None
            else:
                pf.fmd.key_value_metadata = None
            return pf.read_row_group_file(
                rg_piece, columns, categories, index=index, **kwargs.get("read", {})
            )
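
The None-labeled index case can be reproduced directly from pandas; the file
name below is an assumption:

    import pandas as pd

    # An unnamed, non-range index is stored as "__index_level_0__",
    # while a plain RangeIndex is recorded only in the pandas metadata.
    df = pd.DataFrame({"a": [1, 2]}, index=pd.Index([10, 20]))
    df.to_parquet("unnamed_index.parquet", engine="fastparquet")
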
Example #5
def _read_parquet_file(
    fs,
    base,
    fn,
    index,
    columns,
    series,
    categories,
    cs,
    dt,
    scheme,
    storage_name_mapping,
    *args
):
    """Read a single file with fastparquet, to be used in a task"""
    from fastparquet.api import ParquetFile
    from collections import OrderedDict

    # Invert the mapping so user-facing names can be translated back to
    # the names used in storage.
    name_storage_mapping = {v: k for k, v in storage_name_mapping.items()}
    if not isinstance(columns, (tuple, list)):
        # A single bare column name means the caller wants a Series back.
        columns = [columns]
        series = True
    if index:
        index, = index
        if index not in columns:
            columns = columns + [index]
    columns = [name_storage_mapping.get(col, col) for col in columns]
    index = name_storage_mapping.get(index, index)
    cs = OrderedDict([(k, v) for k, v in cs.items() if k in columns])
    pf = ParquetFile(fn, open_with=fs.open)
    pf.file_scheme = scheme
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = fn.replace(base, "").lstrip("/")
    pf.fn = base
    df = pf.to_pandas(columns=columns, index=index, categories=categories)

    if df.index.nlevels == 1:
        if index:
            df.index.name = storage_name_mapping.get(index, index)
    else:
        if index:
            df.index.names = [storage_name_mapping.get(name, name) for name in index]
    df.columns = [storage_name_mapping.get(col, col) for col in columns if col != index]

    if series:
        return df[df.columns[0]]
    else:
        return df
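
The two-way renaming in _read_parquet_file, shown in isolation with a
hypothetical mapping:

    storage_name_mapping = {"col_0": "price"}
    name_storage_mapping = {v: k for k, v in storage_name_mapping.items()}
    # User-facing names become storage names before reading...
    columns = [name_storage_mapping.get(c, c) for c in ["price", "other"]]
    assert columns == ["col_0", "other"]
    # ...and storage names are mapped back after the read.
    assert [storage_name_mapping.get(c, c) for c in columns] == ["price", "other"]
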