예제 #1
0
def _read_parquet_row_group(fs, fn, index, columns, rg, series, categories,
                            schema, cs, dt, scheme, storage_name_mapping, *args):
    """Read a single parquet row group through fastparquet.

    Column and index names are translated to their storage names (the
    inverse of ``storage_name_mapping``) before reading and translated back
    on the result.

    Parameters
    ----------
    fs : object
        Its ``open`` attribute is forwarded to ``read_row_group_file``.
    fn : object
        First argument of ``read_row_group_file``; presumably the path of
        the file holding the row group (confirm against the caller).
    index : one-element sequence, or a falsy value
        When truthy it is unpacked as exactly one index name, which is read
        in addition to ``columns``.
    columns : list, tuple, or a single column name
        Columns to read.  Anything that is not a list or tuple is treated
        as one column name and forces ``series`` to true.
    rg : object
        Row group; ``rg.num_rows`` sizes the pre-allocated frame.
    series : bool
        If true, only the first column of the result is returned.
    categories, schema, cs, dt, scheme
        Forwarded unchanged to the fastparquet helpers.
    storage_name_mapping : dict
        Maps storage names to the names exposed on the result.
    *args
        Ignored.

    Returns
    -------
    object
        The frame allocated by ``_pre_allocate`` once filled, or its first
        column when ``series`` is true.

    Raises
    ------
    ValueError
        If ``index`` is truthy but does not hold exactly one element.
    """
    from fastparquet.api import _pre_allocate
    from fastparquet.core import read_row_group_file

    name_storage_mapping = {v: k for k, v in storage_name_mapping.items()}
    if isinstance(columns, (tuple, list)):
        # Normalise to a list: a tuple of columns would otherwise make the
        # ``columns + [index]`` concatenation below raise TypeError.
        columns = list(columns)
    else:
        columns = [columns]
        series = True
    if index:
        index, = index
        if index not in columns:
            columns = columns + [index]

    # Switch to storage names for the actual read.
    columns = [name_storage_mapping.get(col, col) for col in columns]
    index = name_storage_mapping.get(index, index)

    df, views = _pre_allocate(rg.num_rows, columns, categories, index, cs, dt)
    read_row_group_file(fn, rg, columns, categories, schema, cs,
                        open=fs.open, assign=views, scheme=scheme)

    # Translate storage names back to the names exposed to the caller.
    if df.index.nlevels == 1:
        if index:
            df.index.name = storage_name_mapping.get(index, index)
    else:
        if index:
            # NOTE(review): ``index`` is a single name at this point, so
            # iterating over it looks suspicious -- kept as in the original.
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index]
    df.columns = [storage_name_mapping.get(col, col)
                  for col in columns
                  if col != index]

    if series:
        return df[df.columns[0]]
    else:
        return df
예제 #2
0
파일: parquet.py 프로젝트: fortizc/dask
def _read_parquet_row_group(open, fn, index, columns, rg, series, categories,
                            schema, cs, dt, scheme, storage_name_mapping, *args):
    """Read a single parquet row group through fastparquet.

    Column and index names are translated to their storage names (the
    inverse of ``storage_name_mapping``) before reading and translated back
    on the result.

    Parameters
    ----------
    open : callable
        Forwarded to ``read_row_group_file`` as its ``open`` argument.
    fn : object
        First argument of ``read_row_group_file``; presumably the path of
        the file holding the row group (confirm against the caller).
    index : one-element sequence, or a falsy value
        When truthy it is unpacked as exactly one index name, which is read
        in addition to ``columns``.
    columns : list, tuple, or a single column name
        Columns to read.  Anything that is not a list or tuple is treated
        as one column name and forces ``series`` to true.
    rg : object
        Row group; ``rg.num_rows`` sizes the pre-allocated frame.
    series : bool
        If true, only the first column of the result is returned.
    categories, schema, cs, dt, scheme
        Forwarded unchanged to the fastparquet helpers.
    storage_name_mapping : dict
        Maps storage names to the names exposed on the result.
    *args
        Ignored.

    Returns
    -------
    object
        The frame allocated by ``_pre_allocate`` once filled, or its first
        column when ``series`` is true.

    Raises
    ------
    ValueError
        If ``index`` is truthy but does not hold exactly one element.
    """
    from fastparquet.api import _pre_allocate
    from fastparquet.core import read_row_group_file

    name_storage_mapping = {v: k for k, v in storage_name_mapping.items()}
    if isinstance(columns, (tuple, list)):
        # Normalise to a list: a tuple of columns would otherwise make the
        # ``columns + [index]`` concatenation below raise TypeError.
        columns = list(columns)
    else:
        columns = [columns]
        series = True
    if index:
        index, = index
        if index not in columns:
            columns = columns + [index]

    # Switch to storage names for the actual read.
    columns = [name_storage_mapping.get(col, col) for col in columns]
    index = name_storage_mapping.get(index, index)

    df, views = _pre_allocate(rg.num_rows, columns, categories, index, cs, dt)
    read_row_group_file(fn, rg, columns, categories, schema, cs,
                        open=open, assign=views, scheme=scheme)

    # Translate storage names back to the names exposed to the caller.
    if df.index.nlevels == 1:
        if index:
            df.index.name = storage_name_mapping.get(index, index)
    else:
        if index:
            # NOTE(review): ``index`` is a single name at this point, so
            # iterating over it looks suspicious -- kept as in the original.
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index]
    df.columns = [storage_name_mapping.get(col, col)
                  for col in columns
                  if col != index]

    if series:
        return df[df.columns[0]]
    else:
        return df
예제 #3
0
def _read_parquet_row_group(open, fn, index, columns, rg, series, categories,
                            schema, cs, dt, *args):
    """Read a single parquet row group and return it as a frame or a column.

    ``columns`` may be a list, a tuple, or a single column name; a single
    name is wrapped in a tuple and forces ``series`` to true.  A truthy
    ``index`` that is not already among ``columns`` is appended to them,
    keeping the container type of ``columns``.  ``rg.num_rows`` sizes the
    frame allocated by ``_pre_allocate``, which ``read_row_group_file``
    then fills through the returned views.

    Returns the first column of the frame when ``series`` is true, otherwise
    the frame itself.  Extra positional arguments are ignored.
    """
    single_column = not isinstance(columns, (tuple, list))
    if single_column:
        series = True
        columns = (columns,)

    if index:
        if index not in columns:
            # Build the extra element with the same container type so the
            # concatenation works for both lists and tuples.
            index_part = type(columns)([index])
            columns = columns + index_part

    frame, targets = _pre_allocate(rg.num_rows, columns, categories, index,
                                   cs, dt)
    read_row_group_file(fn, rg, columns, categories, schema, cs,
                        open=open, assign=targets)

    return frame[frame.columns[0]] if series else frame
예제 #4
0
파일: parquet.py 프로젝트: rlugojr/dask
def _read_parquet_row_group(open, fn, index, columns, rg, series, categories,
                            helper, cs, dt, *args):
    """Read a single parquet row group and return it as a frame or a column.

    ``columns`` may be a list, a tuple, or a single column name; a single
    name is wrapped in a tuple and forces ``series`` to true.  A truthy
    ``index`` that is not already among ``columns`` is appended to them,
    keeping the container type of ``columns``.  ``rg.num_rows`` sizes the
    frame allocated by ``_pre_allocate``; ``helper`` is forwarded unchanged
    to ``read_row_group_file``, which fills the frame through the returned
    views.

    Returns the first column of the frame when ``series`` is true, otherwise
    the frame itself.  Extra positional arguments are ignored.
    """
    single_column = not isinstance(columns, (tuple, list))
    if single_column:
        series = True
        columns = (columns,)

    if index:
        if index not in columns:
            # Build the extra element with the same container type so the
            # concatenation works for both lists and tuples.
            index_part = type(columns)([index])
            columns = columns + index_part

    frame, targets = _pre_allocate(rg.num_rows, columns, categories, index,
                                   cs, dt)
    read_row_group_file(fn, rg, columns, categories, helper, cs,
                        open=open, assign=targets)

    return frame[frame.columns[0]] if series else frame
예제 #5
0
파일: parquet.py 프로젝트: postelrich/dask
def _read_parquet_row_group(open, fn, index, columns, rg, series, categories,
                            schema, cs, dt, scheme, *args):
    """Read a single parquet row group and return it as a frame or a column.

    ``columns`` may be a list, a tuple, or a single column name; a single
    name is wrapped in a tuple and forces ``series`` to true.  A truthy
    ``index`` that is not already among ``columns`` is appended to them,
    keeping the container type of ``columns``.  ``rg.num_rows`` sizes the
    frame allocated by fastparquet's ``_pre_allocate``, which
    ``read_row_group_file`` then fills through the returned views.

    Returns the first column of the frame when ``series`` is true, otherwise
    the frame itself.  Extra positional arguments are ignored.
    """
    from fastparquet.api import _pre_allocate
    from fastparquet.core import read_row_group_file

    single_column = not isinstance(columns, (tuple, list))
    if single_column:
        series = True
        columns = (columns,)

    if index:
        if index not in columns:
            # Build the extra element with the same container type so the
            # concatenation works for both lists and tuples.
            index_part = type(columns)([index])
            columns = columns + index_part

    frame, targets = _pre_allocate(rg.num_rows, columns, categories, index,
                                   cs, dt)
    read_row_group_file(fn, rg, columns, categories, schema, cs,
                        open=open, assign=targets, scheme=scheme)

    return frame[frame.columns[0]] if series else frame