Exemplo n.º 1
0
def _read_parquet(fname, start, end, cols=None):
    """Read the rows ``[start, end)`` of a parquet file, row group by row group.

    Row positions are counted across the whole file in row-group order.
    Row groups lying entirely before ``start`` are skipped without being
    read, and reading stops once ``end`` has been reached.

    Args:
        fname: Path handed to ``ParquetFile``.
        start: Global position of the first row to return (inclusive).
        end: Global position one past the last row to return (exclusive),
            or ``None`` to read through the end of the file.
        cols: Columns to load; defaults to every column in the file.

    Returns:
        A list with one frame per row group that overlaps the requested
        range, trimmed to the range where the group only partly overlaps.
    """
    pf = ParquetFile(fname)
    if cols is None:
        cols = pf.columns
    i = 0  # global position of the first row of the current row group
    df_set = []
    for rg in pf.row_groups:
        last_idx_in_rg = i + rg.num_rows - 1
        if start <= last_idx_in_rg:
            # Scope the handle to this read so it is closed even when
            # read_row_group raises; it used to be left open.
            with pf.open(pf.fn) as f:
                df = pf.read_row_group(rg, cols, pf.categories, infile=f)
            # NOTE(review): the masks below compare df.index against
            # positions relative to the row group, which assumes each
            # frame carries a 0-based positional index -- confirm.
            filters = []
            if start > i:
                filters.append(df.index >= (start - i))
            # `end` is exclusive, so the row sitting exactly at `end` must
            # be dropped as well: `<=`, not `<`.
            if end is not None and end <= last_idx_in_rg:
                filters.append(df.index < (end - i))
            if filters:
                _LOG.warning('unaligned chunk fname:[%s] start:[%s] end:[%s]',
                             fname, start, end)
                df = df[functools.reduce(operator.and_, filters)]
            df_set.append(df)
        i += rg.num_rows
        if end is not None and i >= end:
            break
    return df_set
Exemplo n.º 2
0
 def chunk(self):
     """Read and return the first row group of the file at ``self.path``.

     All columns listed in ``pf.columns`` are loaded, using the file's own
     ``pf.categories`` setting.

     Returns:
         Whatever ``ParquetFile.read_row_group`` returns for the first
         row group (presumably a DataFrame -- confirm against the
         library version in use).

     Note:
         ``pf.row_groups[0]`` is indexed unconditionally, so a file
         without row groups fails at that lookup.
     """
     pf = ParquetFile(self.path)
     rg = pf.row_groups[0]
     # Scope the handle to the read so it is closed afterwards; it used to
     # be opened inline in the call and never released.
     with pf.open(pf.fn) as infile:
         df = pf.read_row_group(rg,
                                pf.columns,
                                categories=pf.categories,
                                infile=infile)
     return df