예제 #1
0
파일: _csv.py 프로젝트: xhochy/kartothek
    def restore_dataframe(
        store: KeyValueStore,
        key: str,
        filter_query: Optional[str] = None,
        columns: Optional[Iterable[str]] = None,
        predicate_pushdown_to_io: Any = None,
        categories: Optional[Iterable[str]] = None,
        predicates: Optional[PredicatesType] = None,
        date_as_object: Any = None,
        **kwargs,
    ):
        """Load a semicolon-separated CSV blob from ``store`` into a DataFrame.

        The blob stored under ``key`` is parsed with ``pd.read_csv`` (``sep=";"``,
        UTF-8). Keys ending in ``.csv.gz`` are read as gzip, keys ending in
        ``.csv`` are read uncompressed.

        Parameters
        ----------
        store
            Object whose ``get(key)`` returns the raw bytes of the CSV blob.
        key
            Storage key; must end in ``.csv`` or ``.csv.gz``.
        filter_query
            Passed to ``filter_df`` when no ``predicates`` are given.
        columns
            Columns to load. ``None`` loads all columns; an empty (but not
            ``None``) collection yields a frame with no columns.
        predicate_pushdown_to_io
            Accepted but not used by this implementation.
        categories
            Columns to read with the ``"category"`` dtype.
        predicates
            Validated via ``check_predicates``; when truthy, the frame is
            filtered with ``filter_df_from_predicates`` instead of
            ``filter_query``.
        date_as_object
            Accepted but not used by this implementation.
        **kwargs
            Accepted but not used by this implementation.

        Returns
        -------
        Whatever ``filter_df_from_predicates`` / ``filter_df`` return for the
        loaded frame (presumably the filtered ``pd.DataFrame`` -- confirm
        against those helpers).

        Raises
        ------
        ValueError
            If ``key`` has neither a ``.csv`` nor a ``.csv.gz`` suffix.
        """
        check_predicates(predicates)

        compression: Optional[str]
        if key.endswith(".csv.gz"):
            compression = "gzip"
        elif key.endswith(".csv"):
            compression = None
        else:
            # Previously `compression` stayed unbound here and the read below
            # failed with an opaque UnboundLocalError; fail early and clearly.
            raise ValueError(
                f"Unsupported key {key!r}: expected a '.csv' or '.csv.gz' suffix"
            )

        # An explicitly empty column selection cannot be forwarded to
        # pd.read_csv (it returns a DataFrame without any row), so read
        # everything and drop all columns afterwards.
        project_to_no_cols = (not columns) and (columns is not None)
        if project_to_no_cols:
            columns = None

        dtype: Optional[Dict[str, str]]
        if categories:
            dtype = {cat: "category" for cat in categories}
        else:
            dtype = None

        try:
            df = pd.read_csv(
                BytesIO(store.get(key)),
                compression=compression,
                sep=";",
                encoding="utf-8",
                usecols=columns,
                dtype=dtype,
            )
        except EmptyDataError:
            df = pd.DataFrame()
        else:
            if project_to_no_cols:
                df = df[[]]
            # Row count on purpose (not `df.empty`): a frame with rows but no
            # columns must keep its index.
            if len(df) == 0:
                # in that case, Pandas decided to use a weird index type, let's fix that
                df.index = pd.RangeIndex(start=0, stop=0, step=1)

        if predicates:
            return filter_df_from_predicates(df, predicates)
        return filter_df(df, filter_query)