def _read_parquet_with_type_promotion_override(chunk, columns, parquet_engine):
    """
    Read a parquet file into a DataFrame, working around fastparquet type promotion.

    This wrapper of pd.read_parquet() is a hack intended to fix the type
    promotion problem when using fastparquet as the underlying parquet engine.
    With fastparquet, boolean columns containing None values are promoted to
    float16 columns, which is inconsistent with Pandas/Pyarrow behavior (they
    promote such columns to object columns). float16 (halffloat) is not a
    supported type in the parquet spec, so writing such a frame back out would
    fail. This function detects the affected columns and overrides the
    promotion back to object columns of True/False/None.

    :param chunk: path-like location of the parquet file to read
    :param columns: list of column names to load
    :param parquet_engine: pandas parquet engine name ('fastparquet' or other)
    :return: the loaded pandas DataFrame
    """
    if parquet_engine == 'fastparquet':
        from fastparquet import ParquetFile as _ParquetFile
        import fastparquet.thrift_structures as _ts

        # https://github.com/dask/fastparquet/issues/414#issuecomment-478983811
        df = _pd.read_parquet(chunk, columns=columns, engine=parquet_engine, index=False)
        df_column_types = df.dtypes
        pf = _ParquetFile(chunk)
        schema_column_dtypes = {l.name: l.type for l in list(pf.schema.schema_elements)}

        for idx in df_column_types[df_column_types == 'float16'].index.tolist():
            # A hacky way to get the string representations of the column types of a parquet schema
            # Reference:
            # https://github.com/dask/fastparquet/blob/f4ecc67f50e7bf98b2d0099c9589c615ea4b06aa/fastparquet/schema.py
            if _ts.parquet_thrift.Type._VALUES_TO_NAMES[schema_column_dtypes[idx]] == "BOOLEAN":
                df[idx] = df[idx].astype('object')
                # NOTE: `pandas.np` was deprecated in pandas 1.0 and removed in 2.0;
                # float('nan') is the same NaN value. Assign the result instead of
                # using inplace=True on a column selection, which is a
                # chained-assignment pattern that can silently act on a copy.
                df[idx] = df[idx].replace({0: False, 1: True, float('nan'): None})
    else:
        df = _pd.read_parquet(chunk, columns=columns, engine=parquet_engine)
    return df
def _read(self, chunk: os.PathLike, columns: typing.List[str], **kwargs) -> pandas.DataFrame:
    """
    Read a parquet chunk with fastparquet, overriding its bool-column promotion.

    fastparquet promotes boolean columns containing None to float16, which is
    not a parquet-writable type; detect such columns via the file's parquet
    schema and convert them back to object columns of True/False/None.

    :param chunk: path-like location of the parquet file to read
    :param columns: list of column names to load
    :param kwargs: unused here; accepted for interface compatibility
    :return: the loaded pandas DataFrame
    """
    from fastparquet import ParquetFile as _ParquetFile
    from fastparquet import thrift_structures as _ts

    # TODO Follow up to figure out if this is not needed anymore
    # https://github.com/dask/fastparquet/issues/414#issuecomment-478983811
    df = pandas.read_parquet(chunk, columns=columns, engine=self.PARQUET_ENGINE, index=False)
    df_column_types = df.dtypes
    pf = _ParquetFile(chunk)
    schema_column_dtypes = {l.name: l.type for l in list(pf.schema.schema_elements)}

    for idx in df_column_types[df_column_types == "float16"].index.tolist():
        # A hacky way to get the string representations of the column types of a parquet schema
        # Reference:
        # https://github.com/dask/fastparquet/blob/f4ecc67f50e7bf98b2d0099c9589c615ea4b06aa/fastparquet/schema.py
        if _ts.parquet_thrift.Type._VALUES_TO_NAMES[schema_column_dtypes[idx]] == "BOOLEAN":
            df[idx] = df[idx].astype("object")
            # NOTE: `pandas.np` was deprecated in pandas 1.0 and removed in 2.0;
            # float("nan") is the same NaN value. Assign the result instead of
            # using inplace=True on a column selection, which is a
            # chained-assignment pattern that can silently act on a copy.
            df[idx] = df[idx].replace({0: False, 1: True, float("nan"): None})
    return df
def _read(self, chunk: os.PathLike, columns: typing.List[str], **kwargs) -> pandas.DataFrame:
    """
    Load a parquet chunk into a DataFrame restricted to the requested columns.

    :param chunk: path-like location of the parquet file to read
    :param columns: list of column names to load
    :param kwargs: forwarded verbatim to ``pandas.read_parquet``
    :return: the loaded pandas DataFrame
    """
    engine = self.PARQUET_ENGINE
    return pandas.read_parquet(chunk, columns=columns, engine=engine, **kwargs)