def _read_parquet_with_type_promotion_override(chunk, columns, parquet_engine):
    """
    Read a parquet file into a DataFrame, working around fastparquet type promotion.

    This wrapper of pd.read_parquet() is a hack intended to fix the type
    promotion problem when using fastparquet as the underlying parquet engine.
    With fastparquet, boolean columns containing None values are promoted to
    float16 columns, which is inconsistent with Pandas/Pyarrow behavior (they
    promote such columns to object columns). float16 (halffloat) is not a
    supported type in the parquet spec, so writing such a frame back out would
    fail. This function detects the affected columns and overrides the
    promotion back to object columns of True/False/None.

    :param chunk: path-like location of the parquet file to read
    :param columns: list of column names to load
    :param parquet_engine: pandas parquet engine name ('fastparquet' or other)
    :return: the loaded pandas DataFrame
    """
    if parquet_engine == 'fastparquet':
        from fastparquet import ParquetFile as _ParquetFile
        import fastparquet.thrift_structures as _ts

        # https://github.com/dask/fastparquet/issues/414#issuecomment-478983811
        df = _pd.read_parquet(chunk, columns=columns, engine=parquet_engine, index=False)
        df_column_types = df.dtypes
        pf = _ParquetFile(chunk)
        schema_column_dtypes = {l.name: l.type for l in list(pf.schema.schema_elements)}

        for idx in df_column_types[df_column_types == 'float16'].index.tolist():
            # A hacky way to get the string representations of the column types of a parquet schema
            # Reference:
            # https://github.com/dask/fastparquet/blob/f4ecc67f50e7bf98b2d0099c9589c615ea4b06aa/fastparquet/schema.py
            if _ts.parquet_thrift.Type._VALUES_TO_NAMES[schema_column_dtypes[idx]] == "BOOLEAN":
                df[idx] = df[idx].astype('object')
                # NOTE: `pandas.np` was deprecated in pandas 1.0 and removed in 2.0;
                # float('nan') is the same NaN value. Assign the result instead of
                # using inplace=True on a column selection, which is a
                # chained-assignment pattern that can silently act on a copy.
                df[idx] = df[idx].replace({0: False, 1: True, float('nan'): None})
    else:
        df = _pd.read_parquet(chunk, columns=columns, engine=parquet_engine)
    return df
def _read(self, chunk: os.PathLike, columns: typing.List[str], **kwargs) -> pandas.DataFrame:
    """
    Read a parquet chunk with fastparquet, overriding its bool-column promotion.

    fastparquet promotes boolean columns containing None to float16, which is
    not a parquet-writable type; detect such columns via the file's parquet
    schema and convert them back to object columns of True/False/None.

    :param chunk: path-like location of the parquet file to read
    :param columns: list of column names to load
    :param kwargs: unused here; accepted for interface compatibility
    :return: the loaded pandas DataFrame
    """
    from fastparquet import ParquetFile as _ParquetFile
    from fastparquet import thrift_structures as _ts

    # TODO Follow up to figure out if this is not needed anymore
    # https://github.com/dask/fastparquet/issues/414#issuecomment-478983811
    df = pandas.read_parquet(chunk, columns=columns, engine=self.PARQUET_ENGINE, index=False)
    df_column_types = df.dtypes
    pf = _ParquetFile(chunk)
    schema_column_dtypes = {l.name: l.type for l in list(pf.schema.schema_elements)}

    for idx in df_column_types[df_column_types == "float16"].index.tolist():
        # A hacky way to get the string representations of the column types of a parquet schema
        # Reference:
        # https://github.com/dask/fastparquet/blob/f4ecc67f50e7bf98b2d0099c9589c615ea4b06aa/fastparquet/schema.py
        if _ts.parquet_thrift.Type._VALUES_TO_NAMES[schema_column_dtypes[idx]] == "BOOLEAN":
            df[idx] = df[idx].astype("object")
            # NOTE: `pandas.np` was deprecated in pandas 1.0 and removed in 2.0;
            # float("nan") is the same NaN value. Assign the result instead of
            # using inplace=True on a column selection, which is a
            # chained-assignment pattern that can silently act on a copy.
            df[idx] = df[idx].replace({0: False, 1: True, float("nan"): None})
    return df
def _read(self, chunk: os.PathLike, columns: typing.List[str], **kwargs) -> pandas.DataFrame:
    """
    Load a parquet chunk into a DataFrame restricted to the requested columns.

    :param chunk: path-like location of the parquet file to read
    :param columns: list of column names to load
    :param kwargs: forwarded verbatim to ``pandas.read_parquet``
    :return: the loaded pandas DataFrame
    """
    engine = self.PARQUET_ENGINE
    return pandas.read_parquet(chunk, columns=columns, engine=engine, **kwargs)