Пример #1
0
    def generate_input(self):
        """Generate a random DataFrame and return the result of ``to_csv()``.

        In regression mode the generation parameters come from
        ``self.get_next_regression_params()``. Otherwise a new seed is
        drawn, the ``random`` module is re-seeded with it, column metadata
        is produced by ``_generate_rand_meta`` and the four values are
        recorded in ``self._current_params``.

        The generated frame is also stored in ``self._current_buffer``.
        """
        if not self._regression:
            seed = random.randint(0, 2**32 - 1)
            # Re-seed before drawing the metadata (presumably so the
            # metadata can be reproduced from the recorded seed -- confirm).
            random.seed(seed)
            candidate_dtypes = list(cudf.utils.dtypes.ALL_TYPES)
            meta = _generate_rand_meta(self, candidate_dtypes)
            dtypes_meta, num_rows, num_cols = meta

            # Record everything needed to replay this run.
            params = self._current_params
            params["dtypes_meta"] = dtypes_meta
            params["seed"] = seed
            params["num_rows"] = num_rows
            params["num_columns"] = num_cols
        else:
            replay = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replay

        logging.info(
            f"Generating DataFrame with rows: {num_rows} "
            f"and columns: {num_cols}"
        )
        arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        df = pyarrow_to_pandas(arrow_table)

        logging.info(f"Shape of DataFrame generated: {df.shape}")
        self._current_buffer = df
        return df.to_csv()
Пример #2
0
    def generate_input(self):
        """Generate a random DataFrame and return it unchanged.

        Regression mode replays the parameters returned by
        ``self.get_next_regression_params()``. Otherwise a seed is drawn
        and applied to the ``random`` module, metadata is generated from a
        restricted dtype pool, and the parameters are saved in
        ``self._current_params``.

        The frame is also stored in ``self._current_buffer``.
        """
        if self._regression:
            replay = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replay
        else:
            seed = random.randint(0, 2**32 - 1)
            random.seed(seed)

            # Build the dtype pool step by step, in the same order as the
            # set expression it replaces.
            dtype_pool = cudf.utils.dtypes.ALL_TYPES - {
                "category", "timedelta64[ns]", "datetime64[ns]"}
            # TODO: Remove uint32 below after this bug is fixed
            # https://github.com/pandas-dev/pandas/issues/37327
            dtype_pool = dtype_pool - {"uint32"}
            dtype_pool = dtype_pool | {"list", "decimal64"}
            dtypes_list = list(dtype_pool)

            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated

            recorded = self._current_params
            recorded["dtypes_meta"] = dtypes_meta
            recorded["seed"] = seed
            recorded["num_rows"] = num_rows
            recorded["num_columns"] = num_cols

        start_message = (f"Generating DataFrame with rows: {num_rows} "
                         f"and columns: {num_cols}")
        logging.info(start_message)

        arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        df = pyarrow_to_pandas(arrow_table)

        logging.info(f"Shape of DataFrame generated: {df.shape}")
        self._current_buffer = df
        return df
Пример #3
0
    def generate_input(self):
        """Generate a random DataFrame and return it as JSON lines.

        Parameters are replayed from ``self.get_next_regression_params()``
        in regression mode. Otherwise they are freshly generated (seed
        drawn and applied to ``random`` first) and recorded in
        ``self._current_params``.

        The frame is stored in ``self._current_buffer``; the return value
        is ``df.to_json(orient="records", lines=True)``.
        """
        if not self._regression:
            seed = random.randint(0, 2**32 - 1)
            random.seed(seed)

            # https://github.com/pandas-dev/pandas/issues/20599
            allowed = cudf.utils.dtypes.ALL_TYPES - {"uint64"}
            # TODO: Remove DATETIME_TYPES after this is fixed:
            # https://github.com/rapidsai/cudf/issues/6586
            allowed = allowed - set(cudf.utils.dtypes.DATETIME_TYPES)
            dtypes_list = list(allowed)

            dtypes_meta, num_rows, num_cols = _generate_rand_meta(
                self, dtypes_list)

            # Save the parameters of this run, in a fixed key order.
            for key, value in (
                ("dtypes_meta", dtypes_meta),
                ("seed", seed),
                ("num_rows", num_rows),
                ("num_columns", num_cols),
            ):
                self._current_params[key] = value
        else:
            (dtypes_meta, num_rows, num_cols,
             seed) = self.get_next_regression_params()

        logging.info(
            f"Generating DataFrame with rows: {num_rows} "
            f"and columns: {num_cols}"
        )
        arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        df = pyarrow_to_pandas(arrow_table)
        self._current_buffer = df
        logging.info(f"Shape of DataFrame generated: {df.shape}")

        return df.to_json(orient="records", lines=True)
Пример #4
0
    def generate_input(self):
        """Generate a random DataFrame restricted to the dtypes kept here.

        Regression mode reuses the parameters returned by
        ``self.get_next_regression_params()``. Otherwise metadata is
        generated first, then a seed is drawn, and both are recorded in
        ``self._current_params`` with the row and column counts.

        The frame is stored in ``self._df`` and returned.
        """
        if not self._regression:
            dtype_pool = cudf.utils.dtypes.ALL_TYPES - {"category"}
            # Following dtypes are not supported by orc
            # https://orc.apache.org/specification/ORCv0/
            for unsupported in (
                cudf.utils.dtypes.TIMEDELTA_TYPES,
                cudf.utils.dtypes.UNSIGNED_TYPES,
            ):
                dtype_pool = dtype_pool - unsupported
            dtypes_list = list(dtype_pool)

            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated
            # The seed is drawn only after the metadata has been generated.
            seed = random.randint(0, 2**32 - 1)

            recorded = self._current_params
            recorded["dtypes_meta"] = dtypes_meta
            recorded["seed"] = seed
            recorded["num_rows"] = num_rows
            recorded["num_cols"] = num_cols
        else:
            replay = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replay

        logging.info(
            f"Generating DataFrame with rows: {num_rows} "
            f"and columns: {num_cols}"
        )
        table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(table)
        logging.info(f"Shape of DataFrame generated: {table.shape}")
        self._df = frame
        return frame
Пример #5
0
    def generate_input(self):
        """Generate a random DataFrame and write it to a parquet file.

        Regression mode replays ``self.get_next_regression_params()``.
        Otherwise metadata is generated from a restricted dtype pool, a
        seed is drawn afterwards, and the parameters are recorded in
        ``self._current_params``.

        The frame is written with ``to_parquet`` to the relative path
        ``"temp_file"``, stored in ``self._df``, and the path is returned.
        """
        if self._regression:
            replay = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replay
        else:
            # Apply the exclusions and additions one at a time, in the
            # same order as the set expression they replace.
            dtype_pool = cudf.utils.dtypes.ALL_TYPES - {
                "category", "datetime64[ns]"}
            dtype_pool = dtype_pool - cudf.utils.dtypes.TIMEDELTA_TYPES
            # TODO: Remove uint32 below after this bug is fixed
            # https://github.com/pandas-dev/pandas/issues/37327
            dtype_pool = dtype_pool - {"uint32"}
            dtype_pool = dtype_pool | {"list", "decimal64"}
            dtypes_list = list(dtype_pool)

            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated
            seed = random.randint(0, 2 ** 32 - 1)

            recorded = self._current_params
            recorded["dtypes_meta"] = dtypes_meta
            recorded["seed"] = seed
            recorded["num_rows"] = num_rows
            recorded["num_cols"] = num_cols

        logging.info(f"Generating DataFrame with rows: {num_rows} "
                     f"and columns: {num_cols}")
        table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(table)
        logging.info(f"Shape of DataFrame generated: {table.shape}")

        # TODO: Change this to write into a BytesIO object (and keep its
        # bytes in self._current_buffer / return them) once the issue
        # below is fixed:
        # https://issues.apache.org/jira/browse/ARROW-10123
        output_path = "temp_file"
        frame.to_parquet(output_path)
        self._df = frame
        return output_path
Пример #6
0
Файл: orc.py Проект: rongou/cudf
    def generate_input(self):
        """Generate a random DataFrame and its ORC-encoded bytes.

        Regression mode replays ``self.get_next_regression_params()``.
        Otherwise metadata is generated from a restricted dtype pool, a
        seed is drawn afterwards, and the parameters are recorded in
        ``self._current_params``.

        The frame is stored in ``self._df``. It is then written through
        ``pandas_to_orc`` into an in-memory buffer, with a stripe size
        taken from ``self._rand(len(frame))``. A copy of the bytes is kept
        in ``self._current_buffer``.

        Returns a ``(frame, bytes)`` tuple.
        """
        if not self._regression:
            dtype_pool = cudf.utils.dtypes.ALL_TYPES - {"category"}
            # Following dtypes are not supported by orc
            # https://orc.apache.org/specification/ORCv0/
            dtype_pool = dtype_pool - cudf.utils.dtypes.TIMEDELTA_TYPES
            dtype_pool = dtype_pool - cudf.utils.dtypes.UNSIGNED_TYPES
            dtype_pool = dtype_pool - {"datetime64[ns]"}
            dtypes_list = list(dtype_pool)

            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated
            seed = random.randint(0, 2 ** 32 - 1)

            recorded = self._current_params
            recorded["dtypes_meta"] = dtypes_meta
            recorded["seed"] = seed
            recorded["num_rows"] = num_rows
            recorded["num_cols"] = num_cols
        else:
            replay = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replay

        logging.info(f"Generating DataFrame with rows: {num_rows} "
                     f"and columns: {num_cols}")
        table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(table)
        logging.info(f"Shape of DataFrame generated: {table.shape}")
        self._df = frame

        # Serialize into memory, then read the whole buffer back out.
        stripe_size = self._rand(len(frame))
        sink = io.BytesIO()
        pandas_to_orc(
            frame,
            file_io_obj=sink,
            stripe_size=stripe_size,
            arrow_table_schema=table.schema,
        )
        sink.seek(0)
        encoded = sink.read()
        self._current_buffer = copy.copy(encoded)
        return (frame, encoded)
Пример #7
0
    def generate_input(self):
        """Generate a random DataFrame and its Avro-encoded bytes.

        Regression mode replays ``self.get_next_regression_params()``.
        Otherwise metadata is generated from a restricted dtype pool, a
        seed is drawn afterwards, and the parameters are recorded in
        ``self._current_params``.

        The frame is stored in ``self._df``. It is then written through
        ``pandas_to_avro`` into an in-memory buffer, and a copy of the
        bytes is kept in ``self._current_buffer``.

        Returns a ``(frame, bytes)`` tuple.
        """
        if self._regression:
            replay = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replay
        else:
            dtype_pool = cudf.utils.dtypes.ALL_TYPES - {"category"}
            # No unsigned support in avro:
            # https://avro.apache.org/docs/current/spec.html
            dtype_pool = dtype_pool - cudf.utils.dtypes.UNSIGNED_TYPES
            # TODO: Remove DATETIME_TYPES once following bug is fixed:
            # https://github.com/rapidsai/cudf/issues/6482
            dtype_pool = dtype_pool - cudf.utils.dtypes.DATETIME_TYPES
            # TODO: Remove DURATION_TYPES once following bug is fixed:
            # https://github.com/rapidsai/cudf/issues/6604
            dtype_pool = dtype_pool - cudf.utils.dtypes.TIMEDELTA_TYPES
            dtypes_list = list(dtype_pool)

            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated
            seed = random.randint(0, 2**32 - 1)

            recorded = self._current_params
            recorded["dtypes_meta"] = dtypes_meta
            recorded["seed"] = seed
            recorded["num_rows"] = num_rows
            recorded["num_cols"] = num_cols

        logging.info(
            f"Generating DataFrame with rows: {num_rows} "
            f"and columns: {num_cols}"
        )
        table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(table)
        self._df = frame
        logging.info(f"Shape of DataFrame generated: {table.shape}")

        sink = io.BytesIO()
        pandas_to_avro(frame, file_io_obj=sink)
        # getvalue() returns the entire buffer regardless of the current
        # stream position, so no explicit rewind is needed.
        encoded = sink.getvalue()
        self._current_buffer = copy.copy(encoded)
        return (frame, encoded)
Пример #8
0
Файл: orc.py Проект: rongou/cudf
    def generate_input(self):
        """Generate a random DataFrame restricted to the dtypes kept here.

        Regression mode replays ``self.get_next_regression_params()``.
        Otherwise metadata is generated from a restricted dtype pool, a
        seed is drawn afterwards, and the parameters are recorded in
        ``self._current_params``.

        The frame is stored in ``self._df`` and returned.
        """
        if not self._regression:
            # Exclusions are applied left to right, in this order:
            #
            # 1. TODO: Remove "bool" from the first set after the
            #    following issue is fixed:
            #    https://github.com/rapidsai/cudf/issues/6763
            # 2. Timedelta and unsigned dtypes are not supported by orc
            #    https://orc.apache.org/specification/ORCv0/
            # 3. TODO: Remove `DATETIME_TYPES` once the following bug is
            #    fixed: https://github.com/rapidsai/cudf/issues/7355
            dtype_pool = cudf.utils.dtypes.ALL_TYPES
            for excluded in (
                {"category", "bool"},
                cudf.utils.dtypes.TIMEDELTA_TYPES,
                cudf.utils.dtypes.UNSIGNED_TYPES,
                cudf.utils.dtypes.DATETIME_TYPES,
            ):
                dtype_pool = dtype_pool - excluded
            dtypes_list = list(dtype_pool)

            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated
            seed = random.randint(0, 2 ** 32 - 1)

            for key, value in (
                ("dtypes_meta", dtypes_meta),
                ("seed", seed),
                ("num_rows", num_rows),
                ("num_cols", num_cols),
            ):
                self._current_params[key] = value
        else:
            replay = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replay

        logging.info(f"Generating DataFrame with rows: {num_rows} "
                     f"and columns: {num_cols}")
        table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(table)
        logging.info(f"Shape of DataFrame generated: {table.shape}")
        self._df = frame
        return frame