    def test_split_csv_with_compression(self, boto_mock):
        self.get_gziped_file()

        total = 0
        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv_gziped, chunksize=10
        ):
            chunk = pd.read_parquet(file_name)
            total += sum(chunk["Sunspots"])

            os.unlink(file_name)

        os.unlink(self.csv_gziped)

        self.assertEqual(total, 2187)

    def test_split_csv_with_dtype(self, boto_mock):
        self.get_files()

        total = 0
        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv1, chunksize=10, dtype={"Month": "utf8", "Sunspots": "float"}
        ):
            chunk = pd.read_parquet(file_name)
            total += sum(chunk["Sunspots"])

            os.unlink(file_name)

        os.unlink(self.csv1)

        self.assertEqual(total, 2187)

    def test_split_csv_with_skip_rows(self, boto_mock):
        self.get_skip_rows_file()

        total = 0
        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv_skiprows, chunksize=10, skip_rows=1
        ):
            chunk = pd.read_parquet(file_name)
            total += sum(chunk["Sunspots"])

            os.unlink(file_name)

        os.unlink(self.csv_skiprows)

        self.assertEqual(total, 2187)

    def test_parquetify_csv_valid(self, boto_mock):
        self.get_files()

        total = 0
        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv1, chunksize=10, dtype={"Month": "utf8", "Sunspots": "float"}
        ):
            file_path = os.path.join(os.getcwd(), file_name)
            df_temp = pd.read_parquet(file_path)

            total += sum(df_temp["Sunspots"])

            os.unlink(file_path)

        self.assertEqual(total, 2187)

    def test_parquetify_csv_with_extra_columns(self, boto_mock):
        self.get_files()

        def pre_process(chunk):
            chunk["Extra Column"] = None
            dtype = {"Month": "utf8", "Sunsports": "float", "Extra Column": "utf8"}

            return dtype, chunk

        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv1, chunksize=10, pre_process_chunk=pre_process,
        ):

            file_path = os.path.join(os.getcwd(), file_name)
            df_temp = pd.read_parquet(file_path)

            self.assertIn("Extra Column", list(df_temp.columns))

            os.unlink(file_path)

    def test_parquetify_csv_with_preprocessor(self, boto_mock):
        self.get_files()

        def pre_process(chunk):
            chunk["Sunspots"] = chunk["Sunspots"] + 1
            dtype = {"Month": "utf8", "Sunsports": "float"}

            return dtype, chunk

        total = 0
        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv1, chunksize=10, pre_process_chunk=pre_process,
        ):

            file_path = os.path.join(os.getcwd(), file_name)
            df_temp = pd.read_parquet(file_path)

            total += sum(df_temp["Sunspots"])

            os.unlink(file_path)

        # pre_process adds 1 to every row, so the total grows by the row count (28)
        self.assertEqual(total, 2187 + 28)
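
# The tests above show the pre-processor contract: a callable that receives each
# pandas chunk and returns a (dtype, chunk) tuple. Below is a minimal sketch of
# wiring the same hook through the public API. The bucket names and keys are
# placeholders, and it assumes convert_from_s3 forwards pre_process_chunk (and
# skip_rows/dtype) to the internal _parquetify exercised above; check the
# project's README to confirm the exact keyword arguments.
from s3_parquetifier import S3Parquetifier


def add_extra_column(chunk):
    # Same shape as the test pre-processors: mutate the chunk, return (dtype, chunk)
    chunk["Extra Column"] = None
    dtype = {"Month": "utf8", "Sunspots": "float", "Extra Column": "utf8"}
    return dtype, chunk


S3Parquetifier(
    source_bucket="source_bucket",    # placeholder bucket names
    target_bucket="target_bucket",
).convert_from_s3(
    source_key="sunspots.csv",        # hypothetical source object
    target_key="parquet/sunspots/",   # hypothetical target prefix
    chunk_size=10,
    pre_process_chunk=add_extra_column,
)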
Example #7
from s3_parquetifier import S3Parquetifier

# Call the converter
S3Parquetifier(
    source_bucket="aws-uggr-demo",
    target_bucket="aws-uggr-demo",
    verbose=True,  # enable verbose output
).convert_from_s3(
    source_key="titanic.csv",
    target_key="parquet/data/",
    chunk_size=100  # number of rows per parquet file
)
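
# To sanity-check the conversion, the parquet parts written under the target
# prefix can be read back with pandas. A minimal sketch, assuming pyarrow and
# s3fs are installed and the parquet files sit directly under the prefix above.
import pandas as pd

df = pd.read_parquet("s3://aws-uggr-demo/parquet/data/")

# Row count should match the source CSV; each chunk of 100 rows became one parquet file
print(len(df), list(df.columns))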