def test_split_csv_with_compression(self, boto_mock):
    self.get_gziped_file()

    total = 0
    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv_gziped, chunksize=10
    ):
        chunk = pd.read_parquet(file_name)
        total += sum(chunk["Sunspots"])
        os.unlink(file_name)

    os.unlink(self.csv_gziped)
    self.assertEqual(total, 2187)
def test_split_csv_with_dtype(self, boto_mock):
    self.get_files()

    total = 0
    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv1,
        chunksize=10,
        dtype={"Month": "utf8", "Sunspots": "float"},
    ):
        chunk = pd.read_parquet(file_name)
        total += sum(chunk["Sunspots"])
        os.unlink(file_name)

    os.unlink(self.csv1)
    self.assertEqual(total, 2187)
def test_split_csv_with_skip_rows(self, boto_mock):
    self.get_skip_rows_file()

    total = 0
    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv_skiprows, chunksize=10, skip_rows=1
    ):
        chunk = pd.read_parquet(file_name)
        total += sum(chunk["Sunspots"])
        os.unlink(file_name)

    os.unlink(self.csv_skiprows)
    self.assertEqual(total, 2187)
def test_parquetify_csv_valid(self, boto_mock):
    self.get_files()

    total = 0
    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv1,
        chunksize=10,
        dtype={"Month": "utf8", "Sunspots": "float"},
    ):
        file_path = os.path.join(os.getcwd(), file_name)
        df_temp = pd.read_parquet(file_path)
        total += sum(df_temp["Sunspots"])
        os.unlink(file_path)

    self.assertEqual(total, 2187)
def test_parquetify_csv_with_extra_columns(self, boto_mock):
    self.get_files()

    def pre_process(chunk):
        # Add a new column to every chunk and return the matching dtype mapping.
        chunk["Extra Column"] = None
        dtype = {"Month": "utf8", "Sunspots": "float", "Extra Column": "utf8"}
        return dtype, chunk

    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv1,
        chunksize=10,
        pre_process_chunk=pre_process,
    ):
        file_path = os.path.join(os.getcwd(), file_name)
        df_temp = pd.read_parquet(file_path)
        self.assertIn("Extra Column", list(df_temp.columns))
        os.unlink(file_path)
def test_parquetify_csv_with_preprocessor(self, boto_mock):
    self.get_files()

    def pre_process(chunk):
        # Shift every value by one, so the expected total grows by the row count (28).
        chunk["Sunspots"] = chunk["Sunspots"] + 1
        dtype = {"Month": "utf8", "Sunspots": "float"}
        return dtype, chunk

    total = 0
    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv1,
        chunksize=10,
        pre_process_chunk=pre_process,
    ):
        file_path = os.path.join(os.getcwd(), file_name)
        df_temp = pd.read_parquet(file_path)
        total += sum(df_temp["Sunspots"])
        os.unlink(file_path)

    self.assertEqual(total, 2187 + 28)
from s3_parquetifier import S3Parquetifier

# Call the converter
S3Parquetifier(
    source_bucket="aws-uggr-demo",
    target_bucket="aws-uggr-demo",
    verbose=True,  # enable progress output
).convert_from_s3(
    source_key="titanic.csv",
    target_key="parquet/data/",
    chunk_size=100,  # the number of rows per Parquet file
)
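# The tests above exercise a per-chunk pre-processing hook: a callable that
# receives each chunk and returns a (dtype, chunk) pair before it is written
# to Parquet. The sketch below is a minimal illustration of that contract,
# assuming convert_from_s3 forwards a pre_process_chunk callback the same way
# the internal _parquetify call does; the bucket names, keys, and the
# add_source_column helper are placeholders, not part of the library.
from s3_parquetifier import S3Parquetifier


def add_source_column(chunk):
    # Tag every row with a constant column and describe its type.
    chunk["Source"] = "titanic.csv"
    dtype = {"Source": "utf8"}
    return dtype, chunk


S3Parquetifier(
    source_bucket="aws-uggr-demo",
    target_bucket="aws-uggr-demo",
).convert_from_s3(
    source_key="titanic.csv",
    target_key="parquet/data/",
    chunk_size=100,
    pre_process_chunk=add_source_column,  # assumed keyword, mirroring the test suite
)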