    def test_split_csv_with_compression(self, boto_mock):
        self.get_gziped_file()

        total = 0
        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv_gziped, chunksize=10
        ):
            chunk = pd.read_parquet(file_name)
            total += sum(chunk["Sunspots"])

            os.unlink(file_name)

        os.unlink(self.csv_gziped)

        self.assertEqual(total, 2187)

    def test_split_csv_with_dtype(self, boto_mock):
        self.get_files()

        total = 0
        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv1, chunksize=10, dtype={"Month": "utf8", "Sunspots": "float"}
        ):
            chunk = pd.read_parquet(file_name)
            total += sum(chunk["Sunspots"])

            os.unlink(file_name)

        os.unlink(self.csv1)

        self.assertEqual(total, 2187)

    def test_split_csv_with_skip_rows(self, boto_mock):
        self.get_skip_rows_file()

        total = 0
        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv_skiprows, chunksize=10, skip_rows=1
        ):
            chunk = pd.read_parquet(file_name)
            total += sum(chunk["Sunspots"])

            os.unlink(file_name)

        os.unlink(self.csv_skiprows)

        self.assertEqual(total, 2187)

    def test_parquetify_csv_valid(self, boto_mock):
        self.get_files()

        total = 0
        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv1, chunksize=10, dtype={"Month": "utf8", "Sunspots": "float"}
        ):
            file_path = os.path.join(os.getcwd(), file_name)
            df_temp = pd.read_parquet(file_path)

            total += sum(df_temp["Sunspots"])

            os.unlink(file_path)

        self.assertEqual(total, 2187)

    def test_parquetify_csv_with_extra_columns(self, boto_mock):
        self.get_files()

        def pre_process(chunk):
            chunk["Extra Column"] = None
            dtype = {"Month": "utf8", "Sunsports": "float", "Extra Column": "utf8"}

            return dtype, chunk

        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv1, chunksize=10, pre_process_chunk=pre_process,
        ):

            file_path = os.path.join(os.getcwd(), file_name)
            df_temp = pd.read_parquet(file_path)

            self.assertIn("Extra Column", list(df_temp.columns))

            os.unlink(file_path)

    def test_parquetify_csv_with_preprocessor(self, boto_mock):
        self.get_files()

        def pre_process(chunk):
            chunk["Sunspots"] = chunk["Sunspots"] + 1
            dtype = {"Month": "utf8", "Sunsports": "float"}

            return dtype, chunk

        total = 0
        for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
            file_name=self.csv1, chunksize=10, pre_process_chunk=pre_process,
        ):

            file_path = os.path.join(os.getcwd(), file_name)
            df_temp = pd.read_parquet(file_path)

            total += sum(df_temp["Sunspots"])

            os.unlink(file_path)

        # pre_process adds 1 to every row, so the total grows by the row count (28)
        self.assertEqual(total, 2187 + 28)
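
# The tests above show the pre-processor contract: a callable that receives each
# pandas chunk and returns a (dtype, chunk) tuple. Below is a minimal sketch of
# wiring the same hook through the public API. The bucket names and keys are
# placeholders, and it assumes convert_from_s3 forwards pre_process_chunk (and
# skip_rows/dtype) to the internal _parquetify exercised above; check the
# project's README to confirm the exact keyword arguments.
from s3_parquetifier import S3Parquetifier


def add_extra_column(chunk):
    # Same shape as the test pre-processors: mutate the chunk, return (dtype, chunk)
    chunk["Extra Column"] = None
    dtype = {"Month": "utf8", "Sunspots": "float", "Extra Column": "utf8"}
    return dtype, chunk


S3Parquetifier(
    source_bucket="source_bucket",    # placeholder bucket names
    target_bucket="target_bucket",
).convert_from_s3(
    source_key="sunspots.csv",        # hypothetical source object
    target_key="parquet/sunspots/",   # hypothetical target prefix
    chunk_size=10,
    pre_process_chunk=add_extra_column,
)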
Example #7
from s3_parquetifier import S3Parquetifier

# Call the converter
S3Parquetifier(
    source_bucket="aws-uggr-demo",
    target_bucket="aws-uggr-demo",
    verbose=True,  # enable verbose output
).convert_from_s3(
    source_key="titanic.csv",
    target_key="parquet/data/",
    chunk_size=100  # number of rows per parquet file
)
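
# To sanity-check the conversion, the parquet parts written under the target
# prefix can be read back with pandas. A minimal sketch, assuming pyarrow and
# s3fs are installed and the parquet files sit directly under the prefix above.
import pandas as pd

df = pd.read_parquet("s3://aws-uggr-demo/parquet/data/")

# Row count should match the source CSV; each chunk of 100 rows became one parquet file
print(len(df), list(df.columns))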