def test_split_csv_with_compression(self, boto_mock):
    self.get_gziped_file()

    total = 0
    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv_gziped, chunksize=10
    ):
        chunk = pd.read_parquet(file_name)
        total += sum(chunk["Sunspots"])
        os.unlink(file_name)

    os.unlink(self.csv_gziped)
    self.assertEqual(total, 2187)
def test_split_csv_with_dtype(self, boto_mock):
    self.get_files()

    total = 0
    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv1,
        chunksize=10,
        dtype={"Month": "utf8", "Sunspots": "float"},
    ):
        chunk = pd.read_parquet(file_name)
        total += sum(chunk["Sunspots"])
        os.unlink(file_name)

    os.unlink(self.csv1)
    self.assertEqual(total, 2187)
def test_split_csv_with_skip_rows(self, boto_mock):
    self.get_skip_rows_file()

    total = 0
    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv_skiprows, chunksize=10, skip_rows=1
    ):
        chunk = pd.read_parquet(file_name)
        total += sum(chunk["Sunspots"])
        os.unlink(file_name)

    os.unlink(self.csv_skiprows)
    self.assertEqual(total, 2187)
def test_parquetify_csv_valid(self, boto_mock):
    self.get_files()

    total = 0
    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv1,
        chunksize=10,
        dtype={"Month": "utf8", "Sunspots": "float"},
    ):
        file_path = os.path.join(os.getcwd(), file_name)
        df_temp = pd.read_parquet(file_path)
        total += sum(df_temp["Sunspots"])
        os.unlink(file_path)

    self.assertEqual(total, 2187)
def test_parquetify_csv_with_extra_columns(self, boto_mock):
    self.get_files()

    def pre_process(chunk):
        # Add a new column to every chunk and return the matching dtype mapping.
        chunk["Extra Column"] = None
        dtype = {"Month": "utf8", "Sunspots": "float", "Extra Column": "utf8"}
        return dtype, chunk

    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv1,
        chunksize=10,
        pre_process_chunk=pre_process,
    ):
        file_path = os.path.join(os.getcwd(), file_name)
        df_temp = pd.read_parquet(file_path)
        self.assertIn("Extra Column", list(df_temp.columns))
        os.unlink(file_path)
def test_parquetify_csv_with_preprocessor(self, boto_mock):
    self.get_files()

    def pre_process(chunk):
        # Shift every value by one, so the expected total grows by the row count (28).
        chunk["Sunspots"] = chunk["Sunspots"] + 1
        dtype = {"Month": "utf8", "Sunspots": "float"}
        return dtype, chunk

    total = 0
    for file_name in S3Parquetifier("source_bucket", "target_bucket")._parquetify(
        file_name=self.csv1,
        chunksize=10,
        pre_process_chunk=pre_process,
    ):
        file_path = os.path.join(os.getcwd(), file_name)
        df_temp = pd.read_parquet(file_path)
        total += sum(df_temp["Sunspots"])
        os.unlink(file_path)

    self.assertEqual(total, 2187 + 28)
from s3_parquetifier import S3Parquetifier

# Call the converter
S3Parquetifier(
    source_bucket="aws-uggr-demo",
    target_bucket="aws-uggr-demo",
    verbose=True,  # enable progress output
).convert_from_s3(
    source_key="titanic.csv",
    target_key="parquet/data/",
    chunk_size=100,  # the number of rows per Parquet file
)
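# The tests above exercise a per-chunk pre-processing hook: a callable that
# receives each chunk and returns a (dtype, chunk) pair before it is written
# to Parquet. The sketch below is a minimal illustration of that contract,
# assuming convert_from_s3 forwards a pre_process_chunk callback the same way
# the internal _parquetify call does; the bucket names, keys, and the
# add_source_column helper are placeholders, not part of the library.
from s3_parquetifier import S3Parquetifier


def add_source_column(chunk):
    # Tag every row with a constant column and describe its type.
    chunk["Source"] = "titanic.csv"
    dtype = {"Source": "utf8"}
    return dtype, chunk


S3Parquetifier(
    source_bucket="aws-uggr-demo",
    target_bucket="aws-uggr-demo",
).convert_from_s3(
    source_key="titanic.csv",
    target_key="parquet/data/",
    chunk_size=100,
    pre_process_chunk=add_source_column,  # assumed keyword, mirroring the test suite
)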