def test_ParquetExporter_with_schema_wrong_number(event, schema_wrong):
    prefix, file = schema_wrong()
    event_data = event(prefix, chunksize=None, schema=SCHEMA)
    exporter = ParquetExporter(event_data)
    with pytest.raises(
        ConversionError,
        match=".*cannot safely convert passed user dtype of bool for int64.*",
    ):
        exporter.export()


def test_ParquetExporter_no_chunks(event, husholdninger_single):
    prefix, file = husholdninger_single()
    event_data = event(prefix, chunksize=None)
    exporter = ParquetExporter(event_data)
    exporter.export()

    expected = pd.read_csv(file)
    output_prefix = event_data["payload"]["output_dataset"]["s3_prefix"].replace(
        "%stage%", "intermediate"
    )
    result = pd.read_parquet(
        f"s3://{BUCKET}/{output_prefix}{event_data['task']}/husholdninger.parquet.gz"
    )
    # result is a subset of expected
    assert len(result.merge(expected)) == len(result)


def export_and_read_result(event_data, outputprefix):
    """Run the exporter, then read back and concatenate all matching
    parquet parts from the intermediate S3 output location."""
    exporter = ParquetExporter(event_data)
    exporter.export()
    output_prefix = event_data["payload"]["output_dataset"]["s3_prefix"].replace(
        "%stage%", "intermediate"
    )
    fs = s3fs.core.S3FileSystem()
    result_path = f"{BUCKET}/{output_prefix}{event_data['task']}/{outputprefix}*"
    source_paths = fs.glob(path=result_path)
    result = pd.concat(
        pd.read_parquet(f"s3://{parquet_file}") for parquet_file in source_paths
    )
    return result


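# Usage sketch (hypothetical, for illustration only): a chunked-export test
# could use the helper above to gather all exported parts and compare them
# against the source CSV, e.g.:
#
#     result = export_and_read_result(event_data, "husholdninger")
#     assert len(result.merge(expected)) == len(result)

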
def test_ParquetExporter_date_with_string(event, dates_file_date_string_value):
    prefix, file = dates_file_date_string_value()
    event_data = event(prefix, chunksize=None, schema=schema_dates)
    exporter = ParquetExporter(event_data)
    result = exporter.export()
    assert result["status"] == "CONVERSION_FAILED"
    assert len(result["errors"]) == 1


def test_ParquetExporter_chunked(event, husholdninger_single):
    prefix, file = husholdninger_single()
    event_data = event(prefix, chunksize=2)
    exporter = ParquetExporter(event_data)
    output_prefix = event_data["payload"]["output_dataset"]["s3_prefix"].replace(
        "%stage%", "intermediate"
    )
    task = event_data["task"]
    with patch.object(exporter, "_parallel_export") as mocked_parallel_export:
        exporter.export()
        mocked_parallel_export.assert_called_once_with(
            "husholdninger",
            ANY,
            None,
            f"s3://{BUCKET}/{output_prefix}{task}/husholdninger",
        )


def test_ParquetExporter_invalid_year_too_late(event, dates_file_year_too_late):
    prefix, file = dates_file_year_too_late()
    event_data = event(prefix, chunksize=None, schema=schema_dates)
    exporter = ParquetExporter(event_data)
    result = exporter.export()
    assert result["status"] == "CONVERSION_FAILED"
    assert len(result["errors"]) == 1


def test_ParquetExporter_with_schema(event, schema):
    prefix, file = schema()
    event_data = event(prefix, chunksize=None, schema=SCHEMA)
    exporter = ParquetExporter(event_data)
    exporter.export()

    output_prefix = event_data["payload"]["output_dataset"]["s3_prefix"].replace(
        "%stage%", "intermediate"
    )
    result = pd.read_parquet(
        f"s3://{BUCKET}/{output_prefix}{event_data['task']}/schema.parquet.gz"
    )
    # Column dtypes should be cast according to SCHEMA.
    assert list(result.dtypes)[1].name == "bool"
    assert list(result.dtypes)[2].name == "bool"
    assert list(result.dtypes)[3].name == "float64"
    assert list(result.dtypes)[4].name == "datetime64[ns]"
    # Date values should be parsed to timestamps; missing dates become null.
    assert list(result["date"])[0] == pd.Timestamp("2020-03-14")
    assert list(result["date"])[1] == pd.Timestamp("2020-01-01")
    assert pd.isnull(list(result["date"])[2])