Example #1
def test_ParquetExporter_date_with_string(event, dates_file_date_string_value):
    # `event` and `dates_file_date_string_value` are pytest fixtures; the
    # `schema_dates` definition comes from the surrounding test module.
    (prefix, file) = dates_file_date_string_value()
    event_data = event(prefix, chunksize=None, schema=schema_dates)
    exporter = ParquetExporter(event_data)
    result = exporter.export()
    assert result["status"] == "CONVERSION_FAILED"
    assert len(result["errors"]) == 1
Example #2
def test_ParquetExporter_invalid_year_too_late(event,
                                               dates_file_year_too_late):
    (prefix, file) = dates_file_year_too_late()
    event_data = event(prefix, chunksize=None, schema=schema_dates)
    exporter = ParquetExporter(event_data)
    result = exporter.export()
    assert result["status"] == "CONVERSION_FAILED"
    assert len(result["errors"]) == 1
Example #3
def test_ParquetExporter_with_schema_wrong_number(event, schema_wrong):
    (prefix, file) = schema_wrong()
    event_data = event(prefix, chunksize=None, schema=SCHEMA)
    exporter = ParquetExporter(event_data)
    with pytest.raises(
        ConversionError,
        match=".*cannot safely convert passed user dtype of bool for int64.*",
    ):
        exporter.export()
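
The match argument is applied as a regular expression (via re.search) against the string form of the raised exception, so the pattern only needs to match a substring of the message. A self-contained sketch of the same pattern, with a stand-in ConversionError:

import pytest

class ConversionError(Exception):  # stand-in for the project's exception type
    pass

def _convert():
    raise ConversionError("cannot safely convert passed user dtype of bool for int64")

def test_raises_matches_message():
    # match= is re.search'ed against str(excinfo.value)
    with pytest.raises(ConversionError, match=r"bool for int64"):
        _convert()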
Example #4
def test_ParquetExporter_no_chunks(event, husholdninger_single):
    (prefix, file) = husholdninger_single()
    event_data = event(prefix, chunksize=None)
    exporter = ParquetExporter(event_data)
    exporter.export()
    expected = pd.read_csv(file)
    output_prefix = event_data["payload"]["output_dataset"]["s3_prefix"].replace(
        "%stage%", "intermediate"
    )

    result = pd.read_parquet(
        f"s3://{BUCKET}/{output_prefix}{event_data['task']}/husholdninger.parquet.gz"
    )
    # subset check: every row of result must also occur in expected
    assert len(result.merge(expected)) == len(result)
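
The final assertion relies on a pandas idiom: merge() defaults to an inner join over all shared columns, so as long as result has no duplicate rows, keeping its full length after merging with expected means every row of result also occurs in expected. A minimal standalone illustration:

import pandas as pd

left = pd.DataFrame({"id": [1, 2], "x": ["a", "b"]})
right = pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})

# inner join on all shared columns keeps exactly the rows present in both
assert len(left.merge(right)) == len(left)   # left is a subset of right
assert len(right.merge(left)) < len(right)   # right is not a subset of left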
Example #5
def export_and_read_result(event_data, outputprefix):
    # Helper: run the export, then read back every Parquet part the exporter
    # wrote under the task's output prefix and concatenate them into one frame.
    exporter = ParquetExporter(event_data)
    exporter.export()
    output_prefix = event_data["payload"]["output_dataset"]["s3_prefix"].replace(
        "%stage%", "intermediate"
    )
    fs = s3fs.core.S3FileSystem()
    result_path = f"{BUCKET}/{output_prefix}{event_data['task']}/{outputprefix}*"
    source_paths = fs.glob(path=result_path)

    result = pd.concat(
        pd.read_parquet(f"s3://{parquet_file}") for parquet_file in source_paths
    )
    return result
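
A hypothetical call site for this helper, assuming the same event and husholdninger_single fixtures (and the pandas import as pd) used in the other examples; the length assertion is illustrative only:

def test_ParquetExporter_reads_back_chunks(event, husholdninger_single):
    (prefix, file) = husholdninger_single()
    event_data = event(prefix, chunksize=2)
    result = export_and_read_result(event_data, "husholdninger")
    expected = pd.read_csv(file)
    # all exported chunks together should cover the whole input
    assert len(result) == len(expected)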
Example #6
def test_ParquetExporter_chunked(event, husholdninger_single):
    prefix, file = husholdninger_single()
    event_data = event(prefix, chunksize=2)
    exporter = ParquetExporter(event_data)
    output_prefix = event_data["payload"]["output_dataset"]["s3_prefix"].replace(
        "%stage%", "intermediate"
    )
    task = event_data["task"]

    with patch.object(exporter, "_parallel_export") as mocked_parallel_export:
        exporter.export()
        mocked_parallel_export.assert_called_once_with(
            "husholdninger",
            ANY,
            None,
            f"s3://{BUCKET}/{output_prefix}{task}/husholdninger",
        )
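
patch.object replaces the method on this one exporter instance for the duration of the with block, and mock.ANY compares equal to any value, which is how the test skips over the chunk-iterator argument it cannot compare exactly. A self-contained sketch of the same technique with a stand-in class:

from unittest.mock import ANY, patch

class Exporter:
    def _parallel_export(self, name, frames, schema, target):
        pass

    def export(self):
        self._parallel_export("households", iter([]), None, "s3://bucket/out/households")

def test_export_delegates_to_parallel_export():
    exporter = Exporter()
    with patch.object(exporter, "_parallel_export") as mocked:
        exporter.export()
        # ANY stands in for the argument we don't want to compare exactly
        mocked.assert_called_once_with("households", ANY, None, "s3://bucket/out/households")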
Example #7
def test_ParquetExporter_with_schema(event, schema):
    (prefix, file) = schema()
    event_data = event(prefix, chunksize=None, schema=SCHEMA)
    exporter = ParquetExporter(event_data)
    exporter.export()

    output_prefix = event_data["payload"]["output_dataset"]["s3_prefix"].replace(
        "%stage%", "intermediate"
    )

    result = pd.read_parquet(
        f"s3://{BUCKET}/{output_prefix}{event_data['task']}/schema.parquet.gz"
    )
    assert list(result.dtypes)[1].name == "bool"
    assert list(result.dtypes)[2].name == "bool"
    assert list(result.dtypes)[3].name == "float64"
    assert list(result.dtypes)[4].name == "datetime64[ns]"

    assert list(result["date"])[0] == pd.Timestamp("2020-03-14")
    assert list(result["date"])[1] == pd.Timestamp("2020-01-01")
    assert pd.isnull(list(result["date"])[2])
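
The positional list(result.dtypes)[i] lookups above work, but dtypes is a Series indexed by column name, so the same checks can be written against names and survive column reordering. A self-contained pandas sketch of the pattern (data values are illustrative only):

import pandas as pd

df = pd.DataFrame({
    "flag": [True, False, True],
    "value": [1.5, 2.0, None],
    "date": pd.to_datetime(["2020-03-14", "2020-01-01", None]),
})

# name-based dtype lookups instead of positional list(...)[i] indexing
assert df.dtypes["flag"].name == "bool"
assert df.dtypes["value"].name == "float64"
assert df.dtypes["date"].name == "datetime64[ns]"

# missing datetimes come back as NaT; NaT != NaT, so use pd.isnull()
assert df["date"].iloc[0] == pd.Timestamp("2020-03-14")
assert pd.isnull(df["date"].iloc[2])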