Example #1
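# A minimal sketch of the imports and parametrize decorator this test needs
# to run. The real fixture list is not shown in the snippet, so the
# column/category pairs below are illustrative assumptions, not the actual
# test data.
import pytest
from typing import Any

from mojap_metadata import Metadata


@pytest.mark.parametrize(
    "col_input,expected_cat",
    [
        ([{"name": "a", "type": "int64"}], "integer"),
        ([{"name": "b", "type": "string"}], "string"),
    ],
)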
def test_set_col_type_category_from_types(col_input: Any, expected_cat: str):
    meta = Metadata(columns=col_input)
    meta.set_col_type_category_from_types()
    assert meta.columns[0]["type_category"] == expected_cat
Example #2
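# A hedged sketch of this test's setup. The private-helper import path and
# the unpacked output format are assumptions based on mojap-metadata's
# documented behaviour, not taken from the snippet itself.
import pytest

from mojap_metadata import Metadata
from mojap_metadata.metadata.metadata import _unpack_complex_data_type


@pytest.mark.parametrize(
    "data_type,expected",
    [
        # Assumption: simple types unpack to themselves.
        ("int64", "int64"),
        # Assumption: complex types unpack into nested dicts.
        ("struct<num:int64>", {"struct": {"num": "int64"}}),
    ],
)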
def test_unpack_complex_data_type(data_type, expected):
    meta = Metadata()
    assert _unpack_complex_data_type(data_type) == expected
    assert meta.unpack_complex_data_type(data_type) == expected
Example #3
def test_set_col_types_from_type_category():
    test_dict = {
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [
            {"name": "test_null", "type_category": "null"},
            {"name": "test_integer", "type_category": "integer"},
            {"name": "test_float", "type_category": "float"},
            {"name": "test_string", "type_category": "string"},
            {"name": "test_timestamp", "type_category": "timestamp"},
            {"name": "test_binary", "type_category": "binary"},
            {"name": "test_boolean", "type_category": "boolean"},
            {"name": "test_list", "type_category": "list"},
            {"name": "test_struct", "type_category": "struct"},
        ],
    }
    meta = Metadata.from_dict(test_dict)
    with pytest.warns(UserWarning):
        meta.set_col_types_from_type_category()

    for c in meta.columns:
        default_type_cat = c["name"].replace("test_", "")
        expected_type = meta.default_type_category_lookup.get(default_type_cat)
        assert c["type"] == expected_type

    new_dict = {
        "null": "null",
        "integer": "uint8",
        "float": "decimal128(2,5)",
        "string": "large_string",
        "timestamp": "timestamp(us)",
        "binary": "large_binary",
        "boolean": "bool_",
        "list": "large_list<null>",
        "struct": "map_<null>",
    }

    meta2 = Metadata.from_dict(test_dict)
    meta2.set_col_types_from_type_category(
        lambda x: new_dict.get(x["type_category"])
    )

    for c in meta2.columns:
        default_type_cat = c["name"].replace("test_", "")
        assert c["type"] == new_dict.get(default_type_cat)
Example #4
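# Illustrative valid column inputs (assumptions; the real parametrize list
# is not shown). The test passes if construction does not raise.
import pytest
from typing import Any

from mojap_metadata import Metadata


@pytest.mark.parametrize(
    "col_input",
    [
        [],
        [{"name": "a", "type": "int64"}],
    ],
)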
def test_columns_pass(col_input: Any):
    Metadata(columns=col_input)
Example #5
def test_columns_default():
    metadata = Metadata()
    assert metadata.columns == []
Example #6
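# Hypothetical fixtures sketching one reading of data_override based on
# this assertion alone: fields supplied in the override dict win in the
# merged result. The names and semantics here are assumptions.
import pytest

from mojap_metadata import Metadata


@pytest.mark.parametrize(
    "m1,m2,data,expected_name",
    [
        (
            Metadata(name="first"),
            Metadata(name="second"),
            {"name": "forced"},
            "forced",
        ),
    ],
)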
def test_data_override_merge(m1, m2, data, expected_name):
    assert Metadata.merge(m1, m2, data_override=data).name == expected_name
Example #7
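# Illustrative invalid inputs (assumptions, as is the ValidationError import
# path; mojap-metadata validates columns against a JSON schema on assignment).
import pytest
from typing import Any
from jsonschema.exceptions import ValidationError

from mojap_metadata import Metadata


@pytest.mark.parametrize(
    "col_input",
    [
        "not_a_list",
        [{"type": "int64"}],  # assumed invalid: column missing its "name" key
    ],
)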
def test_columns_validation_error(col_input: Any):
    metadata = Metadata()
    with pytest.raises(ValidationError):
        metadata.columns = col_input
Example #8
def test_cols_merge(m1, m2, expected_cols):
    assert sorted(x.items() for x in Metadata.merge(m1, m2).columns) == sorted(
        x.items() for x in expected_cols
    )
Example #9
def test_params_merge(m1, m2, expected_partitions):
    assert Metadata.merge(m1, m2).partitions == expected_partitions
Example #10
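# Hypothetical fixtures: the same column name carrying two different types,
# which (by assumption from this test) is the kind of conflict that raises
# once mismatch="error" is passed.
import pytest

from mojap_metadata import Metadata


@pytest.mark.parametrize(
    "m1,m2",
    [
        (
            Metadata.from_dict(
                {"name": "t", "columns": [{"name": "c1", "type": "int64"}]}
            ),
            Metadata.from_dict(
                {"name": "t", "columns": [{"name": "c1", "type": "string"}]}
            ),
        ),
    ],
)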
def test_merge_error_raised(m1, m2):
    with pytest.raises(ValueError):
        Metadata.merge(m1, m2, mismatch="error")
Example #11
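# Parametrize cases recovered from the source (the original list is
# truncated there; its earlier entries are unknown, so only the surviving
# tail is shown).
import pytest

from mojap_metadata import Metadata


@pytest.mark.parametrize(
    "fake_input",
    [
        0.0,
        [],
        (),
    ],
)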
def test_inferred_input_fails(fake_input):
    with pytest.raises(TypeError):
        Metadata.from_infer(fake_input)
Example #12
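# A hedged sketch of this test's parameters. It assumes from_infer
# dispatches on input type to a named constructor, with patch_out naming
# the method to stub out; both values below are assumptions.
import pytest

from mojap_metadata import Metadata


@pytest.mark.parametrize(
    "patch_out,fake_input",
    [
        ("from_dict", {"name": "test", "columns": []}),
    ],
)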
def test_inferred_input_passes(monkeypatch, patch_out, fake_input):
    monkeypatch.setattr(Metadata, patch_out, lambda x: True)
    assert Metadata.from_infer(fake_input)
Example #13
merge_meta_test = Metadata.from_dict({
    "name": "merge_test",
    "columns": [
        {"name": "c1", "type": "int64"},
        {"name": "c2", "type": "string"},
    ],
    "partitions": ["c1"],
})

merge_meta_diff_col_type = Metadata.from_dict({
    "name": "merge_test",
    "columns": [
        {"name": "c3", "type": "string"},
        {
Example #14
# Imports assumed from the surrounding project (they are not shown in the
# snippet): pyarrow readers plus the mojap-metadata and arrow-pd-parser
# helpers this function calls.
import pyarrow as pa
from pyarrow import csv, fs, json
from pyarrow import parquet as pq

from arrow_pd_parser.pa_pd import arrow_to_pandas
from arrow_pd_parser.parse import pa_read_csv_to_pandas, pa_read_json_to_pandas
from mojap_metadata import Metadata
from mojap_metadata.converters.arrow_converter import ArrowConverter


def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """Read the data at the given filepath and return it as a dataframe."""

    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the arrow readers read timestamp/date
    # columns in as strings. Validators will still treat these as dates but
    # will run validation against strings for cols expecting values to match
    # a timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)

    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:

            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:

            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df
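# A hedged usage sketch: the filepath, params, and column types are made up,
# but the table_params keys shown are the ones this function actually reads.
if __name__ == "__main__":
    example_metadata = {
        "name": "example_table",
        "file_format": "csv",
        "columns": [
            {"name": "c1", "type": "int64"},
            {"name": "c2", "type": "timestamp(ms)"},
        ],
    }
    example_params = {"expect-header": True, "headers-ignore-case": True}
    df = _parse_data_to_pandas("data/example.csv", example_params, example_metadata)
    print(df.head())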