def test_columnar_convert_type_column():
    """Check conversion of a frame whose types are read from ``type_column``.

    The assertions expect the rows regrouped in type order ("a", "b", "c"),
    only the renamed ``D`` column in the shared output, start offsets per type,
    and ``None`` features for every type. Two invalid constructor
    configurations must raise ``ValueError``.
    """
    renames = {"type_column": "TC", "data": "D"}
    # NOTE(review): positional order is presumably (name, default_type,
    # type_column, column_defaults, selected_columns, allow_features,
    # transform_columns) -- confirm against ColumnarConverter.
    converter = ColumnarConverter(
        "some_name", "foo", "type_column", {}, renames, False, {}
    )

    raw = pd.DataFrame(
        {"type_column": ["c", "a", "a", "c", "b"], "data": [1, 2, 3, 4, 5]},
        index=[1, 10, 100, 1000, 10000],
    )
    shared, type_starts, features = converter.convert(raw)

    assert set(shared.columns) == {"D"}
    assert list(shared.index) == [10, 100, 10000, 1, 1000]
    assert list(shared["D"]) == [2, 3, 5, 1, 4]
    assert type_starts == [("a", 0), ("b", 2), ("c", 3)]
    assert features == {"a": None, "b": None, "c": None}

    # invalid configurations
    features_message = r"allow_features: expected no features .* \('type_column'\)"
    with pytest.raises(ValueError, match=features_message):
        ColumnarConverter(
            "some_name", "foo", "type_column", {}, {"type_column": "TC"}, True, {}
        )

    missing_message = r"selected_columns: expected type column \('type_column'\) .* found only 'TC', 'data'"
    with pytest.raises(ValueError, match=missing_message):
        ColumnarConverter(
            "some_name",
            "foo",
            "type_column",
            {},
            {"TC": "type_column", "data": "D"},
            False,
            {},
        )
def test_columnar_convert_type_column():
    """Check conversion of a frame whose types are read from ``type_column``.

    The assertions expect the IDs regrouped in type order ("a", "b", "c"),
    only the renamed ``D`` column in the output columns, and per-type info of
    sizes 2, 1 and 2. A ``selected_columns`` mapping that does not contain the
    type column as a key must raise ``ValueError``.
    """
    valid_selection = {"type_column": "TC", "data": "D"}
    converter = ColumnarConverter(
        name="some_name",
        default_type="foo",
        type_column="type_column",
        column_defaults={},
        selected_columns=valid_selection,
        transform_columns={},
    )

    raw = pd.DataFrame(
        {"type_column": ["c", "a", "a", "c", "b"], "data": [1, 2, 3, 4, 5]},
        index=[1, 10, 100, 1000, 10000],
    )
    ids, columns, type_info = converter.convert(raw)

    assert columns.keys() == {"D"}
    np.testing.assert_array_equal(ids, [10, 100, 10000, 1, 1000])
    np.testing.assert_array_equal(columns["D"], [2, 3, 5, 1, 4])
    expected_types = [
        ("a", _empty_array(2)),
        ("b", _empty_array(1)),
        ("c", _empty_array(2)),
    ]
    _check_type_info(type_info, expected_types)

    # invalid configuration
    swapped_selection = {"TC": "type_column", "data": "D"}
    expected_message = r"selected_columns: expected type column \('type_column'\) .* found only 'TC', 'data'"
    with pytest.raises(ValueError, match=expected_message):
        ColumnarConverter(
            name="some_name",
            default_type="foo",
            type_column="type_column",
            column_defaults={},
            selected_columns=swapped_selection,
            transform_columns={},
        )
def test_columnar_convert_features():
    """Check that the unselected columns ``a`` and ``b`` come back as features.

    Only ``x`` is selected; the test expects it in the shared output for the
    default type "foo" and the remaining columns as a 2x2 feature array.
    """
    converter = ColumnarConverter("some_name", "foo", {}, {"x": "x"}, True)
    frame = _EMPTY_DF.assign(a=[1, 2], b=[100, 200], x=123)

    shared, features = converter.convert(frame)

    selected_values = shared["foo"]["x"]
    assert all(selected_values == 123)
    assert np.array_equal(features["foo"], [[1, 100], [2, 200]])
def test_columnar_convert_rowframe_ndarray_invalid():
    """Check that selecting columns from array-like input raises ``ValueError``.

    Both an ``IndexedArray`` and its raw ``values`` are converted with a
    converter that selects column ``bar``; the error message is expected to
    name the offending input type.
    """
    converter = ColumnarConverter(
        "some_name",
        "foo",
        None,
        column_defaults={},
        selected_columns={"bar": "baz"},
        transform_columns={},
    )
    frame = IndexedArray(np.random.rand(3, 4, 5))

    indexed_array_message = r"some_name\['foo'\]: expected a Pandas DataFrame when selecting columns 'bar', found IndexedArray"
    with pytest.raises(ValueError, match=indexed_array_message):
        converter.convert(frame)

    ndarray_message = r"some_name\['foo'\]: expected a Pandas DataFrame when selecting columns 'bar', found ndarray"
    with pytest.raises(ValueError, match=ndarray_message):
        converter.convert(frame.values)
def test_columnar_convert_rowframe():
    """Check conversion of ``IndexedArray`` inputs, alone and in a dict.

    The test expects the IDs to come from the frame index, no columns, and the
    type info to hold the very same ``values`` objects (no copy).
    """
    converter = ColumnarConverter(
        "some_name",
        "foo",
        None,
        column_defaults={},
        selected_columns={},
        transform_columns={},
    )
    frame1 = IndexedArray(np.random.rand(3, 4, 5), index=[1111, -222, 33])
    frame2 = IndexedArray(np.random.rand(6, 7))

    # single frame, default type
    ids, columns, type_info = converter.convert(frame1)
    # NOTE(review): a plain ``==`` only yields a single bool if ``ids`` is a
    # list here (presumably the index passed through unchanged) -- confirm.
    assert ids == [1111, -222, 33]
    assert columns == {}
    _check_type_info(type_info, [("foo", frame1.values)])
    # check identity, to validate non-copying
    assert type_info[0][1] is frame1.values

    # several frames, explicit types
    by_type = {"a": frame1, "b": frame2}
    ids, columns, type_info = converter.convert(by_type)
    np.testing.assert_array_equal(ids, [*frame1.index, *frame2.index])
    assert columns == {}
    _check_type_info(type_info, [("a", frame1.values), ("b", frame2.values)])
    assert type_info[0][1] is frame1.values
    assert type_info[1][1] is frame2.values
def test_columnar_convert_column_default():
    """Check that the ``before`` default of 123 appears for every input type."""
    converter = ColumnarConverter("some_name", "foo", {"before": 123}, {}, False)
    inputs = {"x": _EMPTY_DF, "y": _EMPTY_DF}

    shared, _features = converter.convert(inputs)

    assert "x" in shared
    assert "y" in shared
    for converted in shared.values():
        assert all(converted["before"] == 123)
def test_columnar_convert_selected_columns_missing():
    """Check the expected-columns ``ValueError`` raised when converting ``_EMPTY_DF``.

    The converter selects ``before`` and ``same``; the message must name the
    input key ``x`` and both expected columns.
    """
    wanted = {"before": "after", "same": "same"}
    converter = ColumnarConverter("some_name", "foo", {}, wanted, False)

    expected_message = r"some_name\['x'\]: expected 'before', 'same' columns, found:"
    with pytest.raises(ValueError, match=expected_message):
        converter.convert({"x": _EMPTY_DF})
def test_columnar_convert_invalid_input():
    """Check that non-dict and non-DataFrame inputs raise ``TypeError``."""
    converter = ColumnarConverter("some_name", "foo", None, {}, {}, False, {})

    # (input, expected message pattern), checked in this order
    bad_inputs = [
        (1, "some_name: expected dict, found int"),
        ({"x": 1}, r"some_name\['x'\]: expected pandas DataFrame, found int"),
    ]
    for bad_input, expected_message in bad_inputs:
        with pytest.raises(TypeError, match=expected_message):
            converter.convert(bad_input)
def test_columnar_convert_column_default():
    """Check that the ``before`` default of 123 fills the shared output.

    Two types are converted from ``_EMPTY_DF``; the test expects start offsets
    0 and 2 and every ``before`` value to equal the default.
    """
    converter = ColumnarConverter(
        "some_name", "foo", None, {"before": 123}, {}, False, {}
    )
    inputs = {"x": _EMPTY_DF, "y": _EMPTY_DF}

    shared, type_starts, _features = converter.convert(inputs)

    assert type_starts == [("x", 0), ("y", 2)]
    assert all(shared["before"] == 123)
def test_columnar_convert_column_default_selected_columns():
    """Check that a defaulted column is renamed by ``selected_columns``.

    The default is keyed by ``before`` and the selection renames it to
    ``after``; only ``after`` may appear in each converted frame.
    """
    # the defaulting happens before the renaming
    defaults = {"before": 123}
    renames = {"before": "after"}
    converter = ColumnarConverter("x", "foo", defaults, renames, False)

    shared, _features = converter.convert({"x": _EMPTY_DF, "y": _EMPTY_DF})

    assert "x" in shared
    assert "y" in shared
    for converted in shared.values():
        assert "before" not in converted
        assert all(converted["after"] == 123)
def test_columnar_convert_type_default():
    """Check that a bare frame is converted under the default type "foo"."""
    settings = dict(
        name="some_name",
        default_type="foo",
        type_column=None,
        column_defaults={},
        selected_columns={},
        transform_columns={},
    )
    converter = ColumnarConverter(**settings)

    ids, columns, type_info = converter.convert(_EMPTY_DF)

    np.testing.assert_array_equal(ids, [1, 2])
    assert columns == {}
    _check_type_info(type_info, [("foo", _empty_array(2))])
def test_columnar_convert_column_default_selected_columns():
    """Check that a defaulted column is renamed by the selection mapping.

    The default is keyed by ``before`` and renamed to ``after``; the shared
    output must contain only ``after``, with start offsets 0 and 2.
    """
    # the defaulting happens before the renaming
    defaults = {"before": 123}
    renames = {"before": "after"}
    converter = ColumnarConverter("x", "foo", None, defaults, renames, False, {})

    inputs = {"x": _EMPTY_DF, "y": _EMPTY_DF}
    shared, type_starts, _features = converter.convert(inputs)

    assert type_starts == [("x", 0), ("y", 2)]
    assert "before" not in shared
    assert all(shared["after"] == 123)
def test_columnar_convert_selected_columns():
    """Check that selected columns are renamed (or kept) in the shared output.

    ``before`` is renamed to ``after`` and ``same`` keeps its name; the same
    frame is converted under two types with start offsets 0 and 2.
    """
    frame = _EMPTY_DF.assign(before="abc", same=10)
    renames = {"before": "after", "same": "same"}
    converter = ColumnarConverter("some_name", "foo", None, {}, renames, False, {})

    shared, type_starts, _features = converter.convert({"x": frame, "y": frame})

    assert type_starts == [("x", 0), ("y", 2)]
    assert "before" not in shared
    assert all(shared["after"] == "abc")
    assert all(shared["same"] == 10)
def test_columnar_convert_features():
    """Check that the unselected columns ``a`` and ``b`` end up in the type info.

    Only ``x`` is selected; it is expected in the output columns while the
    other two columns form the 2x2 array attached to the default type "foo".
    """
    settings = dict(
        name="some_name",
        default_type="foo",
        type_column=None,
        column_defaults={},
        selected_columns={"x": "x"},
        transform_columns={},
    )
    converter = ColumnarConverter(**settings)
    frame = _EMPTY_DF.assign(a=[1, 2], b=[100, 200], x=123)

    _ids, columns, type_info = converter.convert(frame)

    _check_type_info(type_info, [("foo", [[1, 100], [2, 200]])])
    np.testing.assert_array_equal(columns["x"], 123)
def test_columnar_convert_selected_columns():
    """Check that selected columns are renamed (or kept) for every type.

    ``before`` is renamed to ``after`` and ``same`` keeps its name in each
    converted frame.
    """
    frame = _EMPTY_DF.assign(before="abc", same=10)
    renames = {"before": "after", "same": "same"}
    converter = ColumnarConverter("some_name", "foo", {}, renames, False)

    shared, _features = converter.convert({"x": frame, "y": frame})

    assert "x" in shared
    assert "y" in shared
    for converted in shared.values():
        assert "before" not in converted
        assert all(converted["after"] == "abc")
        assert all(converted["same"] == 10)
def test_columnar_convert_column_default():
    """Check that the ``before`` default of 123 fills the output column.

    Two types are converted from ``_EMPTY_DF``, each expected to contribute
    type info of size 2.
    """
    settings = dict(
        name="some_name",
        default_type="foo",
        type_column=None,
        column_defaults={"before": 123},
        selected_columns={"before": "before"},
        transform_columns={},
    )
    converter = ColumnarConverter(**settings)

    inputs = {"x": _EMPTY_DF, "y": _EMPTY_DF}
    _ids, columns, type_info = converter.convert(inputs)

    expected_types = [("x", _empty_array(2)), ("y", _empty_array(2))]
    _check_type_info(type_info, expected_types)
    np.testing.assert_array_equal(columns["before"], 123)
def test_columnar_convert_selected_columns_missing():
    """Check the expected-columns ``ValueError`` raised when converting ``_EMPTY_DF``.

    The converter selects ``before`` and ``same``; the message must name the
    input key ``x`` and both expected columns.
    """
    wanted = {"before": "after", "same": "same"}
    converter = ColumnarConverter(
        name="some_name",
        default_type="foo",
        type_column=None,
        column_defaults={},
        selected_columns=wanted,
        transform_columns={},
    )

    expected_message = r"some_name\['x'\]: expected 'before', 'same' columns, found:"
    with pytest.raises(ValueError, match=expected_message):
        converter.convert({"x": _EMPTY_DF})
def test_columnar_convert_invalid_input():
    """Check that non-dict and unsupported per-type inputs raise ``TypeError``."""
    converter = ColumnarConverter(
        name="some_name",
        default_type="foo",
        type_column=None,
        column_defaults={},
        selected_columns={},
        transform_columns={},
    )

    # (input, expected message pattern), checked in this order
    bad_inputs = [
        (1, "some_name: expected dict, found int"),
        (
            {"x": 1},
            r"some_name\['x'\]: expected IndexedArray or pandas DataFrame, found int",
        ),
    ]
    for bad_input, expected_message in bad_inputs:
        with pytest.raises(TypeError, match=expected_message):
            converter.convert(bad_input)
def test_columnar_convert_transform_columns():
    """Check that the ``w`` transform is applied and columns are renamed.

    Three single-row frames carry ``w`` values 1, 2 and 3 of different NumPy
    scalar types; after the ``x + 1`` transform the renamed ``ww`` column is
    expected to hold 2, 3 and 4, while ``ss`` and ``tt`` keep 0 and 1.
    """
    weights = {"x": np.complex128(1), "y": np.uint16(2), "z": np.float32(3.0)}
    frames = {}
    for position, (type_name, weight) in enumerate(weights.items()):
        frames[type_name] = pd.DataFrame(
            {"s": [0], "t": [1], "w": [weight]}, index=[position]
        )

    converter = ColumnarConverter(
        name="some_name",
        default_type="foo",
        type_column=None,
        column_defaults={},
        selected_columns={"s": "ss", "t": "tt", "w": "ww"},
        transform_columns={"w": lambda x: x + 1},
    )

    _ids, columns, type_info = converter.convert(frames)

    transformed = columns["ww"]
    assert transformed[0] == 2
    assert transformed[1] == 3
    assert transformed[2] == 4

    expected_types = [
        ("x", _empty_array(1)),
        ("y", _empty_array(1)),
        ("z", _empty_array(1)),
    ]
    _check_type_info(type_info, expected_types)

    np.testing.assert_array_equal(columns["ss"], 0)
    np.testing.assert_array_equal(columns["tt"], 1)
def test_columnar_convert_column_default_selected_columns():
    """Check that a defaulted column is renamed by ``selected_columns``.

    The default is keyed by ``before`` and renamed to ``after``; only
    ``after`` may appear in the output columns.
    """
    # the defaulting happens before the renaming
    settings = dict(
        name="x",
        default_type="foo",
        type_column=None,
        column_defaults={"before": 123},
        selected_columns={"before": "after"},
        transform_columns={},
    )
    converter = ColumnarConverter(**settings)

    inputs = {"x": _EMPTY_DF, "y": _EMPTY_DF}
    _ids, columns, type_info = converter.convert(inputs)

    expected_types = [("x", _empty_array(2)), ("y", _empty_array(2))]
    _check_type_info(type_info, expected_types)
    assert "before" not in columns
    np.testing.assert_array_equal(columns["after"], 123)
def test_columnar_convert_transform_columns():
    """Check that the ``w`` transform is applied and columns are renamed.

    Three single-row frames carry ``w`` values 1, 2 and 3 of different NumPy
    scalar types. After the ``x + 1`` transform, the rows of each type in the
    converted frame are expected to hold 2, 3 and 4 in the renamed ``ww``
    column, while ``ss`` and ``tt`` keep 0 and 1 everywhere.
    """
    columns = {"x": np.complex128(1), "y": np.uint16(2), "z": np.float32(3.0)}
    dfs = {
        name: pd.DataFrame({"s": [0], "t": [1], "w": [w]}, index=[i])
        for i, (name, w) in enumerate(columns.items())
    }

    converter = ColumnarConverter(
        "some_name",
        float,
        None,
        column_defaults={},
        selected_columns={"s": "ss", "t": "tt", "w": "ww"},
        transform_columns={"w": lambda x: x + 1},
        allow_features=False,
    )

    converted, type_starts, _ = converter.convert(dfs)

    # Each entry of type_starts is (type name, start offset); the offsets are
    # row positions, so slice with .iloc throughout instead of mixing it with
    # bare ``converted[a:b]`` slicing.
    first, second, third = (type_starts[i][1] for i in range(3))
    assert (converted.iloc[first:second]["ww"] == 2).all()
    assert (converted.iloc[second:third]["ww"] == 3).all()
    assert (converted.iloc[third:]["ww"] == 4).all()

    assert (converted["ss"] == 0).all()
    assert (converted["tt"] == 1).all()
def test_columnar_convert_selected_columns():
    """Check that selected columns are renamed (or kept) in the output.

    The same two-row frame is converted under types ``x`` and ``y``; the IDs
    are expected to be the frame index repeated, ``before`` renamed to
    ``after`` and ``same`` left under its own name.
    """
    frame = _EMPTY_DF.assign(before="abc", same=10)
    renames = {"before": "after", "same": "same"}
    converter = ColumnarConverter(
        name="some_name",
        default_type="foo",
        type_column=None,
        column_defaults={},
        selected_columns=renames,
        transform_columns={},
    )

    ids, columns, type_info = converter.convert({"x": frame, "y": frame})

    np.testing.assert_array_equal(ids, [1, 2, 1, 2])
    expected_types = [("x", _empty_array(2)), ("y", _empty_array(2))]
    _check_type_info(type_info, expected_types)
    assert "before" not in columns
    np.testing.assert_array_equal(columns["after"], "abc")
    np.testing.assert_array_equal(columns["same"], 10)
def test_columnar_convert_ndarray():
    """Check conversion of raw NumPy arrays, alone and in a dict.

    The test expects range-based IDs, no columns, and type info holding the
    very same array objects. A one-dimensional array must raise a
    ``ValueError`` that names the type it was converted under.
    """
    converter = ColumnarConverter(
        "some_name",
        "foo",
        None,
        column_defaults={},
        selected_columns={},
        transform_columns={},
    )
    arr1 = np.random.rand(3, 4, 5)
    arr2 = np.random.rand(6, 7)

    # single array, default type
    ids, columns, type_info = converter.convert(arr1)
    # NOTE(review): a plain ``==`` only yields a single bool if ``ids`` is
    # itself a range here -- confirm against the converter.
    assert ids == range(3)
    assert columns == {}
    _check_type_info(type_info, [("foo", arr1)])
    assert type_info[0][1] is arr1

    # multiple arrays, explicit types; the IDs are wrong (duplicated) here, but that's detected
    # elsewhere
    by_type = {"a": arr1, "b": arr2}
    ids, columns, type_info = converter.convert(by_type)
    np.testing.assert_array_equal(ids, [*range(3), *range(6)])
    assert columns == {}
    _check_type_info(type_info, [("a", arr1), ("b", arr2)])
    assert type_info[0][1] is arr1
    assert type_info[1][1] is arr2

    # check it says which type
    expected_message = r"some_name\['foo'\]: could not convert NumPy array"
    with pytest.raises(ValueError, match=expected_message):
        converter.convert(np.zeros(123))
def test_columnar_convert_type_default():
    """Check that a bare frame is filed under the default type "foo"."""
    converter = ColumnarConverter("some_name", "foo", {}, {}, False)

    shared, features = converter.convert(_EMPTY_DF)

    # both outputs must be keyed by the default type
    for output in (shared, features):
        assert "foo" in output
def test_columnar_convert_type_default():
    """Check that a bare frame is filed under the default type "foo".

    The default type is expected to start at offset 0 and to have an entry in
    the features mapping.
    """
    converter = ColumnarConverter("some_name", "foo", None, {}, {}, False, {})

    _shared, type_starts, features = converter.convert(_EMPTY_DF)

    assert type_starts == [("foo", 0)]
    assert "foo" in features
def test_columnar_convert_disallow_features():
    """Check that an unselected column ``a`` is rejected by this converter.

    Raises (expected): ``ValueError`` whose message reports the unexpected
    feature column ``a``.
    """
    converter = ColumnarConverter("some_name", "foo", None, {}, {}, False, {})
    df = _EMPTY_DF.assign(a=1)

    with pytest.raises(ValueError, match="expected zero feature columns, found 'a'"):
        # the call is expected to raise, so its result is deliberately not bound
        converter.convert(df)