Example #1
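Verifies that EvaluationDataset auto-generates feature names, padding the 1-based index to the width of the feature count (feature_1 for 9 features, feature_01 for 10-99, feature_001 for 100), and that it rejects data whose number of rows differs from the number of targets.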
def test_dataset_autogen_feature_names():
    labels = [0]
    eval_dataset2 = EvaluationDataset(data=[list(range(9))], targets=labels)
    assert eval_dataset2.feature_names == [
        f"feature_{i + 1}" for i in range(9)
    ]

    eval_dataset2 = EvaluationDataset(data=[list(range(10))], targets=labels)
    assert eval_dataset2.feature_names == [
        f"feature_{i + 1:02d}" for i in range(10)
    ]

    eval_dataset2 = EvaluationDataset(data=[list(range(99))], targets=labels)
    assert eval_dataset2.feature_names == [
        f"feature_{i + 1:02d}" for i in range(99)
    ]

    eval_dataset2 = EvaluationDataset(data=[list(range(100))], targets=labels)
    assert eval_dataset2.feature_names == [
        f"feature_{i + 1:03d}" for i in range(100)
    ]

    with pytest.raises(
        ValueError,
        match="features example rows must be the same length with labels array",
    ):
        EvaluationDataset(data=[[1, 2], [3, 4]], targets=[1, 2, 3])
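
For reference, a minimal sketch of the padding rule these assertions imply; autogen_feature_names is a hypothetical helper, not part of the EvaluationDataset API:

def autogen_feature_names(n_features):
    # Pad the 1-based index to the width of the largest index, so that
    # 9 features give feature_1..feature_9, 10 give feature_01..feature_10,
    # and 100 give feature_001..feature_100, matching the test above.
    width = len(str(n_features))
    return [f"feature_{i + 1:0{width}d}" for i in range(n_features)]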
Example #2
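Helper that builds a named EvaluationDataset from a 30% sample of the diabetes Spark DataFrame, using the "label" column as targets and stashing the constructor arguments on the instance.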
def diabetes_spark_dataset():
    spark_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1)
    constructor_args = {
        "data": spark_df,
        "targets": "label",
        "name": "diabetes_spark_dataset"
    }
    ds = EvaluationDataset(**constructor_args)
    ds._constructor_args = constructor_args
    return ds
Example #3
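Helper that builds a named EvaluationDataset from every third row of the breast cancer data, stashing the constructor arguments on the instance.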
def breast_cancer_dataset():
    X, y = get_breast_cancer_dataset()
    eval_X, eval_y = X[0::3], y[0::3]
    constructor_args = {
        "data": eval_X,
        "targets": eval_y,
        "name": "breast_cancer_dataset"
    }
    ds = EvaluationDataset(**constructor_args)
    ds._constructor_args = constructor_args
    return ds
Example #4
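Helper that builds a named EvaluationDataset from every third row of the diabetes data, stashing the constructor arguments on the instance.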
def diabetes_dataset():
    X, y = get_diabetes_dataset()
    eval_X, eval_y = X[0::3], y[0::3]
    constructor_args = {
        "data": eval_X,
        "targets": eval_y,
        "name": "diabetes_dataset"
    }
    ds = EvaluationDataset(**constructor_args)
    ds._constructor_args = constructor_args
    return ds
Example #5
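Checks that a dataset constructed with a name and path exposes its hash, name, and path via _metadata.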
def test_dataset_metadata():
    X, y = get_iris()
    d1 = EvaluationDataset(data=X, targets=y, name="a1", path="/path/to/a1")
    assert d1._metadata == {
        "hash": "6bdf4e119bf1a37e7907dfd9f0e68733",
        "name": "a1",
        "path": "/path/to/a1",
    }
Example #6
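Verifies that a Spark DataFrame source is truncated to SPARK_DATAFRAME_LIMIT rows (patched to 5 here) and split into feature columns and the targets column.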
def test_dataset_from_spark_df(spark_session):
    spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10,
                                             ["f1", "f2", "y"])
    with mock.patch.object(EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5):
        dataset = EvaluationDataset(spark_df, targets="y")
        assert list(dataset.features_data.columns) == ["f1", "f2"]
        assert list(dataset.features_data["f1"]) == [1.0] * 5
        assert list(dataset.features_data["f2"]) == [2.0] * 5
        assert list(dataset.labels_data) == [3.0] * 5
Example #7
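Verifies that list and NumPy array inputs yield the same features, labels, and feature names (auto-generated or user-supplied), and that ragged rows are rejected.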
def test_dataset_with_array_data():
    features = [[1, 2], [3, 4]]
    labels = [0, 1]

    for input_data in [features, np.array(features)]:
        eval_dataset1 = EvaluationDataset(data=input_data, targets=labels)
        assert np.array_equal(eval_dataset1.features_data, features)
        assert np.array_equal(eval_dataset1.labels_data, labels)
        assert list(eval_dataset1.feature_names) == ["feature_1", "feature_2"]

    assert EvaluationDataset(
        data=input_data, targets=labels, feature_names=["a", "b"]
    ).feature_names == ["a", "b"]

    with pytest.raises(ValueError, match="all element must has the same length"):
        EvaluationDataset(data=[[1, 2], [3, 4, 5]], targets=labels)
Example #8
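Verifies that a pandas DataFrame is split into feature columns and the targets column, and that an explicit feature_names list selects and orders the feature columns.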
def test_dataset_with_pandas_dataframe():
    data = pd.DataFrame({
        "f1": [1, 2],
        "f2": [3, 4],
        "f3": [5, 6],
        "label": [0, 1]
    })
    eval_dataset = EvaluationDataset(data=data, targets="label")

    assert list(eval_dataset.features_data.columns) == ["f1", "f2", "f3"]
    assert np.array_equal(eval_dataset.features_data.f1.to_numpy(), [1, 2])
    assert np.array_equal(eval_dataset.features_data.f2.to_numpy(), [3, 4])
    assert np.array_equal(eval_dataset.features_data.f3.to_numpy(), [5, 6])
    assert np.array_equal(eval_dataset.labels_data, [0, 1])

    eval_dataset2 = EvaluationDataset(data=data,
                                      targets="label",
                                      feature_names=["f3", "f2"])
    assert list(eval_dataset2.features_data.columns) == ["f3", "f2"]
    assert np.array_equal(eval_dataset2.features_data.f2.to_numpy(), [3, 4])
    assert np.array_equal(eval_dataset2.features_data.f3.to_numpy(), [5, 6])
Example #9
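Helper that builds a named EvaluationDataset from every third row of the iris data, packed into a pandas DataFrame with the targets in column "y".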
def iris_pandas_df_dataset():
    X, y = get_iris()
    eval_X, eval_y = X[0::3], y[0::3]
    data = pd.DataFrame({
        "f1": eval_X[:, 0],
        "f2": eval_X[:, 1],
        "f3": eval_X[:, 2],
        "f4": eval_X[:, 3],
        "y": eval_y,
    })
    return EvaluationDataset(data=data,
                             targets="y",
                             name="iris_pandas_df_dataset")
Example #10
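Checks that an explicitly passed name is used as the dataset name, and that the dataset hash is used otherwise.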
def test_dataset_name():
    X, y = get_iris()
    d1 = EvaluationDataset(data=X, targets=y, name="a1")
    assert d1.name == "a1"
    d2 = EvaluationDataset(data=X, targets=y)
    assert d2.name == d2.hash