def test_dataset_autogen_feature_names():
    """Auto-generated feature names are zero-padded to the width of the column count.

    Also verifies that a row-count / target-length mismatch raises ValueError.
    """
    labels = [0]
    # (number of columns, format spec applied to the 1-based feature index)
    for n_cols, pad_spec in [(9, ""), (10, "02d"), (99, "02d"), (100, "03d")]:
        dataset = EvaluationDataset(data=[list(range(n_cols))], targets=labels)
        expected_names = [f"feature_{idx + 1:{pad_spec}}" for idx in range(n_cols)]
        assert dataset.feature_names == expected_names

    # Two feature rows against three targets must be rejected.
    with pytest.raises(
        ValueError,
        match="features example rows must be the same length with labels array",
    ):
        EvaluationDataset(data=[[1, 2], [3, 4]], targets=[1, 2, 3])
def diabetes_spark_dataset():
    """Return an EvaluationDataset built from a 30% sample of the diabetes Spark data."""
    sampled_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1)
    kwargs = {
        "data": sampled_df,
        "targets": "label",
        "name": "diabetes_spark_dataset",
    }
    dataset = EvaluationDataset(**kwargs)
    # Stash the constructor args so tests can rebuild an identical dataset later.
    dataset._constructor_args = kwargs
    return dataset
def breast_cancer_dataset():
    """Return an EvaluationDataset over every third row of the breast-cancer data."""
    features, targets = get_breast_cancer_dataset()
    sub_X, sub_y = features[0::3], targets[0::3]
    kwargs = {
        "data": sub_X,
        "targets": sub_y,
        "name": "breast_cancer_dataset",
    }
    dataset = EvaluationDataset(**kwargs)
    # Stash the constructor args so tests can rebuild an identical dataset later.
    dataset._constructor_args = kwargs
    return dataset
def diabetes_dataset():
    """Return an EvaluationDataset over every third row of the diabetes data."""
    features, targets = get_diabetes_dataset()
    sub_X, sub_y = features[0::3], targets[0::3]
    kwargs = {
        "data": sub_X,
        "targets": sub_y,
        "name": "diabetes_dataset",
    }
    dataset = EvaluationDataset(**kwargs)
    # Stash the constructor args so tests can rebuild an identical dataset later.
    dataset._constructor_args = kwargs
    return dataset
def test_dataset_metadata():
    """_metadata carries the content hash plus the user-supplied name and path."""
    features, targets = get_iris()
    dataset = EvaluationDataset(
        data=features, targets=targets, name="a1", path="/path/to/a1"
    )
    expected_metadata = {
        "hash": "6bdf4e119bf1a37e7907dfd9f0e68733",
        "name": "a1",
        "path": "/path/to/a1",
    }
    assert dataset._metadata == expected_metadata
def test_dataset_from_spark_df(spark_session):
    """A Spark DataFrame input is truncated to SPARK_DATAFRAME_LIMIT rows."""
    rows = [(1.0, 2.0, 3.0)] * 10
    spark_df = spark_session.createDataFrame(rows, ["f1", "f2", "y"])
    # Shrink the limit so truncation is observable with a 10-row frame.
    with mock.patch.object(EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5):
        dataset = EvaluationDataset(spark_df, targets="y")
        assert list(dataset.features_data.columns) == ["f1", "f2"]
        assert list(dataset.features_data["f1"]) == [1.0] * 5
        assert list(dataset.features_data["f2"]) == [2.0] * 5
        assert list(dataset.labels_data) == [3.0] * 5
def test_dataset_with_array_data():
    """List-of-lists and ndarray inputs behave identically; ragged rows raise."""
    rows = [[1, 2], [3, 4]]
    labels = [0, 1]
    for data in (rows, np.array(rows)):
        dataset = EvaluationDataset(data=data, targets=labels)
        assert np.array_equal(dataset.features_data, rows)
        assert np.array_equal(dataset.labels_data, labels)
        assert list(dataset.feature_names) == ["feature_1", "feature_2"]
        # Explicit feature_names override the auto-generated ones.
        named = EvaluationDataset(data=data, targets=labels, feature_names=["a", "b"])
        assert named.feature_names == ["a", "b"]

    with pytest.raises(ValueError, match="all element must has the same length"):
        EvaluationDataset(data=[[1, 2], [3, 4, 5]], targets=labels)
def test_dataset_with_pandas_dataframe():
    """Target column is split out; feature_names selects and orders feature columns."""
    frame = pd.DataFrame(
        {
            "f1": [1, 2],
            "f2": [3, 4],
            "f3": [5, 6],
            "label": [0, 1],
        }
    )

    dataset = EvaluationDataset(data=frame, targets="label")
    assert list(dataset.features_data.columns) == ["f1", "f2", "f3"]
    assert np.array_equal(dataset.features_data.f1.to_numpy(), [1, 2])
    assert np.array_equal(dataset.features_data.f2.to_numpy(), [3, 4])
    assert np.array_equal(dataset.features_data.f3.to_numpy(), [5, 6])
    assert np.array_equal(dataset.labels_data, [0, 1])

    # Explicit feature_names subsets the columns and preserves the given order.
    subset = EvaluationDataset(data=frame, targets="label", feature_names=["f3", "f2"])
    assert list(subset.features_data.columns) == ["f3", "f2"]
    assert np.array_equal(subset.features_data.f2.to_numpy(), [3, 4])
    assert np.array_equal(subset.features_data.f3.to_numpy(), [5, 6])
def iris_pandas_df_dataset():
    """Return an EvaluationDataset over every third iris row as a pandas DataFrame."""
    features, targets = get_iris()
    sub_X, sub_y = features[0::3], targets[0::3]
    frame = pd.DataFrame(
        {
            "f1": sub_X[:, 0],
            "f2": sub_X[:, 1],
            "f3": sub_X[:, 2],
            "f4": sub_X[:, 3],
            "y": sub_y,
        }
    )
    return EvaluationDataset(data=frame, targets="y", name="iris_pandas_df_dataset")
def test_dataset_name():
    """An explicit name is used verbatim; otherwise the name falls back to the hash."""
    features, targets = get_iris()

    named = EvaluationDataset(data=features, targets=targets, name="a1")
    assert named.name == "a1"

    unnamed = EvaluationDataset(data=features, targets=targets)
    assert unnamed.name == unnamed.hash