Exemplo n.º 1
0
    def test_load_train_and_test_columns_dont_intersect(temp_file_pair):
        """Expect a ``ValueError`` when train and test files have disjoint columns.

        The two files are written with ``column_prefix="A"`` and
        ``column_prefix="B"`` respectively, and loading them together must
        raise with a message about non-intersecting columns.
        """
        train_file, test_file = temp_file_pair

        # NOTE(review): the positional flags are defined by _make_and_write_data,
        # which is not visible here; they are passed through unchanged.
        _make_and_write_data(train_file, 100, 19, True, True, 0, column_prefix="A")
        _make_and_write_data(test_file, 20, 11, True, True, 0, column_prefix="B")

        expected_message = "columns of training and test data do not intersect"
        loader_options = {
            "path_testing": test_file.name,
            "survival": True,
            "standardize_numeric": False,
            "to_numeric": False,
        }

        with pytest.raises(ValueError, match=expected_message):
            sdata.load_arff_files_standardized(
                train_file.name, ["event", "time"], 1, **loader_options)
Exemplo n.º 2
0
    def test_load_train_and_test_with_different_columns(temp_file_pair):
        """Expect a ``UserWarning`` when train and test files differ in columns.

        The training file is written with 19 and the testing file with 11 as
        the third helper argument; loading both must warn that the columns are
        restricted to the intersection of the two files.
        """
        train_file, test_file = temp_file_pair
        _make_and_write_data(train_file, 100, 19, False, True, 0)
        _make_and_write_data(test_file, 20, 11, False, True, 0)

        warning_pattern = ("Restricting columns to intersection between "
                           "training and testing data")

        with pytest.warns(UserWarning, match=warning_pattern):
            sdata.load_arff_files_standardized(
                train_file.name,
                ["event", "time"],
                1,
                path_testing=test_file.name,
                survival=True,
                standardize_numeric=False,
                to_numeric=False)
Exemplo n.º 3
0
    def test_load_with_categorical_index_2(arff_2):
        """Load only ``arff_2`` and check its index, label and feature columns.

        No testing path is passed, so both test outputs must be ``None``. The
        training outputs are compared exactly against hand-written expected
        values: a five-entry string index, a categorical ``label``, a numeric
        ``value`` and a categorical ``size`` column.
        """
        features, labels, features_test, labels_test = sdata.load_arff_files_standardized(
            arff_2,
            ["label"],
            pos_label="yes",
            survival=False,
            standardize_numeric=False,
            to_numeric=False)

        assert features_test is None
        assert labels_test is None

        assert features.shape == (5, 2)
        assert labels.shape == (5, 1)

        sample_names = ['ASampleOne', 'ASampleTwo', 'ASampleThree', 'ASampleFour', 'ASampleFive']
        expected_index = pandas.Index(sample_names, name='index', dtype=object)
        tm.assert_index_equal(features.index, expected_index, exact=True)

        label_values = pandas.Categorical(
            ["no", "no", "yes", "yes", "no"],
            categories=["yes", "no"],
            ordered=False)
        expected_label = pandas.Series(label_values, name="label", index=expected_index)
        tm.assert_series_equal(labels["label"], expected_label, check_exact=True)

        expected_value = pandas.Series(
            [1.51, 1.38, -20, 245.3, 3.14], name="value", index=expected_index)
        tm.assert_series_equal(features["value"], expected_value, check_exact=True)

        size_values = pandas.Categorical(
            ["small", "small", "large", "small", "large"],
            categories=["small", "medium", "large"],
            ordered=False)
        expected_size = pandas.Series(size_values, name="size", index=expected_index)
        tm.assert_series_equal(features["size"], expected_size, check_exact=True)
Exemplo n.º 4
0
    def test_load_with_index(temp_file):
        """Write one dataset, load it back, and compare features and outcome.

        No testing path is passed, so both test outputs must be ``None``. The
        loaded features are compared with the written data minus the
        ``event`` and ``time`` columns; the loaded outcome is compared with
        the written data via ``assert_y_equal``.
        """
        written = _make_and_write_data(temp_file, 100, 10, True, True, 0)

        loaded = sdata.load_arff_files_standardized(
            temp_file.name,
            ["event", "time"],
            1,
            survival=True,
            standardize_numeric=False,
            to_numeric=False)
        x_train, y_train, x_test, y_test = loaded

        assert x_test is None
        assert y_test is None

        # The expected feature matrix is everything except the outcome columns.
        expected_features = written.drop(["event", "time"], axis=1)

        assert_x_equal(expected_features, x_train)
        assert_y_equal(written, y_train)
Exemplo n.º 5
0
    def test_load_train_and_test_no_labels(temp_file_pair):
        """Load a train/test pair where the test outcome is expected to be ``None``.

        The training features must equal the written training data minus the
        ``event`` and ``time`` columns. The test features must equal the
        written test data as-is, and ``y_test`` must be ``None``.
        """
        train_file, test_file = temp_file_pair
        written_train = _make_and_write_data(train_file, 100, 10, True, True, 0)
        # NOTE(review): the fifth argument is False only for the test file;
        # presumably this omits the label columns -- confirm against the helper.
        written_test = _make_and_write_data(test_file, 20, 10, True, False, 0)

        loaded = sdata.load_arff_files_standardized(
            train_file.name,
            ["event", "time"],
            1,
            path_testing=test_file.name,
            survival=True,
            standardize_numeric=False,
            to_numeric=False)
        x_train, y_train, x_test, y_test = loaded

        expected_train_features = written_train.drop(["event", "time"], axis=1)
        assert_x_equal(expected_train_features, x_train)
        assert_y_equal(written_train, y_train)

        assert_x_equal(written_test, x_test)
        assert y_test is None
Exemplo n.º 6
0
    def test_load_train_and_test_with_categorical_index(arff_1, arff_2):
        """Load ``arff_1`` for training and ``arff_2`` for testing; check all columns.

        Both splits are compared exactly against hand-written expected values:
        the string index, the categorical ``label``, the numeric ``value`` and
        the categorical ``size`` column. The expected ``label`` category order
        differs between the splits (``["no", "yes"]`` for training and
        ``["yes", "no"]`` for testing).
        """
        features_train, labels_train, features_test, labels_test = (
            sdata.load_arff_files_standardized(
                arff_1,
                ["label"],
                pos_label="yes",
                path_testing=arff_2,
                survival=False,
                standardize_numeric=False,
                to_numeric=False))

        assert features_train.shape == (4, 2)
        assert features_test.shape == (5, 2)
        assert labels_train.shape == (4, 1)
        assert labels_test.shape == (5, 1)

        size_levels = ["small", "medium", "large"]

        # --- training split ---
        expected_train_index = pandas.Index(
            ['SampleOne', 'SampleTwo', 'SampleThree', 'SampleFour'],
            name='index', dtype=object)
        tm.assert_index_equal(features_train.index, expected_train_index, exact=True)

        train_label_values = pandas.Categorical(
            ["yes", "no", "yes", "yes"], categories=["no", "yes"], ordered=False)
        expected_train_label = pandas.Series(
            train_label_values, name="label", index=expected_train_index)
        tm.assert_series_equal(labels_train["label"], expected_train_label, check_exact=True)

        expected_train_value = pandas.Series(
            [15.1, 13.8, -0.2, 2.453], name="value", index=expected_train_index)
        tm.assert_series_equal(features_train["value"], expected_train_value, check_exact=True)

        train_size_values = pandas.Categorical(
            ["medium", "large", "small", "large"], categories=size_levels, ordered=False)
        expected_train_size = pandas.Series(
            train_size_values, name="size", index=expected_train_index)
        tm.assert_series_equal(features_train["size"], expected_train_size, check_exact=True)

        # --- testing split ---
        expected_test_index = pandas.Index(
            ['ASampleOne', 'ASampleTwo', 'ASampleThree', 'ASampleFour', 'ASampleFive'],
            name='index', dtype=object)
        tm.assert_index_equal(features_test.index, expected_test_index, exact=True)

        test_label_values = pandas.Categorical(
            ["no", "no", "yes", "yes", "no"], categories=["yes", "no"], ordered=False)
        expected_test_label = pandas.Series(
            test_label_values, name="label", index=expected_test_index)
        tm.assert_series_equal(labels_test["label"], expected_test_label, check_exact=True)

        expected_test_value = pandas.Series(
            [1.51, 1.38, -20, 245.3, 3.14], name="value", index=expected_test_index)
        tm.assert_series_equal(features_test["value"], expected_test_value, check_exact=True)

        test_size_values = pandas.Categorical(
            ["small", "small", "large", "small", "large"], categories=size_levels, ordered=False)
        expected_test_size = pandas.Series(
            test_size_values, name="size", index=expected_test_index)
        tm.assert_series_equal(features_test["size"], expected_test_size, check_exact=True)