示例#1
0
def test_forest_apply_result_shape(data):
    """The result of `apply` has shape (n_samples, n_estimators)."""
    features, labels = data
    forest = SimilarityForestClassifier()
    forest.fit(features, labels)

    applied = forest.apply(features)
    expected_shape = (features.shape[0], forest.n_estimators)

    assert applied.shape == expected_shape
示例#2
0
def test_probability_values_forest(data):
    """Every row returned by `predict_proba` must sum to one."""
    features, labels = data
    forest = SimilarityForestClassifier()
    forest.fit(features, labels)

    # Sum along axis 1 and compare against an all-ones array shaped like
    # the label array.
    row_totals = np.sum(forest.predict_proba(features), axis=1)
    expected_totals = np.ones(shape=labels.shape)

    assert_allclose(row_totals, expected_totals)
示例#3
0
def test_similarity_forest_classifier_output_array_shape(data):
    """`predict` returns a one-dimensional result with one entry per input row."""
    features, labels = data

    forest = SimilarityForestClassifier()
    forest.fit(features, labels)
    predictions = forest.predict(features)

    n_samples = features.shape[0]
    assert predictions.shape == (n_samples,)
示例#4
0
def test_setting_attributes_forest(data):
    """Constructor arguments stay readable as attributes after fit and predict.

    The classifier is built with explicit ``random_state`` and
    ``n_directions`` values; both must still compare equal to what was
    passed in once the model has been fitted and used for prediction.
    """
    X, y = data
    clf = SimilarityForestClassifier(random_state=42, n_directions=2)
    clf.fit(X, y)
    # The predictions themselves are not inspected; the call is kept so that
    # the attributes are checked after both fit and predict have run.
    clf.predict(X)

    assert clf.random_state == 42
    assert clf.n_directions == 2
示例#5
0
def test_train_set_acc(data):
    """Forest and tree must exceed a minimum score on the data they were fitted on."""
    features, labels = data

    forest = SimilarityForestClassifier()
    forest.fit(features, labels)
    forest_score = forest.score(features, labels)
    # NOTE(review): open question carried over from the original author -
    # should the score on the training data actually be 1.0 here?
    assert forest_score > 0.8

    tree = SimilarityTreeClassifier()
    tree.fit(features, labels)
    tree_score = tree.score(features, labels)
    assert tree_score > 0.9
示例#6
0
def test_similarity_forest_outliers_ranking_stability(data):
    """There should not be a situation when models predicts the same when there is no random_state set"""
    X, y = data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)

    clf = SimilarityForestClassifier()
    clf.fit(X_train, y_train)
    '''rcorrelations = clf.outliers_rank_stability(X_test, plot=False)
示例#7
0
def test_deterministic_predictions_forest():
    """Two forests built with the same `random_state` must predict identically."""
    X, y = make_blobs(n_samples=300, centers=[(0, 0), (1, 1)], random_state=42)

    # Build and fit the two identically seeded forests one after the other.
    forests = []
    for _ in range(2):
        forest = SimilarityForestClassifier(random_state=42)
        forest.fit(X, y)
        forests.append(forest)

    # Predict with the first forest, then the second, and compare.
    first_pred, second_pred = [forest.predict(X) for forest in forests]
    assert_array_equal(first_pred, second_pred)
示例#8
0
def test_similarity_forest_classifier_prediction(data):
    """Predictions on a held-out split have the expected shape and accuracy above 0.9."""
    features, labels = data
    split = train_test_split(features, labels, test_size=0.3, random_state=42)
    X_train, X_test, y_train, y_test = split

    forest = SimilarityForestClassifier()
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    n_test_samples = X_test.shape[0]
    assert predictions.shape == (n_test_samples,)

    accuracy = accuracy_score(y_test, predictions)
    assert accuracy > 0.9
示例#9
0
def test_similarity_forest_wrongly_the_same_pred(data):
    """Two forests created without a `random_state` must not give identical probabilities."""
    features, labels = data
    split = train_test_split(features, labels, test_size=0.3, random_state=42)
    X_train, X_test, y_train, y_test = split

    # Each forest is created, fitted and queried before the next one is
    # built, so both runs go through exactly the same sequence of calls.
    probabilities = []
    for _ in range(2):
        forest = SimilarityForestClassifier()
        forest.fit(X_train, y_train)
        probabilities.append(forest.predict_proba(X_test))

    first_proba, second_proba = probabilities
    assert not np.array_equal(first_proba, second_proba)
示例#10
0
def test_log_probabilities_forest(data):
    """`predict_log_proba` must agree with the logarithm of `predict_proba`."""
    features, labels = data
    forest = SimilarityForestClassifier()
    forest.fit(features, labels)

    probabilities = forest.predict_proba(features)
    log_probabilities = forest.predict_log_proba(features)

    # The expected value adds 1e-10 to the probabilities before taking the log.
    expected_log = np.log(probabilities + 1e-10)
    assert_allclose(log_probabilities, expected_log)
# For every dataset: grid-search a random forest and a similarity forest on
# the training split, then report how much the mean CV score varies across
# the tried parameter combinations.
for d_idx, d in enumerate(get_datasets()):
    X_train, X_test, y_train, y_test, dataset = d

    # Plain F1 when `binary` is set, weighted F1 otherwise.
    scorer = 'f1' if binary else 'f1_weighted'

    # Find parameters
    # RF
    # NOTE(review): this search uses a fixed cv=5 while the similarity forest
    # search below uses cv=n_folds - confirm the difference is intended.
    rf = GridSearchCV(
        RandomForestClassifier(),
        param_grid=rf_params,
        cv=5,
        scoring=scorer,
        refit=scorer,
        n_jobs=4,
    )
    rf.fit(X_train, y_train)

    # SF: optionally wrapped in a one-vs-rest classifier, which comes with its
    # own parameter grid.
    if multiclass_strategy:
        ovr = OneVsRestClassifier(SimilarityForestClassifier())
        # NOTE(review): this branch is scored with a hard-coded 'f1' instead
        # of `scorer` - confirm that is intended when `binary` is false.
        sf_estimator, sf_grid, sf_scorer = ovr, ovr_sf_params, 'f1'
    else:
        sf_estimator, sf_grid, sf_scorer = SimilarityForestClassifier(), sf_params, scorer

    sf = GridSearchCV(
        sf_estimator,
        param_grid=sf_grid,
        cv=n_folds,
        verbose=5,
        n_jobs=4,
        scoring=sf_scorer,
        refit=sf_scorer,
    )
    sf.fit(X_train, y_train)

    # Log std of CV scores for different parameters
    rf_std_score = np.std(rf.cv_results_['mean_test_score'])
    if use_neptune:
        neptune.log_metric(f'{dataset} RF std cv score', rf_std_score)

    sf_std_score = np.std(sf.cv_results_['mean_test_score'])
    if use_neptune:
        neptune.log_metric(f'{dataset} SF std cv score', sf_std_score)