import numpy as np
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Import path assumed from the simforest package layout.
from simforest import SimilarityForestClassifier, SimilarityTreeClassifier


def test_forest_apply_result_shape(data):
    X, y = data
    clf = SimilarityForestClassifier()
    clf.fit(X, y)
    apply_result = clf.apply(X)
    # One leaf index per sample for each tree in the ensemble.
    assert apply_result.shape == (X.shape[0], clf.n_estimators)
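# Hedged usage sketch (not part of the original suite): the per-tree leaf
# indices returned by apply() can be aggregated into a pairwise similarity,
# counting the fraction of trees in which two samples share a leaf. The
# helper name is hypothetical.
def forest_leaf_cooccurrence(leaves):
    # leaves has shape (n_samples, n_estimators); the result is (n, n),
    # with 1.0 on the diagonal since a sample always shares its own leaf.
    return (leaves[:, None, :] == leaves[None, :, :]).mean(axis=2)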
def test_probability_values_forest(data):
    X, y = data
    clf = SimilarityForestClassifier()
    clf.fit(X, y)
    preds = clf.predict_proba(X)
    # Class probabilities for each sample should sum to one.
    assert_allclose(np.sum(preds, axis=1), np.ones(shape=y.shape))
def test_similarity_forest_classifier_output_array_shape(data):
    X, y = data
    clf = SimilarityForestClassifier()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
def test_setting_attributes_forest(data):
    X, y = data
    clf = SimilarityForestClassifier(random_state=42, n_directions=2)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert clf.random_state == 42
    assert clf.n_directions == 2
def test_train_set_acc(data):
    X, y = data
    forest = SimilarityForestClassifier()
    forest.fit(X, y)
    # Training accuracy need not be exactly 1.0, but it should be high.
    assert forest.score(X, y) > 0.8

    tree = SimilarityTreeClassifier()
    tree.fit(X, y)
    assert tree.score(X, y) > 0.9
def test_similarity_forest_outliers_ranking_stability(data):
    """Check the stability of the outlier ranking produced on the test set."""
    X, y = data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    clf = SimilarityForestClassifier()
    clf.fit(X_train, y_train)
    rcorrelations = clf.outliers_rank_stability(X_test, plot=False)
    # Smoke check only: the method should run and return a result.
    assert rcorrelations is not None
def test_deterministic_predictions_forest():
    X, y = make_blobs(n_samples=300, centers=[(0, 0), (1, 1)], random_state=42)
    clf1 = SimilarityForestClassifier(random_state=42)
    clf1.fit(X, y)
    clf2 = SimilarityForestClassifier(random_state=42)
    clf2.fit(X, y)
    y_pred1 = clf1.predict(X)
    y_pred2 = clf2.predict(X)
    assert_array_equal(y_pred1, y_pred2)
def test_similarity_forest_classifier_prediction(data):
    X, y = data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    clf = SimilarityForestClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    assert y_pred.shape == (X_test.shape[0],)
    assert accuracy_score(y_test, y_pred) > 0.9
def test_similarity_forest_wrongly_the_same_pred(data):
    """Two models fitted without a fixed random_state should not produce identical predictions."""
    X, y = data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    clf1 = SimilarityForestClassifier()
    clf1.fit(X_train, y_train)
    y_pred1 = clf1.predict_proba(X_test)

    clf2 = SimilarityForestClassifier()
    clf2.fit(X_train, y_train)
    y_pred2 = clf2.predict_proba(X_test)

    assert not np.array_equal(y_pred1, y_pred2)
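# Hedged companion check (an assumption, not a test from the original file):
# with a fixed random_state, predict_proba should be exactly reproducible.
def test_deterministic_probabilities_forest(data):
    X, y = data
    clf1 = SimilarityForestClassifier(random_state=42)
    clf1.fit(X, y)
    clf2 = SimilarityForestClassifier(random_state=42)
    clf2.fit(X, y)
    assert_array_equal(clf1.predict_proba(X), clf2.predict_proba(X))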
def test_log_probabilities_forest(data):
    X, y = data
    clf = SimilarityForestClassifier()
    clf.fit(X, y)
    preds = clf.predict_proba(X)
    log_preds = clf.predict_log_proba(X)
    # A small epsilon guards against log(0) for zero-probability classes.
    assert_allclose(log_preds, np.log(preds + 1e-10))
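# Hedged sketch (not in the original file): assuming the estimator follows the
# scikit-learn convention of exposing classes_, hard predictions should agree
# with the argmax over predict_proba.
def test_predict_matches_argmax_proba(data):
    X, y = data
    clf = SimilarityForestClassifier()
    clf.fit(X, y)
    proba = clf.predict_proba(X)
    assert_array_equal(clf.predict(X), clf.classes_[np.argmax(proba, axis=1)])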
import numpy as np
import neptune
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from simforest import SimilarityForestClassifier

# get_datasets(), the parameter grids, and the flags referenced below are
# assumed to be defined earlier in the benchmark script.
for d_idx, d in enumerate(get_datasets()):
    X_train, X_test, y_train, y_test, dataset = d

    if binary:
        scorer = 'f1'
    else:
        scorer = 'f1_weighted'

    # Find parameters
    # RF
    rf = GridSearchCV(RandomForestClassifier(), param_grid=rf_params, cv=5,
                      scoring=scorer, refit=scorer, n_jobs=4)
    rf.fit(X_train, y_train)

    # SF
    if multiclass_strategy:
        ovr = OneVsRestClassifier(SimilarityForestClassifier())
        sf = GridSearchCV(ovr, param_grid=ovr_sf_params, cv=n_folds, verbose=5,
                          n_jobs=4, scoring='f1', refit='f1')
    else:
        sf = GridSearchCV(SimilarityForestClassifier(), param_grid=sf_params,
                          cv=n_folds, verbose=5, n_jobs=4, scoring=scorer, refit=scorer)
    sf.fit(X_train, y_train)

    # Log std of CV scores for different parameters
    rf_std_score = np.std(rf.cv_results_['mean_test_score'])
    if use_neptune:
        neptune.log_metric(f'{dataset} RF std cv score', rf_std_score)

    sf_std_score = np.std(sf.cv_results_['mean_test_score'])
    if use_neptune:
        neptune.log_metric(f'{dataset} SF std cv score', sf_std_score)
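    # Hedged follow-up sketch (not part of the original loop): the refit best
    # estimators can also be scored on the held-out split and logged alongside
    # the CV statistics; the metric names here are assumptions.
    rf_test_score = rf.score(X_test, y_test)
    sf_test_score = sf.score(X_test, y_test)
    if use_neptune:
        neptune.log_metric(f'{dataset} RF test score', rf_test_score)
        neptune.log_metric(f'{dataset} SF test score', sf_test_score)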