def test_performances():
    """Check FraudToRules fit/predict on an imbalanced two-blob dataset.

    Verifies that ``fit`` accepts both arrays and plain lists, that
    ``predict`` returns one value per sample, that training accuracy
    exceeds 0.83, and that thresholding the negated decision function at
    zero reproduces the output of ``predict``.
    """
    X, y = make_blobs(n_samples=1000, random_state=0, centers=2)

    # Make the labels imbalanced: drop every class-1 sample located after
    # the first 100 rows (class-1 rows among the first 100 are kept).
    keep = np.ones(X.shape[0], dtype=bool)
    drop = np.array([False] * 100 + list((y == 1)[100:]))
    keep[drop] = False
    X = X[keep]
    y = y[keep]
    n_samples = X.shape[0]

    clf = FraudToRules()
    # fit with arrays first, then refit with plain Python lists
    clf.fit(X, y)
    clf.fit(X.tolist(), y.tolist())
    y_pred = clf.predict(X)
    assert_equal(y_pred.shape, (n_samples, ))

    # training set performance
    assert_greater(accuracy_score(y, y_pred), 0.83)

    # decision_function agrees with predict
    decision = -clf.decision_function(X)
    assert_equal(decision.shape, (n_samples, ))
    # Builtin ``int`` is used because the ``np.int`` alias was removed in
    # NumPy 1.24; the resulting dtype is the same.
    dec_pred = (decision.ravel() < 0).astype(int)
    assert_array_equal(dec_pred, y_pred)
def test_fraudetorules_works():
    """Smoke-test FraudToRules on an eight-point toy problem.

    The first six points are labelled 0 and the last two are labelled 1.
    After fitting, the test compares the negated decision function of the
    first six test points against that of the last two, and checks that
    ``predict`` returns ``[0] * 6 + [1] * 2`` on the test points.
    """
    # toy sample (the last two samples are outliers)
    train_inliers = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    train_outliers = [[6, 3], [4, -7]]
    X = train_inliers + train_outliers
    y = [0] * 6 + [1] * 2

    # Same six inliers, with two different points far from them at the end.
    X_test = train_inliers + [[10, 5], [5, -7]]
    expected_labels = 6 * [0] + 2 * [1]

    # Fit FraudToRules using the module-level random state.
    model = FraudToRules(random_state=rng, max_samples=1.)
    model.fit(X, y)
    negated_scores = -model.decision_function(X_test)
    predicted = model.predict(X_test)

    # assert detect outliers: largest negated score among the first six
    # points must exceed the smallest negated score of the last two.
    head_max = np.max(negated_scores[:-2])
    tail_min = np.min(negated_scores[-2:])
    assert_greater(head_max, tail_min)
    assert_array_equal(predicted, expected_labels)
# Build the training set: two inlier clusters obtained by shifting the same
# Gaussian sample by +2 and -2, plus one unshifted Gaussian outlier sample.
I = 0.5 * rng.randn(int(n_inliers / 2), 2)
X_inliers = np.vstack((I + 2, I - 2))
O = 0.5 * rng.randn(n_outliers, 2)
X_outliers = O
X_train = np.vstack((X_inliers, X_outliers))
y_train = [0] * n_inliers + [1] * n_outliers

# Fit the estimator on the labelled training points.
clf = FraudToRules(random_state=rng, n_estimators=100)
clf.fit(X_train, y_train)

# Evaluate the decision function on a 50 x 50 grid spanning [-5, 5] on both
# axes, then reshape the flat scores back to the grid shape for contouring.
grid_axis = np.linspace(-5, 5, 50)
xx, yy = np.meshgrid(grid_axis, grid_axis)
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = clf.decision_function(grid_points).reshape(xx.shape)

# Draw the decision function as filled contours, then overlay the inliers
# (white) and the outliers (red).
plt.title("Fraud To Rules")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

a = plt.scatter(
    X_inliers[:, 0], X_inliers[:, 1],
    c='white', s=20, edgecolor='k')
b = plt.scatter(
    X_outliers[:, 0], X_outliers[:, 1],
    c='red', s=20, edgecolor='k')
# Hold-out split: rows from n_samples_train onwards form the test set.
X_train, X_test = data[:n_samples_train], data[n_samples_train:]
y_test = target[n_samples_train:]

# fit the model
clf = FraudToRules(
    max_depth=2,
    max_features=0.5,
    max_samples_features=0.5,
    random_state=rng,
    n_estimators=10,
    feature_names=feature_names,
)
clf.fit(X_train, y_train)

# Random forest with default settings, fitted on the same training data.
RF = RandomForestClassifier()
RF.fit(X_train, y_train)

# Continuous scores on the test set: the FraudToRules decision function and
# the second column of the forest's predicted probabilities.
scoring = clf.decision_function(X_test)
scoring_RF = RF.predict_proba(X_test)[:, 1]

# Binary score built from the first entry of clf.rules_ alone: 1 for the
# test rows selected by evaluating the rule as a pandas query, 0 elsewhere.
rule = clf.rules_[0][0]
scoring_one_rule = np.zeros(X_test.shape[0])
detected_index = list(
    pd.DataFrame(X_test, columns=feature_names).query(rule).index
)
scoring_one_rule[detected_index] = 1
print('best rule precision:', y_test[detected_index].mean())

# XXX add semi-weighted PR in some utils dir and plot it too?
# Four-element lists, presumably one entry per panel of the 2 x 2 grid
# (confirm against the plotting loop that consumes them).
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10),
                         sharex=True, sharey=True)
curves = [roc_curve, precision_recall_curve,
          roc_curve, precision_recall_curve]
scores = [scoring, scoring, scoring_one_rule, scoring_one_rule]
xlabels = ['FPR', 'Recall', 'FPR', 'Recall']
# Cast the features to float, then split sequentially: the first
# n_samples_train rows are used for training, the remainder for testing.
X = X.astype(float)
X_train = X[:n_samples_train, :]
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

print('--- Fitting the FraudToRules estimator...')
model = FraudToRules(n_estimators=5, max_depth=5, n_jobs=-1)
tstart = time()
model.fit(X_train, y_train)
fit_time = time() - tstart

# Time only the scoring call, so that predict_time does not include the
# printing and histogram plotting that follow.
tstart = time()
scoring = -model.decision_function(X_test)  # the lower, the more abnormal
predict_time = time() - tstart

print("--- Preparing the plot elements...")
if with_decision_function_histograms:
    # Three stacked histograms sharing both axes: all test scores, scores
    # of samples with y_test == 0, and scores of samples with y_test == 1.
    fig, ax = plt.subplots(3, sharex=True, sharey=True)
    bins = np.linspace(-0.5, 0.5, 200)
    ax[0].hist(scoring, bins, color='black')
    ax[0].set_title('Decision function for %s dataset' % dat)
    ax[1].hist(scoring[y_test == 0], bins, color='b',
               label='normal data')
    ax[1].legend(loc="lower right")
    ax[2].hist(scoring[y_test == 1], bins, color='r',
               label='outliers')
    ax[2].legend(loc="lower right")

# Show ROC Curves
# NOTE(review): roc_curve treats larger scores as more indicative of the
# positive class; confirm that the sign of `scoring` matches y_test == 1
# being the outlier class.
fpr, tpr, thresholds = roc_curve(y_test, scoring)