def test_performances():
    X, y = make_blobs(n_samples=1000, random_state=0, centers=2)

    # make labels imbalanced by removing all but 100 instances from class 1
    indexes = np.ones(X.shape[0]).astype(bool)
    ind = np.array([False] * 100 + list((y == 1)[100:]))
    indexes[ind] = 0
    X = X[indexes]
    y = y[indexes]
    n_samples, n_features = X.shape

    clf = FraudToRules()
    # fit
    clf.fit(X, y)
    # fit also works with plain lists
    clf.fit(X.tolist(), y.tolist())

    y_pred = clf.predict(X)
    assert_equal(y_pred.shape, (n_samples, ))

    # training set performance
    assert_greater(accuracy_score(y, y_pred), 0.83)

    # decision_function agrees with predict
    decision = -clf.decision_function(X)
    assert_equal(decision.shape, (n_samples, ))
    dec_pred = (decision.ravel() < 0).astype(int)
    assert_array_equal(dec_pred, y_pred)


def test_max_samples_attribute():
    X = iris.data
    y = iris.target
    y = (y != 0)

    clf = FraudToRules(max_samples=1.).fit(X, y)
    assert_equal(clf.max_samples_, X.shape[0])

    clf = FraudToRules(max_samples=500)
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         clf.fit, X, y)
    assert_equal(clf.max_samples_, X.shape[0])

    clf = FraudToRules(max_samples=0.4).fit(X, y)
    assert_equal(clf.max_samples_, 0.4 * X.shape[0])


def test_fraudetorules():
    """Check various parameter settings."""
    X_train = np.array([[0, 1], [1, 2]])
    y_train = np.array([0, 1])
    X_test = np.array([[2, 1], [1, 1]])

    grid = ParameterGrid({
        "feature_names": [None, ['a', 'b']],
        "precision_min": [0.1],
        "recall_min": [0.1],
        "n_estimators": [1],
        "max_samples": [0.5, 3],
        "max_samples_features": [0.5, 2],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False],
        "max_depth": [2],
        "max_features": ["auto", 1, 0.1],
        "min_samples_split": [2, 0.1],
        "n_jobs": [-1, 1]})

    with ignore_warnings():
        for params in grid:
            FraudToRules(random_state=rng,
                         **params).fit(X_train, y_train).predict(X_test)


def test_fraudetorules_error():
    """Test that it gives proper exceptions on deficient input."""
    X = iris.data
    y = iris.target
    y = (y != 0)

    # Test max_samples
    assert_raises(ValueError, FraudToRules(max_samples=-1).fit, X, y)
    assert_raises(ValueError, FraudToRules(max_samples=0.0).fit, X, y)
    assert_raises(ValueError, FraudToRules(max_samples=2.0).fit, X, y)

    # Explicitly setting max_samples > n_samples should result in a warning.
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         FraudToRules(max_samples=1000).fit, X, y)
    assert_no_warnings(FraudToRules(max_samples=np.int64(2)).fit, X, y)
    assert_raises(ValueError, FraudToRules(max_samples='foobar').fit, X, y)
    assert_raises(ValueError, FraudToRules(max_samples=1.5).fit, X, y)

    # predicting with fewer features than at fit time must also raise
    assert_raises(ValueError, FraudToRules().fit(X, y).predict, X[:, 1:])


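# --- Illustration (not part of the test suite) -------------------------------
# A minimal sketch of the max_samples handling that test_max_samples_attribute
# and test_fraudetorules_error exercise above.  The helper name and the exact
# messages are assumptions, not the library's actual implementation.
import numbers
import warnings


def resolve_max_samples(max_samples, n_samples):
    """Hypothetical helper mirroring the behaviour asserted above."""
    if isinstance(max_samples, str):
        raise ValueError("max_samples (%r) is not supported" % (max_samples,))
    if isinstance(max_samples, (numbers.Integral, np.integer)):
        if max_samples < 1:
            raise ValueError("max_samples must be at least 1, got %r"
                             % (max_samples,))
        if max_samples > n_samples:
            warnings.warn("max_samples (%s) is greater than n_samples (%s): "
                          "max_samples will be set to n_samples for estimation"
                          % (max_samples, n_samples), UserWarning)
            return n_samples
        return max_samples
    if not 0. < max_samples <= 1.:
        raise ValueError("max_samples must be in (0, 1], got %r"
                         % (max_samples,))
    # a float in (0, 1] is read as a fraction of the training set size
    return int(max_samples * n_samples)

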
def test_fraudetorules_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
         [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
              [10, 5], [5, -7]]

    # Test FraudToRules
    clf = FraudToRules(random_state=rng, max_samples=1.)
    clf.fit(X, y)
    decision_func = -clf.decision_function(X_test)
    pred = clf.predict(X_test)

    # assert that the outliers are detected:
    assert_greater(np.max(decision_func[:-2]), np.min(decision_func[-2:]))
    assert_array_equal(pred, 6 * [0] + 2 * [1])


rng = np.random.RandomState(42)

n_inliers = 1000
n_outliers = 50

# Generate train data: two inlier clusters and a cloud of outliers
inlier_core = 0.5 * rng.randn(int(n_inliers / 2), 2)
X_inliers = np.r_[inlier_core + 2, inlier_core - 2]
X_outliers = 0.5 * rng.randn(n_outliers, 2)
X_train = np.r_[X_inliers, X_outliers]
y_train = [0] * n_inliers + [1] * n_outliers

# fit the model
clf = FraudToRules(random_state=rng, n_estimators=100)
clf.fit(X_train, y_train)

# evaluate the decision function on a grid and plot it with the samples
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Fraud To Rules")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

a = plt.scatter(X_inliers[:, 0], X_inliers[:, 1], c='white',
                s=20, edgecolor='k')
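
# Plausible continuation of the figure (handle name, colours and legend labels
# below are assumptions, not taken from the original script): scatter the
# outliers and finish the plot.
b = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red',
                s=20, edgecolor='k')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([a, b],
           ["regular observations", "abnormal observations"],
           loc="upper left")
plt.show()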
feature_names[4] = 'accepteur_ZBBIOHSD'
data.columns = feature_names
print(feature_names)

data = data.values
n_samples = data.shape[0]
n_samples_train = int(n_samples / 2)
y_train = target[:n_samples_train]
y_test = target[n_samples_train:]
X_train = data[:n_samples_train]
X_test = data[n_samples_train:]

# fit the model
clf = FraudToRules(max_depth=2, max_features=0.5,
                   max_samples_features=0.5, random_state=rng,
                   n_estimators=10, feature_names=feature_names)
clf.fit(X_train, y_train)

RF = RandomForestClassifier()
RF.fit(X_train, y_train)

scoring = clf.decision_function(X_test)
scoring_RF = RF.predict_proba(X_test)[:, 1]

# score the test set with the single best rule only
scoring_one_rule = np.zeros(X_test.shape[0])
rule = clf.rules_[0][0]
detected_index = list(
    pd.DataFrame(X_test, columns=feature_names).query(rule).index)
scoring_one_rule[detected_index] = 1
print('best rule precision:', y_test[detected_index].mean())
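
# Plausible follow-up (not in the original excerpt; metric and plotting
# choices are assumptions): compare the three scorings with ROC curves.
import matplotlib.pyplot as plt

from sklearn.metrics import auc, roc_curve

for name, score in [('FraudToRules', scoring),
                    ('RandomForest', scoring_RF),
                    ('best rule only', scoring_one_rule)]:
    fpr, tpr, _ = roc_curve(y_test, score)
    plt.plot(fpr, tpr, label='%s (AUC = %0.3f)' % (name, auc(fpr, tpr)))

plt.plot([0, 1], [0, 1], 'k--')  # chance level
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()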
if dat in ('http', 'smtp'):
    y = (y != b'normal.').astype(int)

print_outlier_ratio(y)

n_samples, n_features = X.shape
n_samples_train = n_samples // 2

X = X.astype(float)
X_train = X[:n_samples_train, :]
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

print('--- Fitting the FraudToRules estimator...')
model = FraudToRules(n_estimators=5, max_depth=5, n_jobs=-1)
tstart = time()
model.fit(X_train, y_train)
fit_time = time() - tstart

tstart = time()
scoring = -model.decision_function(X_test)  # the lower, the more abnormal

print("--- Preparing the plot elements...")
if with_decision_function_histograms:
    fig, ax = plt.subplots(3, sharex=True, sharey=True)
    bins = np.linspace(-0.5, 0.5, 200)
    ax[0].hist(scoring, bins, color='black')
    ax[0].set_title('Decision function for %s dataset' % dat)
    ax[1].hist(scoring[y_test == 0], bins, color='b', label='normal data')
    ax[1].legend(loc="lower right")
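    # Plausible third panel (colour and label are assumptions, not taken from
    # the original script): histogram of the scores of the abnormal samples.
    ax[2].hist(scoring[y_test == 1], bins, color='r', label='abnormal data')
    ax[2].legend(loc="lower right")
    plt.show()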