def test_can_predict():
    clf = SkopeRules(regression=True,
                     max_depth_duplication=2,
                     n_estimators=30,
                     precision_min=0.20,
                     recall_min=0.20,
                     feature_names=feature_names)
    clf.fit(X, y)
    clf.predict(X)
def test_performance_not_deteriorate():
    """Compare the model performance to baselines.

    It's a bit unclear what to compare against, since performance varies
    widely across models (MSE, vanilla settings):
        decision tree regressor:  6946
        random forest regressor:  2970
        linear model:             2820
    """
    clf = SkopeRules(regression=True,
                     max_depth_duplication=None,
                     max_depth=X.shape[1] // 3,
                     n_estimators=850,
                     precision_min=0.,
                     recall_min=0.,
                     feature_names=feature_names)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # compare against the linear regression baseline
    assert mse < 2820
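# Note: the two regression tests above rely on module-level X, y and feature_names
# defined elsewhere in the test module, and the MSE baselines quoted in the
# docstring only hold for that particular dataset. A hypothetical stand-in
# fixture, for illustration only, could look like:
#
#     from sklearn.datasets import make_regression
#     X, y = make_regression(n_samples=500, n_features=10, noise=10.0, random_state=0)
#     feature_names = ['feature_%d' % i for i in range(X.shape[1])]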
def test_performances():
    X, y = make_blobs(n_samples=1000, random_state=0, centers=2)

    # make the labels imbalanced by removing all but 100 instances from class 1
    indexes = np.ones(X.shape[0]).astype(bool)
    ind = np.array([False] * 100 + list((y == 1)[100:]))
    indexes[ind] = 0
    X = X[indexes]
    y = y[indexes]
    n_samples, n_features = X.shape

    clf = SkopeRules()
    # fit
    clf.fit(X, y)
    # fit also works with plain lists
    clf.fit(X.tolist(), y.tolist())
    y_pred = clf.predict(X)
    assert_equal(y_pred.shape, (n_samples,))
    # training set performance
    assert_greater(accuracy_score(y, y_pred), 0.83)

    # decision_function agrees with predict
    decision = -clf.decision_function(X)
    assert_equal(decision.shape, (n_samples,))
    dec_pred = (decision.ravel() < 0).astype(int)  # np.int is deprecated; use the builtin
    assert_array_equal(dec_pred, y_pred)
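# Illustrative sketch (not part of the original tests): once a SkopeRules model
# is fitted, its learned rules can be inspected. In the released skope-rules
# package, clf.rules_ is, to the best of my knowledge, a list of
# (rule_string, (precision, recall, nb_occurrences)) tuples.
def example_inspect_rules(clf, n_rules=3):
    for rule, (precision, recall, _) in clf.rules_[:n_rules]:
        print('%s  (precision=%.2f, recall=%.2f)' % (rule, precision, recall))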
def test_deduplication_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1],
              [1, 2], [2, 1], [10, 5], [5, -7]]

    # fit SkopeRules with rule deduplication enabled
    clf = SkopeRules(random_state=rng, max_samples=1.,
                     max_depth_duplication=3)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    score_top_rules = clf.score_top_rules(X_test)
    pred = clf.predict(X_test)
    pred_score_top_rules = clf.predict_top_rules(X_test, 1)
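# The toy-sample tests in this file pass random_state=rng; `rng` is assumed to
# be a module-level seeded random state defined elsewhere, e.g. (assumption):
#
#     rng = np.random.RandomState(42)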
def KfoldAcc(X, y, multiclass=False, k=10):
    """Compare k-fold cross-validated accuracy of kNN, decision tree,
    naive Bayes and SkopeRules classifiers."""
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  # test set

        neigh = KNeighborsClassifier()
        neigh.fit(X_train, y_train)
        neigh_y_pred = neigh.predict(X_test)
        accuracy['neigh'].append(
            accuracy_score(y_test, neigh_y_pred, normalize=True,
                           sample_weight=None))
        print('---------')

        tree = DecisionTreeClassifier()
        tree.fit(X_train, y_train)
        tree_y_pred = tree.predict(X_test)
        accuracy['tree'].append(
            accuracy_score(y_test, tree_y_pred, normalize=True,
                           sample_weight=None))
        print('---------')

        naive = GaussianNB()
        naive.fit(X_train, y_train)
        naive_y_pred = naive.predict(X_test)
        accuracy['naive'].append(
            accuracy_score(y_test, naive_y_pred, normalize=True,
                           sample_weight=None))
        print('---------')

        rule = SkopeRules()
        if multiclass is True:
            rule = OneVsRestClassifier(rule)
        rule.fit(X_train, y_train)
        rules_y_pred = rule.predict(X_test)
        accuracy['rule'].append(
            accuracy_score(y_test, rules_y_pred, normalize=True,
                           sample_weight=None))
    return accuracy
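# Hypothetical usage of KfoldAcc on a synthetic binary problem (KfoldAcc indexes
# X and y with KFold index arrays, so it expects numpy arrays, not DataFrames):
def example_kfold_acc():
    import numpy as np
    from sklearn.datasets import make_classification
    X_demo, y_demo = make_classification(n_samples=500, n_features=10,
                                         random_state=0)
    acc = KfoldAcc(X_demo, y_demo, multiclass=False, k=5)
    for name, scores in acc.items():
        print('%s: mean accuracy %.3f' % (name, np.mean(scores)))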
def test_skope_rules_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1],
              [1, 2], [2, 1], [10, 5], [5, -7]]

    # fit SkopeRules and score the test set
    clf = SkopeRules(random_state=rng, max_samples=1.)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    separate_rules_score = clf.separate_rules_score(X_test)
    pred = clf.predict(X_test)

    # assert that the outliers are detected (they get the highest scores)
    assert_greater(np.min(decision_func[-2:]),
                   np.max(decision_func[:-2]))
    assert_greater(np.min(rules_vote[-2:]),
                   np.max(rules_vote[:-2]))
    assert_greater(np.min(separate_rules_score[-2:]),
                   np.max(separate_rules_score[:-2]))
    assert_array_equal(pred, 6 * [0] + 2 * [1])
rule = SkopeRules(feature_names=data.columns.to_list()[0:18])

neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
accuracy_no['neigh'].append(accuracy_score(y_test, y_pred, normalize=True,
                                           sample_weight=None))
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
accuracy_no['tree'].append(accuracy_score(y_test, y_pred, normalize=True,
                                          sample_weight=None))
naive.fit(X_train, y_train)
y_pred = naive.predict(X_test)
accuracy_no['naive'].append(accuracy_score(y_test, y_pred, normalize=True,
                                           sample_weight=None))
rule.fit(X_train, y_train)
y_pred = rule.predict(X_test)
accuracy_no['rule'].append(accuracy_score(y_test, y_pred, normalize=True,
                                          sample_weight=None))

# feature selection with Boruta
for train_index, test_index in kf.split(X1):
    X_train, X_test = X1[train_index], X1[test_index]
    y_train, y_test = y_bal[train_index], y_bal[test_index]  # test set

    neigh = KNeighborsClassifier()
    tree = DecisionTreeClassifier()
    naive = GaussianNB()
    rule = SkopeRules()

    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
def OverSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]
    y = data_np[:, 18]
    sm = SMOTE()
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  # test set
        print('num of pos:', np.sum(y_train),
              ', num of neg:', y_train.size - np.sum(y_train))
        X_over, y_over = sm.fit_resample(X_train, y_train)  # oversampled train set
        print('oversample:', X_over.shape, y_over.shape)
        print('---------------------------------------')

        neigh = KNeighborsClassifier()
        neigh.fit(X_over, y_over)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]  # positive-class probability
        recall['neigh'].append(
            recall_score(y_test, neigh_y_pred, labels=None, pos_label=1))
        accuracy['neigh'].append(
            accuracy_score(y_test, neigh_y_pred, normalize=True,
                           sample_weight=None))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))
        print('---------')

        tree = DecisionTreeClassifier()
        tree.fit(X_over, y_over)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]  # positive-class probability
        recall['tree'].append(
            recall_score(y_test, tree_y_pred, labels=None, pos_label=1))
        accuracy['tree'].append(
            accuracy_score(y_test, tree_y_pred, normalize=True,
                           sample_weight=None))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))
        print('---------')

        naive = GaussianNB()
        naive.fit(X_over, y_over)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]  # positive-class probability
        recall['naive'].append(
            recall_score(y_test, naive_y_pred, labels=None, pos_label=1))
        accuracy['naive'].append(
            accuracy_score(y_test, naive_y_pred, normalize=True,
                           sample_weight=None))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))
        print('---------')

        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_over, y_over)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)  # rule votes used as a ranking score
        recall['rule'].append(
            recall_score(y_test, rules_y_pred, labels=None, pos_label=1))
        accuracy['rule'].append(
            accuracy_score(y_test, rules_y_pred, normalize=True,
                           sample_weight=None))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))
    return accuracy, recall, auc
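# Small self-contained illustration of the resampling step used above, assuming
# imbalanced-learn's SMOTE: the minority class is synthetically oversampled until
# both classes are the same size (the dataset and counts here are made up).
def example_smote_counts():
    from collections import Counter
    from imblearn.over_sampling import SMOTE
    from sklearn.datasets import make_classification
    X_imb, y_imb = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                       random_state=0)
    X_res, y_res = SMOTE(random_state=0).fit_resample(X_imb, y_imb)
    print('before:', Counter(y_imb), ' after:', Counter(y_res))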
def BalanceSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]
    y = data_np[:, 18]
    rus_balance = RandomUnderSampler(
        sampling_strategy=0.20)  # truncate negatives to 5x the number of positives
    sm_balance = SMOTE()  # then oversample the positives
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  # test set
        print('num of pos:', np.sum(y_train),
              ', num of neg:', y_train.size - np.sum(y_train))
        X_bal, y_bal = rus_balance.fit_resample(X_train, y_train)  # balanced sample
        print('1.under:')
        print('num of pos:', np.sum(y_bal),
              ', num of neg:', y_bal.size - np.sum(y_bal))
        X_bal, y_bal = sm_balance.fit_resample(X_bal, y_bal)
        print('2.over:')
        print('num of pos:', np.sum(y_bal),
              ', num of neg:', y_bal.size - np.sum(y_bal))
        print('---------------------------------------')

        neigh = KNeighborsClassifier()
        neigh.fit(X_bal, y_bal)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]  # positive-class probability
        recall['neigh'].append(
            recall_score(y_test, neigh_y_pred, labels=None, pos_label=1))
        accuracy['neigh'].append(
            accuracy_score(y_test, neigh_y_pred, normalize=True,
                           sample_weight=None))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))
        print('---------')

        tree = DecisionTreeClassifier()
        tree.fit(X_bal, y_bal)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]  # positive-class probability
        recall['tree'].append(
            recall_score(y_test, tree_y_pred, labels=None, pos_label=1))
        accuracy['tree'].append(
            accuracy_score(y_test, tree_y_pred, normalize=True,
                           sample_weight=None))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))
        print('---------')

        naive = GaussianNB()
        naive.fit(X_bal, y_bal)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]  # positive-class probability
        recall['naive'].append(
            recall_score(y_test, naive_y_pred, labels=None, pos_label=1))
        accuracy['naive'].append(
            accuracy_score(y_test, naive_y_pred, normalize=True,
                           sample_weight=None))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))
        print('---------')

        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_bal, y_bal)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)  # rule votes used as a ranking score
        recall['rule'].append(
            recall_score(y_test, rules_y_pred, labels=None, pos_label=1))
        accuracy['rule'].append(
            accuracy_score(y_test, rules_y_pred, normalize=True,
                           sample_weight=None))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))
    return accuracy, recall, auc
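# Hypothetical driver comparing the two sampling strategies above; it assumes
# `data` is a pandas DataFrame whose first 18 columns are features and whose
# 19th column is a binary label, which is what both functions expect.
def example_compare_sampling(data, k=5):
    import numpy as np
    acc_over, _, auc_over = OverSampleKfold(data, k=k)
    acc_bal, _, auc_bal = BalanceSampleKfold(data, k=k)
    for name in ['neigh', 'tree', 'naive', 'rule']:
        print('%s: mean AUC oversample=%.3f, balanced=%.3f'
              % (name, np.mean(auc_over[name]), np.mean(auc_bal[name])))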
# Train the network, then make predictions on the test set and print the results
neural_network = compute_semi_supervised_learning(neural_network, X_train, Y_train)
neural_network_pred = np.array(neural_network.predict_classes(np.array(X_test)))
print_statistics("Semi-Supervised Neural Network", neural_network_pred, Y_test)

# ************* Rule Model: ************************
# Here we compare 3 Skope Rules models on the validation set

# First skope rules model
rule_clf1 = SkopeRules(n_estimators=50, precision_min=0.2, recall_min=0.2,
                       feature_names=feature_names)
rule_clf1.fit(X_val, Y_val)
rule_clf1_ypred = rule_clf1.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 1",
                 rule_clf1_ypred, Y_test_val)

# Second skope rules model
rule_clf2 = SkopeRules(n_estimators=50, precision_min=0.2, recall_min=0.2,
                       feature_names=feature_names)
rule_clf2.fit(X_val, Y_val)
rule_clf2_ypred = rule_clf2.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 2",
                 rule_clf2_ypred, Y_test_val)

# Third skope rules model
rule_clf3 = SkopeRules(n_estimators=25, precision_min=0.2, recall_min=0.2,