예제 #1
0
def test_deduplication_works():
    """Smoke-test SkopeRules with ``max_depth_duplication`` enabled.

    Fits the estimator on a toy data set whose last two samples are
    outliers, then calls every scoring/prediction method once. Nothing is
    asserted on the returned values; the test only checks that each call
    completes without raising.
    """
    # Six inliers around the origin; two outliers are appended to each set.
    inliers = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    X = inliers + [[6, 3], [4, -7]]
    y = [0] * len(inliers) + [1] * 2
    X_test = inliers + [[10, 5], [5, -7]]

    model = SkopeRules(random_state=rng, max_samples=1.,
                       max_depth_duplication=3)
    model.fit(X, y)

    # Return values are intentionally discarded (see docstring).
    model.decision_function(X_test)
    model.rules_vote(X_test)
    model.score_top_rules(X_test)
    model.predict(X_test)
    model.predict_top_rules(X_test, 1)
예제 #2
0
def test_skope_rules_works():
    """Check that SkopeRules separates the two toy outliers from the inliers.

    For each of the three scoring methods, the lowest score among the two
    outliers must exceed the highest score among the six inliers, and
    ``predict`` must label exactly the last two test samples as positive.
    """
    # Six inliers around the origin; two outliers are appended to each set.
    inliers = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    X = inliers + [[6, 3], [4, -7]]
    y = [0] * len(inliers) + [1] * 2
    X_test = inliers + [[10, 5], [5, -7]]

    model = SkopeRules(random_state=rng, max_samples=1.)
    model.fit(X, y)

    # Evaluated in the same order as the assertions below are checked.
    score_arrays = [
        model.decision_function(X_test),
        model.rules_vote(X_test),
        model.separate_rules_score(X_test),
    ]
    labels = model.predict(X_test)

    # Outliers (last two rows) must outrank every inlier for each score.
    for scores in score_arrays:
        assert_greater(np.min(scores[-2:]), np.max(scores[:-2]))
    assert_array_equal(labels, [0] * len(inliers) + [1] * 2)
예제 #3
0
def OverSampleKfold(data, k=10):
    """Cross-validate four classifiers on SMOTE-oversampled training folds.

    Columns 0-17 of ``data`` are used as features and column 18 as the
    label (presumably binary 0/1, since positives are counted with
    ``np.sum`` and scored with ``pos_label=1`` -- confirm against caller).
    For each of the ``k`` shuffled folds, the training part is oversampled
    with SMOTE; a k-nearest-neighbours classifier, a decision tree, a
    Gaussian naive Bayes model and SkopeRules are then fitted on it and
    scored on the untouched test part.

    Args:
        data: Object providing ``to_numpy()`` and ``columns`` (e.g. a
            pandas DataFrame).
        k: Number of folds handed to ``KFold``.

    Returns:
        Tuple ``(accuracy, recall, auc)``. Each element is a dict keyed by
        ``'neigh'``, ``'tree'``, ``'naive'`` and ``'rule'`` that holds one
        score per fold.
    """

    def positive_proba(model, samples):
        # Second column of predict_proba is used as the ranking score.
        return model.predict_proba(samples)[:, 1]

    def vote_count(model, samples):
        # SkopeRules is ranked by its rules_vote output instead.
        return model.rules_vote(samples)

    def make_rules():
        return SkopeRules(feature_names=data.columns.to_list()[0:18])

    # (result key, model factory, ranking-score function), in run order.
    candidates = (
        ('neigh', KNeighborsClassifier, positive_proba),
        ('tree', DecisionTreeClassifier, positive_proba),
        ('naive', GaussianNB, positive_proba),
        ('rule', make_rules, vote_count),
    )

    values = data.to_numpy()
    X, y = values[:, 0:18], values[:, 18]
    sm = SMOTE()
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {key: [] for key, _, _ in candidates}
    recall = {key: [] for key, _, _ in candidates}
    auc = {key: [] for key, _, _ in candidates}

    for train_index, test_index in kf.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]  # held-out fold
        n_pos = np.sum(y_train)
        print('num of pos:', n_pos, ', num of neg:', y_train.size - n_pos)

        # Only the training part is oversampled; the test part stays as is.
        X_over, y_over = sm.fit_resample(X_train, y_train)
        print('oversample:', X_over.shape, y_over.shape)
        print('---------------------------------------')

        for position, (key, build, score_fn) in enumerate(candidates):
            if position:
                # Separator is printed between models, not before the first.
                print('---------')
            model = build()
            model.fit(X_over, y_over)
            y_pred = model.predict(X_test)
            y_score = score_fn(model, X_test)
            recall[key].append(
                recall_score(y_test, y_pred, labels=None, pos_label=1))
            accuracy[key].append(
                accuracy_score(y_test, y_pred, normalize=True,
                               sample_weight=None))
            auc[key].append(roc_auc_score(y_test, y_score))

    return accuracy, recall, auc
예제 #4
0
def BalanceSampleKfold(data, k=10):
    """Cross-validate four classifiers on under- then over-sampled folds.

    Columns 0-17 of ``data`` are used as features and column 18 as the
    label (presumably binary 0/1, since positives are counted with
    ``np.sum`` and scored with ``pos_label=1`` -- confirm against caller).
    For each of the ``k`` shuffled folds, the training part is first
    randomly undersampled with ``sampling_strategy=0.20`` (negatives cut
    to 5x the positives) and then oversampled with SMOTE. A
    k-nearest-neighbours classifier, a decision tree, a Gaussian naive
    Bayes model and SkopeRules are fitted on the balanced data and scored
    on the untouched test part.

    Args:
        data: Object providing ``to_numpy()`` and ``columns`` (e.g. a
            pandas DataFrame).
        k: Number of folds handed to ``KFold``.

    Returns:
        Tuple ``(accuracy, recall, auc)``. Each element is a dict keyed by
        ``'neigh'``, ``'tree'``, ``'naive'`` and ``'rule'`` that holds one
        score per fold.
    """

    def positive_proba(model, samples):
        # Second column of predict_proba is used as the ranking score.
        return model.predict_proba(samples)[:, 1]

    def vote_count(model, samples):
        # SkopeRules is ranked by its rules_vote output instead.
        return model.rules_vote(samples)

    def report_class_counts(labels):
        n_pos = np.sum(labels)
        print('num of pos:', n_pos, ', num of neg:', labels.size - n_pos)

    # Loop-invariant, so looked up once instead of once per fold.
    feature_names = data.columns.to_list()[0:18]

    def make_rules():
        return SkopeRules(feature_names=feature_names)

    # (result key, model factory, ranking-score function), in run order.
    candidates = (
        ('neigh', KNeighborsClassifier, positive_proba),
        ('tree', DecisionTreeClassifier, positive_proba),
        ('naive', GaussianNB, positive_proba),
        ('rule', make_rules, vote_count),
    )

    values = data.to_numpy()
    X, y = values[:, 0:18], values[:, 18]
    rus_balance = RandomUnderSampler(
        sampling_strategy=0.20)  # truncate neg to 5*#pos
    sm_balance = SMOTE()  # then oversample pos
    # Honour the caller's fold count; this used to be hard-coded to 10.
    kf = KFold(n_splits=k, shuffle=True)

    accuracy = {key: [] for key, _, _ in candidates}
    recall = {key: [] for key, _, _ in candidates}
    auc = {key: [] for key, _, _ in candidates}

    for train_index, test_index in kf.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]  # held-out fold
        report_class_counts(y_train)

        # Only the training part is rebalanced; the test part stays as is.
        X_bal, y_bal = rus_balance.fit_resample(X_train, y_train)
        print('1.under:')
        report_class_counts(y_bal)
        X_bal, y_bal = sm_balance.fit_resample(X_bal, y_bal)
        print('2.over:')
        report_class_counts(y_bal)
        print('---------------------------------------')

        for position, (key, build, score_fn) in enumerate(candidates):
            if position:
                # Separator is printed between models, not before the first.
                print('---------')
            model = build()
            model.fit(X_bal, y_bal)
            y_pred = model.predict(X_test)
            y_score = score_fn(model, X_test)
            recall[key].append(
                recall_score(y_test, y_pred, labels=None, pos_label=1))
            accuracy[key].append(
                accuracy_score(y_test, y_pred, normalize=True,
                               sample_weight=None))
            auc[key].append(roc_auc_score(y_test, y_score))

    return accuracy, recall, auc