Пример #1
0
 def test_ratio(self):
     dataset = rcv1_binary_reader.toNumpy()
     set_size = 100
     for set_size in numpy.arange(100, 200, 100):
         print "Training set size is", set_size
         self.run_ratio(dataset, set_size)
         print
Пример #2
0
def test_ratio():
    '''
    Hand an RCV1 training subset and a re-balanced test set to
    ``save_libsvm``.

    A training subset of ``set_size`` examples and a test subset requested
    at size 10000 are drawn with ``get_sub_set_with_size``.  The test subset
    is passed through ``SetGen.with_pos_ratio`` with positive ratio ``r``
    (label 1 is the positive class).  The names given to ``save_libsvm`` are
    ``rcv_train_<set_size>.libsvm`` and ``rcv_test_<r>.libsvm``.
    '''
    dataset = rcv1_binary_reader.toNumpy()
    set_size = 100

    X_train_full, y_train_full, X_test_full, y_test_full = dataset
    X_train, y_train = get_sub_set_with_size([X_train_full, y_train_full], set_size)
    assert(len(y_train) == set_size)
    X_test, y_test = get_sub_set_with_size([X_test_full, y_test_full], 10000)

    save_libsvm(X_train, y_train, 'rcv_train_%d.libsvm' % set_size)

    # Only one ratio is exported; iterate over np.arange(0.05, 1.0, 0.05)
    # here to export the whole range.
    r = 0.05
    # Generate a new test set with the desired positive proportion.
    X_test_new, y_test_new = SetGen.with_pos_ratio((X_test, y_test), r, pos_label=1)

    save_libsvm(X_test_new, y_test_new, 'rcv_test_%.2f.libsvm' % r)
Пример #3
0
    def test_ratio(self):
        '''
        Compare several competing methods changing the ratio of the positive
        class in the dataset. We use binary class dataset for the easy of
        interpretation.
        '''
        dataset = rcv1_binary_reader.toNumpy()
        set_size = 100

        X_train_full, y_train_full, X_test, y_test = dataset
        X_train, y_train = self.get_sub_set_with_size([X_train_full, y_train_full], set_size)
        assert(len(y_train) == set_size)

        train_set = (X_train, y_train)
        test_set_original = (X_test, y_test)

        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        p = Prior(clf)

        for r in np.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(test_set_original, r, pos_label=1)
            test_set = [X_test_new, y_test_new]

            true_pos = DE.arrayToDist(y_test_new)[1]

            p.fit(X_train, y_train, {-1:1-true_pos, 1:true_pos})
            y_pred = p.predict(X_test_new)
            cm = confusion_matrix(y_test_new, y_pred)
            acc = self.accuracy(cm)

            print r, acc
Пример #4
0
    def run_training_size(self, pos_ratio):
        X_train_full, y_train_full, X_test, y_test = rcv1_binary_reader.toNumpy()
        test_set_original = [X_test, y_test]
        X_test_new, y_test_new = SetGen.with_pos_ratio(test_set_original, pos_ratio, pos_label=1)
        test_set = [X_test_new, y_test_new]


        print "We compare performance as chaning the training set size."
        print "Positive class ratio is %f" % pos_ratio
        print "size\tcc\tac\tms\tra\trc\trb\trd"
        for set_size in (numpy.arange(60, 100, 10).tolist()
                + numpy.arange(100, 1100, 100).tolist()
                + [2000, 3000, 4000, 5000, 10000, 20000]):
            cc = CC2(LogisticRegression)
            ac = AC2(LogisticRegression)
            ms = MS2(LogisticRegression)
            ra = RA(LogisticRegression, ac_method = 'ac')
            rc = RA(LogisticRegression, ac_method = 'cac')
            rb = RA(LogisticRegression, ac_method = 'bac')
            rd = RA(LogisticRegression, ac_method = 'dac')

            ests = [cc, ac, ms, ra, rc, rb, rd]

            X_train_sub, y_train_sub = self.get_sub_set_with_size(
                    [X_train_full, y_train_full], set_size)
            train_set = [X_train_sub, y_train_sub]
            map(lambda e: e.fit(train_set), ests)

            errs = map(lambda e: self.run_for_estimator(e, test_set), ests)
            print ("%d" + "\t%.4f" * 7) % (set_size, errs[0], errs[1], errs[2],
                    errs[3], errs[4], errs[5], errs[6])
Пример #5
0
    def _test_ac_forest(self):
        X_train_full, y_train_full, X_test, y_test = rcv1_binary_reader.toNumpy()
        set_size = 200     # an arbitrary number
        X_train, y_train= self.get_sub_set_with_size([X_train_full, y_train_full], set_size)
        assert(len(y_train) == set_size)

        train_set = (X_train, y_train)
        test_set_original = (X_test, y_test)

        r = 0.8
        X_test_new, y_test_new = SetGen.with_pos_ratio(test_set_original, r, pos_label=1)
        test_set = [X_test_new, y_test_new]

        dist_true = DE.arrayToDist(y_test_new)

        pos = []
        for i in range(500):
            train_sub = self.get_sub_set_with_size(train_set, 0.5)
            ac = AC2(LogisticRegression)
            ac.fit(train_sub)
            dist_est = ac.predict(X_test_new)
            pos.append(dist_est[1])

        print pos
        print numpy.mean(pos)
        print numpy.median(pos)
Пример #6
0
    def _test_random_forest_based(self):
        dataset = rcv1_binary_reader.toNumpy()
        n_feature = 5000

        print "Dataset: RCV1, Classifier: Random Forest"
        print
        #self.run_ratio(RFWrapper, dataset, n_feature)
        self.run_size(RFWrapper, dataset, n_feature)
Пример #7
0
    def _test_rcv1_binary_chainging_size(self):
        X_train, y_train, X_test, y_test = rcv1_binary_reader.toNumpy()
        train_set = (X_train, y_train)

        #for size in numpy.arange(1000, 10000, 1000):
        for size in numpy.arange(100, 1000, 100):
            X_sub, y_sub = self.get_sub_set_with_size(train_set, size)
            clf = LinearSVC()
            clf.fit(X_sub, y_sub)
            print "size: %d, accuracy: %f" % (size, clf.score(X_test, y_test))
Пример #8
0
    def _test_rcv1_binary_dataset(self):
        X_train, y_train, X_test, y_test = rcv1_binary_reader.toNumpy()

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        print "logistic score", clf.score(X_test, y_test)

        clf = LinearSVC()
        clf.fit(X_train, y_train)
        print "svm score", clf.score(X_test, y_test)
Пример #9
0
    def test_ratio(self):
        '''
        Compare several competing methods changing the ratio of the positive
        class in the dataset. We use binary class dataset for the easy of
        interpretation.
        '''
        dataset = rcv1_binary_reader.toNumpy()
        set_size = 100

        X_train_full, y_train_full, X_test_full, y_test_full = dataset
        X_train, y_train = self.get_sub_set_with_size([X_train_full, y_train_full], set_size)
        assert(len(y_train) == set_size)
        X_test, y_test = self.get_sub_set_with_size([X_test_full, y_test_full], 10000)

        train_set = (X_train, y_train)
        test_set_original = (X_test, y_test)

        rfw = RFWeights()
        svmw = SVMWeights()

        rf = RandomForestClassifier(n_estimators=400)
        svm = LinearSVC()

        rf.fit(X_train.toarray(), y_train)
        svm.fit(X_train, y_train)

        print "Ratio\tSVM\tSVMW\tRF\tRFW"
        for r in np.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(test_set_original, r, pos_label=1)
            test_set = [X_test_new, y_test_new]

            true_pos = DE.to_bin_dist(y_test_new)[1]
            new_class_dist = {0:1-true_pos, 1:true_pos}

            rfw.fit(X_train, y_train, new_class_dist)
            svmw.fit(X_train, y_train, new_class_dist)

            svm_pred = svm.predict(X_test_new)
            svmw_pred = svmw.predict(X_test_new)
            rf_pred = rf.predict(X_test_new.toarray())
            rfw_pred = rfw.predict(X_test_new.toarray())

            preds = [svm_pred, svmw_pred, rf_pred, rfw_pred]
            pos_ratios = map(lambda x: DE.to_bin_dist(x)[1], preds)

            print ("%.2f" + "\t%.2f" * len(pos_ratios)) % tuple([r] + pos_ratios)
Пример #10
0
    def _test_avg(self):
        dataset = rcv1_binary_reader.toNumpy()
        train_set_size = 300

        X_train_full, y_train_full, X_test_full, y_test_full = dataset
        X_train, y_train = self.get_sub_set_with_size([X_train_full, y_train_full], train_set_size)
        X_test, y_test = self.get_sub_set_with_size([X_test_full, y_test_full], 10000)

        train_set = (X_train, y_train)
        test_set_original = (X_test, y_test)

        clf_class = LogisticRegression

        for split_r in numpy.arange(0.1, 1.0, 0.1):
            ra = RA(clf_class, ac_method = 'ac', subsample_count = 200, split_r=split_r)
            ra.fit(train_set)
            err = self.compute_avg_error(ra, test_set_original)
            print split_r, err
Пример #11
0
 def test_ratio(self):
     '''
     Call ``self.run_ratio`` for every set size in
     ``numpy.arange(50, 200, 10)``, printing a blank line after each run.
     '''
     rcv1 = rcv1_binary_reader.toNumpy()
     sizes = numpy.arange(50, 200, 10)
     for n_train in sizes:
         self.run_ratio(rcv1, n_train)
         print
Пример #12
0
    def _test_random_forest_based(self):
        dataset = rcv1_binary_reader.toNumpy()
        n_feature = 5000

        print "RCV1 with RF"
        self.run_test_with(dataset, RFWrapper, n_feature, dense=True)
Пример #13
0
    def test_svm_based4(self):
        dataset = rcv1_binary_reader.toNumpy()
        n_feature = 5000

        print "RCV1 with SVM"
        self.run_test_with(dataset, LinearSVC, n_feature)
Пример #14
0
 def _test_rf_based(self):
     print "Compare RCV1 with RF"
     dataset = rcv1_binary_reader.toNumpy()
     n_feature = 5000
     self.run_test_with(dataset, self.compare_rf_based, n_feature)