Exemplo n.º 1
0
    def _test_size(self):
        '''
        Report how the estimation error changes with the training-set size.

        For every size in 100, 200, ..., 900 the nc_rna data is loaded, a
        training subset of that size is taken, and an MS2 estimator built on
        LogisticRegression is fitted to it. The estimator is then applied to
        the test set after it has been passed through SetGen.with_pos_ratio
        with a fixed positive-class ratio of 0.05. The estimated distribution
        and its RMS error against the true distribution are printed for each
        size. A binary-class dataset is used for ease of interpretation.
        '''
        positive_ratio = 0.05

        for set_size in numpy.arange(100, 1000, 100):
            # NOTE(review): the dataset is reloaded on every iteration. This
            # is kept as-is because it is not visible from here whether the
            # helpers below modify the arrays in place.
            X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
            X_train, y_train = self.get_sub_set_with_size(
                [X_train_full, y_train_full], set_size)
            assert len(y_train) == set_size

            ms = MS2(LogisticRegression)
            ms.fit((X_train, y_train))

            X_test_new, y_test_new = SetGen.with_pos_ratio(
                (X_test, y_test), positive_ratio, pos_label=1)

            dist_true = DE.arrayToDist(y_test_new)
            dist_est = ms.predict(X_test_new)
            err = rms(dist_est, dist_true)

            # Single-argument parenthesised print behaves the same under the
            # Python 2 print statement and the Python 3 print function.
            print(dist_est)
            print("size: %d, err: %f" % (set_size, err))
Exemplo n.º 2
0
    def run_for_estimator(self, estimator, test_set):
        '''
        Score ``estimator`` on ``test_set`` and return the RMS error.

        ``test_set`` is unpacked into a pair ``(X, y)``. The estimator's
        prediction for ``X`` must have exactly two entries; it is compared
        against ``y`` with ``DE.rms`` and that value is returned.
        '''
        features, target = test_set
        predicted = estimator.predict(features)
        assert len(predicted) == 2
        return DE.rms(target, predicted)
Exemplo n.º 3
0
Arquivo: ac2.py Projeto: pyongjoo/ende
    def _test_ac_forest(self):
        '''
        Fit AC2 (on LogisticRegression) 500 times on sub-samples of a
        200-example rcv1 training subset and print the positive-class
        estimates obtained on a test set passed through
        SetGen.with_pos_ratio with r = 0.8.

        The list of all estimates is printed, followed by its mean and its
        median.
        '''
        X_train_full, y_train_full, X_test, y_test = rcv1_binary_reader.toNumpy()
        set_size = 200     # an arbitrary number
        X_train, y_train = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        assert len(y_train) == set_size

        train_set = (X_train, y_train)

        r = 0.8
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            (X_test, y_test), r, pos_label=1)

        # NOTE(review): the true distribution is computed but never compared
        # against the estimates below. The call is kept so the method still
        # exercises DE.arrayToDist as before; drop it if that is not needed.
        dist_true = DE.arrayToDist(y_test_new)

        pos = []
        for _ in range(500):
            # NOTE(review): the size argument here is 0.5, whereas the call
            # above passes an integer count -- presumably a fraction of the
            # training set; confirm against get_sub_set_with_size.
            train_sub = self.get_sub_set_with_size(train_set, 0.5)
            ac = AC2(LogisticRegression)
            ac.fit(train_sub)
            dist_est = ac.predict(X_test_new)
            pos.append(dist_est[1])

        # Single-argument parenthesised print behaves the same under the
        # Python 2 print statement and the Python 3 print function.
        print(pos)
        print(numpy.mean(pos))
        print(numpy.median(pos))
Exemplo n.º 4
0
    def test_class_ratio(self):
        '''
        Print the positive-class estimate of an MSHI estimator (built on
        LinearSVC) while the positive-class ratio of the test set is varied.

        The estimator is fitted once on a 1000-example subset of the news_20
        training data. The test set is then passed through
        SetGen.with_pos_ratio for each ratio produced by
        numpy.arange(0.05, 1.0, 0.05), and the requested ratio is printed
        next to the estimated positive share. A binary-class dataset is used
        for ease of interpretation.
        '''
        # Alternative dataset used previously: nc_rna_reader.toNumpy()
        X_train_full, y_train_full, X_test, y_test = news_20_reader.toNumpy()
        set_size = 1000
        X_train, y_train = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)

        train_set = (X_train, y_train)
        test_set_original = (X_test, y_test)

        ms = MSHI(LinearSVC)
        ms.fit(train_set)

        # Single-argument parenthesised print behaves the same under the
        # Python 2 print statement and the Python 3 print function.
        print('Done training')

        for r in numpy.arange(0.05, 1.0, 0.05):
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                test_set_original, r, pos_label=1)

            dist_true = DE.arrayToDist(y_test_new)
            dist_est = ms.predict(X_test_new)

            # NOTE(review): the error is computed but not reported; the call
            # is kept so the method still exercises rms() as before.
            err = rms(dist_est, dist_true)

            print("r: %f, pos: %f" % (r, dist_est[1]))
Exemplo n.º 5
0
Arquivo: cc2.py Projeto: pyongjoo/ende
    def test_ratio(self):
        '''
        Print true versus estimated positive share for a CC2 estimator
        (built on KNeighborsClassifier) while the positive-class ratio of the
        test set is varied.

        The estimator is fitted once on a 500-example subset of the data
        returned by synthetic_reader.toNumpy(0.3, n_class=2). For each ratio
        produced by numpy.arange(0.05, 1.0, 0.05) the test set is passed
        through SetGen.with_pos_ratio, and one tab-separated line
        "<true positive share>\t<estimated positive share>" is printed.
        '''
        # Alternative datasets used previously: nc_rna_reader.toNumpy() and
        # rcv1_binary_reader.toNumpy().
        X_train_full, y_train_full, X_test, y_test = synthetic_reader.toNumpy(0.3, n_class=2)
        set_size = 500     # an arbitrary number
        X_train, y_train = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        assert len(y_train) == set_size

        train_set = (X_train, y_train)
        test_set_original = (X_test, y_test)

        cc = CC2(KNeighborsClassifier)
        cc.fit(train_set)

        for r in numpy.arange(0.05, 1.0, 0.05):
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                test_set_original, r, pos_label=1)

            dist_true = DE.arrayToDist(y_test_new)
            dist_est = cc.predict(X_test_new)

            # NOTE(review): the error is computed but not reported; the call
            # is kept so the method still exercises rms() as before.
            err = rms(dist_est, dist_true)

            # Single-argument parenthesised print behaves the same under the
            # Python 2 print statement and the Python 3 print function.
            print("%f\t%f" % (dist_true[1], dist_est[1]))
Exemplo n.º 6
0
def adjust_count(y_pred, cm):
    '''
    Correct the positive share observed in ``y_pred`` using the rates taken
    from the confusion matrix ``cm`` and return a length-2 distribution.

    ``cm`` is normalised row-wise; entry [1, 1] is used as the true-positive
    rate and entry [0, 1] as the false-positive rate. The observed positive
    share is element 1 of ``DE.to_bin_dist(y_pred)``.

    Raises TooLittleDifferenceException when the two rates differ by less
    than 0.25. Otherwise the corrected pair [negative, positive] is clipped
    to the range [0, 1] and rescaled to sum to one.
    '''
    row_totals = cm.sum(1, dtype=numpy.float64)
    rates = cm / row_totals[:, numpy.newaxis]
    true_pos_rate = rates[1, 1]
    false_pos_rate = rates[0, 1]
    observed_pos = DE.to_bin_dist(y_pred)[1]

    rate_gap = true_pos_rate - false_pos_rate
    if rate_gap < .25:
        raise TooLittleDifferenceException

    corrected_pos = (observed_pos - false_pos_rate) / float(rate_gap)

    estimate = numpy.array([1 - corrected_pos, corrected_pos])
    # Clip both entries into [0, 1] before renormalising.
    estimate[estimate < 0] = 0
    estimate[estimate > 1] = 1
    return estimate / estimate.sum()
Exemplo n.º 7
0
Arquivo: ac2.py Projeto: pyongjoo/ende
    def adjust_count(self, y_pred, cm):
        '''
        Correct the positive share observed in ``y_pred`` using the rates
        taken from the confusion matrix ``cm`` and return a length-2
        distribution.

        ``cm`` is normalised row-wise; entry [1, 1] is used as the
        true-positive rate and entry [0, 1] as the false-positive rate. When
        the two rates are equal the positive share is set to 0.5 instead of
        dividing by zero. The pair [negative, positive] is clipped to
        [0, 1] only when ``self.cap`` is truthy, and is always divided by its
        sum before being returned.
        '''
        rates = cm / cm.sum(1, dtype=numpy.float64)[:, numpy.newaxis]
        true_pos_rate = rates[1, 1]
        false_pos_rate = rates[0, 1]
        observed_pos = DE.to_bin_dist(y_pred)[1]

        if true_pos_rate != false_pos_rate:
            corrected_pos = (observed_pos - false_pos_rate) / float(true_pos_rate - false_pos_rate)
        else:
            corrected_pos = .5

        estimate = numpy.array([1 - corrected_pos, corrected_pos])

        if self.cap:
            estimate[estimate < 0] = 0
            estimate[estimate > 1] = 1

        total = estimate.sum()
        return estimate / total
Exemplo n.º 8
0
Arquivo: it2.py Projeto: pyongjoo/ende
    def predict_binary(self, X_population, params):
        '''
        Refit the classifier ``self.itr_count`` times with a re-weighted
        positive class, then return ``DE.to_bin_dist`` of the final
        predictions for ``X_population``.

        ``params`` is unpacked as ``(clf, pos2neg, X, y)``. In each round the
        current classifier labels ``X_population``, the positive-to-negative
        ratio of those labels is computed, and a new ``self.base_clf_class``
        instance is fitted on ``(X, y)`` with class weight
        ``pos2neg / ratio`` for class 1 and 1.0 for class 0.
        '''
        clf, pos2neg, X, y = params
        fn_cost = 1.0

        for _ in range(self.itr_count):
            label_counts = Counter(clf.predict(X_population))

            # A prior of 1 is added to both counts so the ratio never divides
            # by zero. This is not in the original paper; it is added for a
            # fair comparison.
            current_ratio = (label_counts[1] + 1) / float(label_counts[0] + 1)
            fp_cost = pos2neg / current_ratio

            # A higher cost marks examples as more important, so they are
            # weighted more heavily in the refit.
            weights = {0: fn_cost, 1: fp_cost}
            clf = self.base_clf_class(class_weight=weights)
            clf.fit(X, y)

        return DE.to_bin_dist(clf.predict(X_population))
Exemplo n.º 9
0
Arquivo: cc2.py Projeto: pyongjoo/ende
 def predict_binary(self, X_population, params):
     '''
     Label ``X_population`` with the classifier passed as ``params`` and
     return ``DE.to_bin_dist`` of those labels (a length-2 array).
     '''
     labels = params.predict(X_population)
     return DE.to_bin_dist(labels)