Пример #1
0
    def _test_size(self):
        '''
        Report the MS2 estimation error for a range of training-set sizes.

        For every size produced by numpy.arange(100, 1000, 100) a training
        subset of that size is drawn, an MS2 estimator wrapping
        LogisticRegression is fitted on it, and its prediction on a test set
        generated with a positive ratio of 0.05 is scored with rms() against
        the value DE.arrayToDist() derives from the generated labels.
        The estimate and the error are printed for each size.
        '''
        pos_ratio = 0.05
        for set_size in numpy.arange(100, 1000, 100):
            # NOTE(review): the data is re-read on every iteration; hoisting
            # the call is only safe if toNumpy() is deterministic -- confirm.
            X_full, y_full, X_test, y_test = nc_rna_reader.toNumpy()
            X_train, y_train = self.get_sub_set_with_size(
                    [X_full, y_full], set_size)
            assert(len(y_train) == set_size)

            estimator = MS2(LogisticRegression)
            estimator.fit((X_train, y_train))

            X_eval, y_eval = SetGen.with_pos_ratio(
                    (X_test, y_test), pos_ratio, pos_label=1)

            expected = DE.arrayToDist(y_eval)
            estimated = estimator.predict(X_eval)
            err = rms(estimated, expected)

            print(estimated)
            print("size: %d, err: %f" % (set_size, err))
Пример #2
0
    def test_class_ratio(self):
        '''
        Compare four estimators (CC2, AC2, MS2, EN2) across positive ratios.

        All estimators wrap LogisticRegression and are fitted once on a
        400-sample training subset of the data returned by
        nc_rna_reader.toNumpy(). A test set is then generated for every ratio
        from 0.05 up to (excluding) 1.0 in steps of 0.05, and one
        tab-separated row of errors is printed per ratio.
        '''
        set_size = 400     # an arbitrary number
        X_full, y_full, X_test, y_test = nc_rna_reader.toNumpy()
        X_train, y_train = self.get_sub_set_with_size(
                [X_full, y_full], set_size)
        assert(len(y_train) == set_size)

        train_set = (X_train, y_train)
        original_test_set = (X_test, y_test)

        # Column order of the printed table follows this list.
        estimators = [
            CC2(LogisticRegression),
            AC2(LogisticRegression),
            MS2(LogisticRegression),
            EN2(LogisticRegression),
        ]

        print("We compare the performance as changing the positive class ratio.")
        print("The training set size is %d" % set_size)

        print("Training classifiers")
        for estimator in estimators:
            estimator.fit(train_set)

        print("ratio\tcc\tac\tms\ten")
        for r in numpy.arange(0.05, 1.0, 0.05):
            X_eval, y_eval = SetGen.with_pos_ratio(original_test_set, r, pos_label=1)
            test_set = [X_eval, y_eval]
            errs = [self.run_for_estimator(estimator, test_set)
                    for estimator in estimators]
            print(("%.2f" + "\t%.4f" * 4) % (r, errs[0], errs[1], errs[2], errs[3]))
Пример #3
0
    def compare_based(self, clf_class, data_set):
        """
        Fit five distribution estimators and tabulate their errors per ratio.

        @param clf_class classifier class (or factory) handed to every
        estimator constructor.
        @param data_set sequence unpacked as
        (X_train, y_train, X_test, y_test).
        @return numpy array with one row per positive ratio (0.2 up to,
        excluding, 1.0 in steps of 0.2) and one column per estimator in the
        order CC2, AC2, MS2, Itr2, BAC.
        """
        X_train, y_train, X_test, y_test = data_set
        train_set = [X_train, y_train]
        full_test_set = [X_test, y_test]

        estimators = [
            CC2(clf_class),
            AC2(clf_class),
            MS2(clf_class),
            Itr2(clf_class, itr_count = 2),
            BAC(clf_class, subsample_count = 200),
        ]

        for estimator in estimators:
            estimator.fit(train_set)

        rows = []
        for r in np.arange(0.2, 1.0, 0.2):
            # Generate a new test set with the desired positive proportion.
            test_set = SetGen.with_pos_ratio(full_test_set, r, pos_label=1)
            rows.append([self.run_for_estimator(estimator, test_set)
                         for estimator in estimators])

        error_matrix = np.array(rows)
        # Per-estimator averages; computed but not returned or printed.
        avg_errs = error_matrix.mean(axis=0).tolist()

        return error_matrix
Пример #4
0
    def _test_ac_forest(self):
        """
        Print AC2 positive-class estimates obtained from 500 refits.

        A 200-sample training subset is drawn from the data returned by
        rcv1_binary_reader.toNumpy() and a test set is generated with a
        positive ratio of 0.8. In each of 500 rounds a fresh AC2 estimator
        wrapping LogisticRegression is fitted on a sub-sample of the training
        subset, and element 1 of its prediction is recorded. The recorded
        values, their mean and their median are printed.
        """
        set_size = 200     # an arbitrary number
        X_full, y_full, X_test, y_test = rcv1_binary_reader.toNumpy()
        X_train, y_train = self.get_sub_set_with_size([X_full, y_full], set_size)
        assert(len(y_train) == set_size)

        train_set = (X_train, y_train)

        r = 0.8
        X_eval, y_eval = SetGen.with_pos_ratio((X_test, y_test), r, pos_label=1)

        # The true distribution is computed but not consumed below.
        dist_true = DE.arrayToDist(y_eval)

        pos = []
        for _ in range(500):
            # NOTE(review): 0.5 is passed as the size here; presumably it is
            # interpreted as a fraction of train_set -- confirm against
            # get_sub_set_with_size().
            train_sub = self.get_sub_set_with_size(train_set, 0.5)
            ac = AC2(LogisticRegression)
            ac.fit(train_sub)
            pos.append(ac.predict(X_eval)[1])

        print(pos)
        print(numpy.mean(pos))
        print(numpy.median(pos))
Пример #5
0
    def run_ratio(self, dataset, set_size):
        '''
        Compare CC2, AC2 and BAC while the positive-class ratio of the test
        set changes. We use binary class dataset for the ease of
        interpretation.

        All estimators wrap SyntheticClassifier and are fitted once on a
        training subset; a test set is then generated for every ratio from
        0.05 up to (excluding) 1.0 in steps of 0.05 and one tab-separated row
        of errors is printed per ratio.

        @param dataset sequence unpacked as
        (X_train_full, y_train_full, X_test, y_test).
        @param set_size size handed to get_sub_set_with_size() when drawing
        the training subset.
        '''
        X_train_full, y_train_full, X_test, y_test = dataset
        X_train, y_train = self.get_sub_set_with_size(
                [X_train_full, y_train_full], set_size)

        train_set = (X_train, y_train)
        test_set_original = (X_test, y_test)

        clf_class = SyntheticClassifier

        # Column order of the printed table follows this list.
        ests = [
            CC2(clf_class),
            AC2(clf_class),
            BAC(clf_class, subsample_count = 200),
        ]

        print("We compare the performance as changing the positive class ratio.")
        print("The training set size is %d" % set_size)

        print("Training classifiers")
        for est in ests:
            est.fit(train_set)

        # The header lists exactly the estimators in `ests`; the former
        # trailing "it" column had no matching value in the rows.
        print("ratio\tcc\tac\tbac")
        for r in numpy.arange(0.05, 1.0, 0.05):
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                    test_set_original, r, pos_label=1)
            test_set = [X_test_new, y_test_new]
            errs = [self.run_for_estimator(est, test_set) for est in ests]
            print(("%.2f" + "\t%.4f" * len(ests)) % tuple([r] + errs))
Пример #6
0
    def test_ratio(self):
        """
        Print true versus CC2-estimated positive values across ratios.

        A CC2 estimator wrapping KNeighborsClassifier is fitted on a
        500-sample training subset of the data returned by
        synthetic_reader.toNumpy(0.3, n_class=2). For every ratio from 0.05 up
        to (excluding) 1.0 in steps of 0.05 a test set is generated and
        element 1 of the true and of the estimated distribution are printed,
        separated by a tab.
        """
        set_size = 500     # an arbitrary number
        X_full, y_full, X_test, y_test = synthetic_reader.toNumpy(0.3, n_class=2)
        X_train, y_train = self.get_sub_set_with_size([X_full, y_full], set_size)
        assert(len(y_train) == set_size)

        original_test_set = (X_test, y_test)

        cc = CC2(KNeighborsClassifier)
        cc.fit((X_train, y_train))

        for r in numpy.arange(0.05, 1.0, 0.05):
            X_eval, y_eval = SetGen.with_pos_ratio(original_test_set, r, pos_label=1)

            dist_true = DE.arrayToDist(y_eval)
            dist_est = cc.predict(X_eval)

            # The error is computed but not reported by this method.
            err = rms(dist_est, dist_true)

            print("%f\t%f" % (dist_true[1], dist_est[1]))
Пример #7
0
    def test_class_ratio(self):
        '''
        Print the MSHI positive-class estimate for a sweep of test ratios.

        An MSHI estimator wrapping LinearSVC is fitted on a 1000-sample
        training subset of the data returned by news_20_reader.toNumpy().
        For every ratio from 0.05 up to (excluding) 1.0 in steps of 0.05 a
        test set is generated and the requested ratio is printed next to
        element 1 of the estimated distribution.
        '''
        set_size = 1000
        X_full, y_full, X_test, y_test = news_20_reader.toNumpy()
        X_train, y_train = self.get_sub_set_with_size([X_full, y_full], set_size)

        original_test_set = (X_test, y_test)

        estimator = MSHI(LinearSVC)
        estimator.fit((X_train, y_train))

        print('Done training')

        for r in numpy.arange(0.05, 1.0, 0.05):
            X_eval, y_eval = SetGen.with_pos_ratio(original_test_set, r, pos_label=1)

            dist_true = DE.arrayToDist(y_eval)
            dist_est = estimator.predict(X_eval)

            # The error is computed but not reported by this method.
            err = rms(dist_est, dist_true)

            print("r: %f, pos: %f" % (r, dist_est[1]))
Пример #8
0
    def run_training_size(self, pos_ratio):
        """
        Compare seven estimators while the training-set size grows.

        The test set is generated once from the data returned by
        rcv1_binary_reader.toNumpy() with the requested positive ratio. For
        every training size a fresh set of estimators (CC2, AC2, MS2 and four
        RA variants, all wrapping LogisticRegression) is fitted on a subset
        of that size, and one tab-separated row of errors is printed.

        @param pos_ratio positive-class ratio used to generate the test set.
        """
        X_train_full, y_train_full, X_test, y_test = rcv1_binary_reader.toNumpy()
        X_eval, y_eval = SetGen.with_pos_ratio([X_test, y_test], pos_ratio, pos_label=1)
        test_set = [X_eval, y_eval]

        # 60..90 in steps of 10, 100..1000 in steps of 100, then larger sizes.
        sizes = numpy.arange(60, 100, 10).tolist()
        sizes += numpy.arange(100, 1100, 100).tolist()
        sizes += [2000, 3000, 4000, 5000, 10000, 20000]

        print("We compare performance as chaning the training set size.")
        print("Positive class ratio is %f" % pos_ratio)
        print("size\tcc\tac\tms\tra\trc\trb\trd")
        for set_size in sizes:
            # Estimators are rebuilt for every size; order matches the header.
            estimators = [
                CC2(LogisticRegression),
                AC2(LogisticRegression),
                MS2(LogisticRegression),
                RA(LogisticRegression, ac_method = 'ac'),
                RA(LogisticRegression, ac_method = 'cac'),
                RA(LogisticRegression, ac_method = 'bac'),
                RA(LogisticRegression, ac_method = 'dac'),
            ]

            X_train_sub, y_train_sub = self.get_sub_set_with_size(
                    [X_train_full, y_train_full], set_size)
            train_set = [X_train_sub, y_train_sub]
            for estimator in estimators:
                estimator.fit(train_set)

            errs = [self.run_for_estimator(estimator, test_set)
                    for estimator in estimators]
            print(("%d" + "\t%.4f" * 7) % tuple([set_size] + errs))
Пример #9
0
    def test(self):
        """
        Fit Itr2 on the news_20_reader data and print its estimate.

        The test portion is first regenerated with a positive ratio of 0.50;
        the estimator wraps LogisticRegression and receives 2 as its second
        constructor argument.
        """
        X_train, y_train, X_test_full, y_test_full = news_20_reader.toNumpy()
        X_test, y_test = SetGen.with_pos_ratio(
                [X_test_full, y_test_full], 0.50, pos_label=1)

        estimator = Itr2(LogisticRegression, 2)
        estimator.fit([X_train, y_train])

        print(estimator.predict(X_test))
Пример #10
0
    def _test_debug(self):
        """
        Run a debug-enabled EN estimator once on the snippet_reader data.

        Uses a 500-sample training subset and a test set generated with a
        positive ratio of 0.5; both are handed to run_for_estimator() with
        debug=True. Nothing is returned or printed by this method itself.
        """
        set_size = 500     # an arbitrary number
        X_full, y_full, X_test, y_test = snippet_reader.toNumpy()
        X_train, y_train = self.get_sub_set_with_size([X_full, y_full], set_size)
        assert(len(y_train) == set_size)

        train_set = (X_train, y_train)
        X_eval, y_eval = SetGen.with_pos_ratio((X_test, y_test), 0.5, pos_label=1)
        test_set = [X_eval, y_eval]

        estimator = EN(LogisticRegression, debug=True)
        self.run_for_estimator(estimator, train_set, test_set, debug=True)
Пример #11
0
    def _test_debug(self):
        """
        Run a debug-enabled EN estimator once on the nc_rna_reader data.

        Uses a 300-sample training subset and a test set generated with a
        positive ratio of 0.05, then prints the value returned by
        run_for_estimator().
        """
        set_size = 300  # an arbitrary number
        X_full, y_full, X_test, y_test = nc_rna_reader.toNumpy()
        X_train, y_train = self.get_sub_set_with_size([X_full, y_full], set_size)
        assert len(y_train) == set_size

        train_set = (X_train, y_train)
        X_eval, y_eval = SetGen.with_pos_ratio((X_test, y_test), 0.05, pos_label=1)
        test_set = [X_eval, y_eval]

        # A CC estimator is constructed here but not exercised below.
        cc = CC(LogisticRegression)
        estimator = EN(LogisticRegression, debug=True)

        err = self.run_for_estimator(estimator, train_set, test_set, debug=True)
        print(" ".join(("err", str(err))))
Пример #12
0
    def compute_avg_error(self, estimator, test_set, pos_label=1):
        """
        Average the estimator's error over a sweep of positive-class ratios.

        @param estimator a class distribution estimator; it is handed to
        run_for_estimator() unchanged (presumably already fitted -- confirm
        with callers).
        @param test_set the test data from which SetGen.with_pos_ratio()
        generates a new (X, y) pair for every ratio.
        @param pos_label label treated as the positive class when the
        per-ratio test sets are generated.
        @return numpy.mean of the errors collected for the ratios from 0.05
        up to (excluding) 1.0 in steps of 0.05.
        """
        errors = []

        # run changing ratio of positive class
        for r in numpy.arange(0.05, 1.0, 0.05):
            # Forward the caller's pos_label instead of a hard-coded 1.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                    test_set, r, pos_label=pos_label)
            test_set_new = [X_test_new, y_test_new]
            errors.append(self.run_for_estimator(estimator, test_set_new))

        return numpy.mean(errors)
Пример #13
0
    def run_ratio(self, dataset, set_size):
        '''
        Compare CC2, AC2, MS2 and RA3 while the class ratio of the test set
        changes.

        Every estimator builds its classifier through a factory returning a
        500-tree RandomForestClassifier. The estimators are fitted once on a
        training subset; a test set is then generated for every ratio from
        0.05 up to (excluding) 1.0 in steps of 0.05 and one tab-separated row
        of errors is printed per ratio.

        @param dataset sequence unpacked as
        (X_train_full, y_train_full, X_test_full, y_test_full).
        @param set_size number of training samples to draw; asserted against
        the length of the drawn labels.
        '''
        X_train_full, y_train_full, X_test_full, y_test_full = dataset
        X_train, y_train = self.get_sub_set_with_size(
                [X_train_full, y_train_full], set_size)
        assert(len(y_train) == set_size)

        train_set = (X_train, y_train)
        original_test_set = (X_test_full, y_test_full)

        def rf_factory():
            return RandomForestClassifier(n_estimators=500)

        clf_class = rf_factory

        cc = CC2(clf_class)
        ac = AC2(clf_class)
        ms = MS2(clf_class)
        # `it` and `ra` are constructed but left out of the comparison.
        it = Itr2(clf_class, itr_count = 2)
        ra = RA(clf_class, ac_method = 'ac', subsample_count = 500)
        ra3 = RA3(clf_class, subsample_count = 500)

        estimators = [cc, ac, ms, ra3]

        print("We compare the performance as changing the positive class ratio.")
        print("The training set size is %d" % set_size)

        print("Training classifiers")
        for estimator in estimators:
            estimator.fit(train_set)

        print("ratio\tcc\tac\tms\tra3")
        for r in numpy.arange(0.05, 1.0, 0.05):
            # NOTE(review): pos_label=0 is passed here, so r applies to label
            # 0 -- confirm this is intended.
            X_eval, y_eval = SetGen.with_pos_ratio(original_test_set, r, pos_label=0)
            test_set = [X_eval, y_eval]
            errs = [self.run_for_estimator(estimator, test_set)
                    for estimator in estimators]
            print(("%.2f" + "\t%.4f" * len(estimators)) % tuple([r] + errs))
Пример #14
0
    def run_ratio(self, dataset, set_size):
        '''
        Compare CC2, AC2, Itr2 and RA3 while the positive-class ratio of the
        test set changes.

        All estimators wrap LinearSVC and are fitted once on a training
        subset; a test set is then generated for every ratio from 0.05 up to
        (excluding) 1.0 in steps of 0.05 and one tab-separated row of errors
        is printed per ratio.

        @param dataset sequence unpacked as
        (X_train_full, y_train_full, X_test, y_test).
        @param set_size number of training samples to draw; asserted against
        the length of the drawn labels.
        '''
        X_train_full, y_train_full, X_test, y_test = dataset
        X_train, y_train = self.get_sub_set_with_size(
                [X_train_full, y_train_full], set_size)
        assert(len(y_train) == set_size)

        train_set = (X_train, y_train)
        original_test_set = (X_test, y_test)

        clf_class = LinearSVC

        cc = CC2(clf_class)
        ac = AC2(clf_class)
        # `ms` and `ra` are constructed but left out of the comparison.
        ms = MS2(clf_class)
        it = Itr2(clf_class, itr_count = 2)
        ra = RA(clf_class, ac_method = 'ac', subsample_count = 400)
        ra3 = RA3(clf_class, split_r = 0.4, subsample_count = 1000)

        estimators = [cc, ac, it, ra3]

        print("We compare the performance as changing the positive class ratio.")
        print("The training set size is %d" % set_size)

        print("Training classifiers")
        for estimator in estimators:
            estimator.fit(train_set)

        print("ratio\tcc\tac\tit\tra3")
        for r in numpy.arange(0.05, 1.0, 0.05):
            X_eval, y_eval = SetGen.with_pos_ratio(original_test_set, r, pos_label=1)
            test_set = [X_eval, y_eval]
            errs = [self.run_for_estimator(estimator, test_set)
                    for estimator in estimators]
            print(("%.2f" + "\t%.4f" * len(estimators)) % tuple([r] + errs))
Пример #15
0
    def _test_rna_change_training_size(self):
        '''
        Compare CC2, AC2, MS2 and EN2 on the ncRNA dataset while the
        training-set size changes.

        The test set is generated once with a positive ratio of 0.8. For
        every training size a fresh set of estimators wrapping
        LogisticRegression is fitted on a subset of that size, and one
        tab-separated row (size followed by the four errors) is printed.

        The dataset is from
            Andrew V Uzilov, Joshua M Keegan, and David H Mathews.
            Detection of non-coding RNAs on the basis of predicted secondary
            structure formation free energy change.
            BMC Bioinformatics, 7(173), 2006.
        '''
        X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
        test_set_original = [X_test, y_test]
        pos_ratio = 0.8     # arbitrary ratio
        X_test_new, y_test_new = SetGen.with_pos_ratio(
                test_set_original, pos_ratio, pos_label=1)
        test_set = [X_test_new, y_test_new]

        print("We compare performance as chaning the training set size.")
        print("Positive class ratio is %f" % pos_ratio)
        # The first column of every row is the training-set size, so the
        # header labels it "size" rather than "ratio".
        print("size\tcc\tac\tms\ten")
        for set_size in [800, 900, 1000, 1500, 2000, 2500, 3000]:
            # Estimators are rebuilt for every size; order matches the header.
            ests = [
                CC2(LogisticRegression),
                AC2(LogisticRegression),
                MS2(LogisticRegression),
                EN2(LogisticRegression),
            ]

            X_train_sub, y_train_sub = self.get_sub_set_with_size(
                    [X_train_full, y_train_full], set_size)
            train_set = [X_train_sub, y_train_sub]
            for est in ests:
                est.fit(train_set)

            errs = [self.run_for_estimator(est, test_set) for est in ests]
            print(("%d" + "\t%.4f" * 4) % (set_size, errs[0], errs[1], errs[2], errs[3]))
Пример #16
0
    def _test_change_training_size(self):
        '''
        Compare CC, AC, MS, Itr and EN while the training-set size changes.

        The data comes from snippet_reader.toNumpy(); the test set is
        generated once with a positive ratio of 0.7. The five estimators
        (all wrapping LogisticRegression) are created once. For every
        training size, en.find_hyperparameter() is called on the new training
        subset and each estimator is passed to run_for_estimator() together
        with the training and test sets; one tab-separated row of errors is
        printed per size.
        '''
        X_train_full, y_train_full, X_test, y_test = snippet_reader.toNumpy()
        pos_ratio = 0.7     # arbitrary ratio
        X_eval, y_eval = SetGen.with_pos_ratio([X_test, y_test], pos_ratio, pos_label=1)
        test_set = [X_eval, y_eval]

        cc = CC(LogisticRegression)
        ac = AC(LogisticRegression)
        ms = MS(LogisticRegression)
        it = Itr(LogisticRegression)
        en = EN(LogisticRegression)

        estimators = [cc, ac, ms, it, en]

        sizes = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000]

        print("We compare performance as chaning the training set size.")
        print("Fixed positive ratio is %f" % pos_ratio)
        print("size\tcc\tac\tms\tit\ten")
        for set_size in sizes:
            X_train_sub, y_train_sub = self.get_sub_set_with_size(
                    [X_train_full, y_train_full], set_size)
            train_set = [X_train_sub, y_train_sub]

            # Hyperparameter search runs on each new subset before the
            # estimators are evaluated.
            en.find_hyperparameter(train_set)
            errs = [self.run_for_estimator(estimator, train_set, test_set)
                    for estimator in estimators]
            print(("%d" + "\t%.4f" * 5) % tuple([set_size] + errs))
Пример #17
0
    def run_training_size(self, pos_ratio):
        """
        Compare seven estimators on the ncRNA dataset while the training-set
        size grows.

        The test set is generated once with the requested positive ratio.
        For every training size a fresh set of estimators (CC2, AC2, MS2 and
        four RA variants, all wrapping LogisticRegression) is fitted on a
        subset of that size, and one tab-separated row of errors is printed.

        The dataset is from
            Andrew V Uzilov, Joshua M Keegan, and David H Mathews.
            Detection of non-coding RNAs on the basis of predicted secondary
            structure formation free energy change.
            BMC Bioinformatics, 7(173), 2006.

        @param pos_ratio positive-class ratio used to generate the test set.
        """
        X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
        X_eval, y_eval = SetGen.with_pos_ratio([X_test, y_test], pos_ratio, pos_label=1)
        test_set = [X_eval, y_eval]

        # 100..1000 in steps of 100, then a handful of larger sizes.
        sizes = numpy.arange(100, 1100, 100).tolist()
        sizes += [2000, 3000, 4000, 5000, 10000, 20000]

        print("RNA dataset")
        print("We compare performance as chaning the training set size.")
        print("Positive class ratio is %f" % pos_ratio)
        print("size\tcc\tac\tms\tra\trc\trb\trd")
        for set_size in sizes:
            # Estimators are rebuilt for every size; order matches the header.
            estimators = [
                CC2(LogisticRegression),
                AC2(LogisticRegression),
                MS2(LogisticRegression),
                RA(LogisticRegression, ac_method="ac"),
                RA(LogisticRegression, ac_method="cac"),
                RA(LogisticRegression, ac_method="bac"),
                RA(LogisticRegression, ac_method="dac"),
            ]

            X_train_sub, y_train_sub = self.get_sub_set_with_size(
                    [X_train_full, y_train_full], set_size)
            train_set = [X_train_sub, y_train_sub]
            for estimator in estimators:
                estimator.fit(train_set)

            errs = [self.run_for_estimator(estimator, test_set)
                    for estimator in estimators]
            print(("%d" + "\t%.4f" * 7) % tuple([set_size] + errs))
Пример #18
0
    def compare_based2(self, clf_class, data_set, ratio):
        """
        Fit five distribution estimators and return their errors at one ratio.

        @param clf_class classifier class (or factory) handed to every
        estimator constructor.
        @param data_set sequence unpacked as
        (X_train, y_train, X_test, y_test).
        @param ratio positive-class ratio used to generate the test set.
        @return list of errors, one per estimator, in the order
        CC2, AC2, MS2, Itr2, BAC.
        """
        X_train, y_train, X_test, y_test = data_set
        train_set = [X_train, y_train]
        full_test_set = [X_test, y_test]

        estimators = [
            CC2(clf_class),
            AC2(clf_class),
            MS2(clf_class),
            Itr2(clf_class, itr_count = 2),
            BAC(clf_class, subsample_count = 400),
        ]

        for estimator in estimators:
            estimator.fit(train_set)

        # Generate a new test set with the desired positive proportion.
        test_set = SetGen.with_pos_ratio(full_test_set, ratio, pos_label=1)

        return [self.run_for_estimator(estimator, test_set)
                for estimator in estimators]