def _test_size(self):
    '''
    Check the MS2 estimator while changing the training set size.
    The positive-class ratio of the test set is fixed at 0.05.
    We use a binary-class dataset for ease of interpretation.
    '''
    for set_size in numpy.arange(100, 1000, 100):
        X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
        X_train, y_train = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        assert len(y_train) == set_size
        train_set = (X_train, y_train)
        test_set_original = (X_test, y_test)

        ms = MS2(LogisticRegression)
        ms.fit(train_set)

        r = 0.05
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        test_set = [X_test_new, y_test_new]

        dist_true = DE.arrayToDist(y_test_new)
        dist_est = ms.predict(X_test_new)
        err = rms(dist_est, dist_true)

        print dist_est
        print "size: %d, err: %f" % (set_size, err)
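# DE.arrayToDist and rms are defined elsewhere in the project; the tests here
# only call them. The sketches below show the assumed contract (illustrative
# only, not the project's actual implementation): a "distribution" maps each
# class label to its proportion, and rms is the root-mean-square difference
# between two such mappings.
def _array_to_dist_sketch(y):
    # Proportion of each label in a label array, e.g. {0: 0.95, 1: 0.05}.
    labels, counts = numpy.unique(y, return_counts=True)
    return dict(zip(labels, counts.astype(float) / len(y)))

def _rms_sketch(dist_est, dist_true):
    # Root-mean-square error between estimated and true class proportions,
    # taken over the labels that appear in the true distribution.
    diffs = [dist_est.get(label, 0.0) - p for label, p in dist_true.items()]
    return numpy.sqrt(numpy.mean(numpy.square(diffs)))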
def test_class_ratio(self):
    '''
    Compare several competing methods while changing the ratio of the
    positive class in the test set. We use a binary-class dataset for
    ease of interpretation.
    '''
    X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
    set_size = 400  # an arbitrary number
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    train_set = (X_train, y_train)
    test_set_original = (X_test, y_test)

    cc = CC2(LogisticRegression)
    ac = AC2(LogisticRegression)
    ms = MS2(LogisticRegression)
    en = EN2(LogisticRegression)
    ests = [cc, ac, ms, en]

    print "We compare the performance while changing the positive class ratio."
    print "The training set size is %d" % set_size
    print "Training classifiers"
    map(lambda e: e.fit(train_set), ests)

    print "ratio\tcc\tac\tms\ten"
    for r in numpy.arange(0.05, 1.0, 0.05):
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        test_set = [X_test_new, y_test_new]
        errs = map(lambda e: self.run_for_estimator(e, test_set), ests)
        print ("%.2f" + "\t%.4f" * 4) % (r, errs[0], errs[1], errs[2], errs[3])
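# run_for_estimator is a helper shared by these tests but not shown here. Based
# on the inline pattern in _test_size above, it presumably evaluates one fitted
# estimator on one test set and returns the RMS error of the estimated class
# distribution. A minimal sketch under that assumption (some test classes use a
# variant that also takes the training set and fits the estimator first):
def _run_for_estimator_sketch(estimator, test_set):
    X_test, y_test = test_set
    dist_true = DE.arrayToDist(y_test)    # true class distribution of the test set
    dist_est = estimator.predict(X_test)  # estimated class distribution
    return rms(dist_est, dist_true)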
def compare_based(self, clf_class, data_set):
    X_train, y_train, X_test, y_test = data_set
    train_set = [X_train, y_train]
    full_test_set = [X_test, y_test]

    cc = CC2(clf_class)
    ac = AC2(clf_class)
    ms = MS2(clf_class)
    it = Itr2(clf_class, itr_count=2)
    bac = BAC(clf_class, subsample_count=200)
    ests = [cc, ac, ms, it, bac]

    #print "Training Distribution Estimators"
    map(lambda e: e.fit(train_set), ests)

    error_matrix = []
    #print "Ratio\tCC\tAC\tMS\tEM\tBAC"
    for r in np.arange(0.2, 1.0, 0.2):
        # Generate a new test set with the desired positive proportion.
        test_set = SetGen.with_pos_ratio(full_test_set, r, pos_label=1)
        errs = map(lambda e: self.run_for_estimator(e, test_set), ests)
        error_matrix.append(errs)
        #print ("%.2f" + "\t%.4f" * len(ests)) % tuple([r] + errs)

    error_matrix = np.array(error_matrix)
    avg_errs = error_matrix.mean(axis=0).tolist()
    #print ("Avg" + "\t%.4f" * len(avg_errs)) % tuple(avg_errs)
    return error_matrix
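# SetGen.with_pos_ratio is the project's helper for rebuilding a test set at a
# target positive-class proportion; it is not defined in this file. A minimal
# sketch of the assumed behaviour (subsample so the requested ratio holds and
# leave everything else unchanged); the real implementation may shuffle or
# stratify differently:
def _with_pos_ratio_sketch(test_set, pos_ratio, pos_label=1):
    X, y = test_set
    pos_idx = np.where(y == pos_label)[0]
    neg_idx = np.where(y != pos_label)[0]
    # Largest total size the requested ratio allows with the samples available.
    n_total = int(min(len(pos_idx) / pos_ratio, len(neg_idx) / (1.0 - pos_ratio)))
    n_pos = int(round(n_total * pos_ratio))
    keep = np.concatenate([pos_idx[:n_pos], neg_idx[:n_total - n_pos]])
    np.random.shuffle(keep)
    return X[keep], y[keep]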
def _test_ac_forest(self):
    X_train_full, y_train_full, X_test, y_test = rcv1_binary_reader.toNumpy()
    set_size = 200  # an arbitrary number
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    train_set = (X_train, y_train)
    test_set_original = (X_test, y_test)

    r = 0.8
    X_test_new, y_test_new = SetGen.with_pos_ratio(
        test_set_original, r, pos_label=1)
    test_set = [X_test_new, y_test_new]
    dist_true = DE.arrayToDist(y_test_new)

    pos = []
    for i in range(500):
        train_sub = self.get_sub_set_with_size(train_set, 0.5)
        ac = AC2(LogisticRegression)
        ac.fit(train_sub)
        dist_est = ac.predict(X_test_new)
        pos.append(dist_est[1])

    print pos
    print numpy.mean(pos)
    print numpy.median(pos)
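# get_sub_set_with_size is another shared helper that is not defined in this
# file. The tests pass it either an absolute sample count (e.g. 200 above) or,
# as in the loop above, a fraction (0.5). A minimal sketch of the assumed
# behaviour, drawing a random subsample of the full training data:
def _get_sub_set_with_size_sketch(data_set, set_size):
    X, y = data_set
    n = len(y)
    if set_size < 1:  # interpret a fractional argument as a share of the data
        set_size = int(n * set_size)
    idx = numpy.random.permutation(n)[:set_size]
    return X[idx], y[idx]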
def run_ratio(self, dataset, set_size):
    '''
    Compare several competing methods while changing the ratio of the
    positive class in the test set. We use a binary-class dataset for
    ease of interpretation.
    '''
    X_train_full, y_train_full, X_test, y_test = dataset
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    train_set = (X_train, y_train)
    test_set_original = (X_test, y_test)

    clf_class = SyntheticClassifier
    cc = CC2(clf_class)
    ac = AC2(clf_class)
    #ms = MS2(clf_class)
    #it = Itr2(clf_class, itr_count=10)
    bac = BAC(clf_class, subsample_count=200)
    ests = [cc, ac, bac]

    print "We compare the performance while changing the positive class ratio."
    print "The training set size is %d" % set_size
    print "Training classifiers"
    map(lambda e: e.fit(train_set), ests)

    print "ratio\tcc\tac\tbac"
    for r in numpy.arange(0.05, 1.0, 0.05):
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        test_set = [X_test_new, y_test_new]
        errs = map(lambda e: self.run_for_estimator(e, test_set), ests)
        print ("%.2f" + "\t%.4f" * len(ests)) % tuple([r] + errs)
def test_ratio(self):
    #X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
    #X_train_full, y_train_full, X_test, y_test = rcv1_binary_reader.toNumpy()
    X_train_full, y_train_full, X_test, y_test = synthetic_reader.toNumpy(
        0.3, n_class=2)
    set_size = 500  # an arbitrary number
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    train_set = (X_train, y_train)
    test_set_original = (X_test, y_test)

    cc = CC2(KNeighborsClassifier)
    cc.fit(train_set)

    for r in numpy.arange(0.05, 1.0, 0.05):
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        test_set = [X_test_new, y_test_new]
        dist_true = DE.arrayToDist(y_test_new)
        dist_est = cc.predict(X_test_new)
        err = rms(dist_est, dist_true)
        #print dist_est
        print "%f\t%f" % (dist_true[1], dist_est[1])
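# CC2 is exercised above on its own. From its usage it appears to be a plain
# classify-and-count quantifier: train the given classifier, classify the test
# set, and report the predicted label proportions as the class distribution.
# A minimal sketch under that assumption (the project's CC2 may differ):
class _ClassifyAndCountSketch(object):
    def __init__(self, clf_class):
        self.clf = clf_class()

    def fit(self, train_set):
        X_train, y_train = train_set
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        y_pred = self.clf.predict(X_test)
        labels, counts = numpy.unique(y_pred, return_counts=True)
        return dict(zip(labels, counts.astype(float) / len(y_pred)))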
def test_class_ratio(self):
    '''
    Check the MSHI estimator while changing the ratio of the positive class
    in the test set. We use a binary-class dataset for ease of interpretation.
    '''
    #X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
    X_train_full, y_train_full, X_test, y_test = news_20_reader.toNumpy()
    set_size = 1000
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    train_set = (X_train, y_train)
    test_set_original = (X_test, y_test)

    ms = MSHI(LinearSVC)
    ms.fit(train_set)
    print 'Done training'

    #for r in [0.05]:
    for r in numpy.arange(0.05, 1.0, 0.05):
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        test_set = [X_test_new, y_test_new]
        dist_true = DE.arrayToDist(y_test_new)
        dist_est = ms.predict(X_test_new)
        err = rms(dist_est, dist_true)
        print "r: %f, pos: %f" % (r, dist_est[1])
def run_training_size(self, pos_ratio):
    X_train_full, y_train_full, X_test, y_test = rcv1_binary_reader.toNumpy()
    test_set_original = [X_test, y_test]
    X_test_new, y_test_new = SetGen.with_pos_ratio(
        test_set_original, pos_ratio, pos_label=1)
    test_set = [X_test_new, y_test_new]

    print "We compare performance while changing the training set size."
    print "Positive class ratio is %f" % pos_ratio
    print "size\tcc\tac\tms\tra\trc\trb\trd"

    for set_size in (numpy.arange(60, 100, 10).tolist()
                     + numpy.arange(100, 1100, 100).tolist()
                     + [2000, 3000, 4000, 5000, 10000, 20000]):
        cc = CC2(LogisticRegression)
        ac = AC2(LogisticRegression)
        ms = MS2(LogisticRegression)
        ra = RA(LogisticRegression, ac_method='ac')
        rc = RA(LogisticRegression, ac_method='cac')
        rb = RA(LogisticRegression, ac_method='bac')
        rd = RA(LogisticRegression, ac_method='dac')
        ests = [cc, ac, ms, ra, rc, rb, rd]

        X_train_sub, y_train_sub = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        train_set = [X_train_sub, y_train_sub]
        map(lambda e: e.fit(train_set), ests)

        errs = map(lambda e: self.run_for_estimator(e, test_set), ests)
        print ("%d" + "\t%.4f" * 7) % (set_size, errs[0], errs[1], errs[2],
                                       errs[3], errs[4], errs[5], errs[6])
def test(self):
    X_train, y_train, X_test, y_test = news_20_reader.toNumpy()
    X_test, y_test = SetGen.with_pos_ratio([X_test, y_test], 0.50, pos_label=1)

    clf = LogisticRegression  # the classifier class (not an instance), as with the other estimators
    it = Itr2(clf, 2)
    it.fit([X_train, y_train])
    dist = it.predict(X_test)
    print dist
def _test_debug(self):
    X_train_full, y_train_full, X_test, y_test = snippet_reader.toNumpy()
    set_size = 500  # an arbitrary number
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    train_set = (X_train, y_train)
    test_set_original = (X_test, y_test)

    X_test_new, y_test_new = SetGen.with_pos_ratio(
        test_set_original, 0.5, pos_label=1)
    test_set = [X_test_new, y_test_new]

    en = EN(LogisticRegression, debug=True)
    self.run_for_estimator(en, train_set, test_set, debug=True)
def _test_debug(self):
    X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
    set_size = 300  # an arbitrary number
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    train_set = (X_train, y_train)
    test_set_original = (X_test, y_test)

    X_test_new, y_test_new = SetGen.with_pos_ratio(
        test_set_original, 0.05, pos_label=1)
    test_set = [X_test_new, y_test_new]

    cc = CC(LogisticRegression)
    en = EN(LogisticRegression, debug=True)
    err = self.run_for_estimator(en, train_set, test_set, debug=True)
    print "err", err
def compute_avg_error(self, estimator, test_set, pos_label=1):
    """
    @param estimator a fitted class distribution estimator; evaluated through
                     run_for_estimator().
    @param test_set  (X_test, y_test) pair used to test the estimation method.
    @param pos_label the label treated as the positive class when resampling.
    Returns the error averaged over a sweep of positive-class ratios.
    """
    errors = []
    # Run over a changing ratio of the positive class.
    for r in numpy.arange(0.05, 1.0, 0.05):
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set, r, pos_label=pos_label)
        test_set_new = [X_test_new, y_test_new]
        err = self.run_for_estimator(estimator, test_set_new)
        errors.append(err)
    return numpy.mean(errors)
def run_ratio(self, dataset, set_size):
    '''
    Compare several competing methods while changing the ratio of the
    positive class in the test set. We use a binary-class dataset for
    ease of interpretation.
    '''
    X_train_full, y_train_full, X_test_full, y_test_full = dataset
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    train_set = (X_train, y_train)
    test_set_original = (X_test_full, y_test_full)

    #clf_class = LogisticRegression
    def rf_factory():
        clf = RandomForestClassifier(n_estimators=500)
        return clf
    clf_class = rf_factory

    cc = CC2(clf_class)
    ac = AC2(clf_class)
    ms = MS2(clf_class)
    it = Itr2(clf_class, itr_count=2)
    ra = RA(clf_class, ac_method='ac', subsample_count=500)
    ra3 = RA3(clf_class, subsample_count=500)
    #en = EN7(clf_class)
    ests = [cc, ac, ms, ra3]

    print "We compare the performance while changing the positive class ratio."
    print "The training set size is %d" % set_size
    print "Training classifiers"
    map(lambda e: e.fit(train_set), ests)

    print "ratio\tcc\tac\tms\tra3"
    #for r in [0.5]:
    for r in numpy.arange(0.05, 1.0, 0.05):
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=0)  # note: label 0 is treated as positive here
        test_set = [X_test_new, y_test_new]
        errs = map(lambda e: self.run_for_estimator(e, test_set), ests)
        print ("%.2f" + "\t%.4f" * len(ests)) % tuple([r] + errs)
def run_ratio(self, dataset, set_size):
    '''
    Compare several competing methods while changing the ratio of the
    positive class in the test set. We use a binary-class dataset for
    ease of interpretation.
    '''
    X_train_full, y_train_full, X_test, y_test = dataset
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    train_set = (X_train, y_train)
    test_set_original = (X_test, y_test)

    clf_class = LinearSVC
    cc = CC2(clf_class)
    ac = AC2(clf_class)
    ms = MS2(clf_class)
    it = Itr2(clf_class, itr_count=2)
    ra = RA(clf_class, ac_method='ac', subsample_count=400)
    ra3 = RA3(clf_class, split_r=0.4, subsample_count=1000)
    #en = EN7(clf_class)
    #ests = [cc, ac, ms, it, ra, ra3]
    ests = [cc, ac, it, ra3]

    print "We compare the performance while changing the positive class ratio."
    print "The training set size is %d" % set_size
    print "Training classifiers"
    map(lambda e: e.fit(train_set), ests)

    print "ratio\tcc\tac\tit\tra3"
    for r in numpy.arange(0.05, 1.0, 0.05):
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        test_set = [X_test_new, y_test_new]
        errs = map(lambda e: self.run_for_estimator(e, test_set), ests)
        print ("%.2f" + "\t%.4f" * len(ests)) % tuple([r] + errs)
def _test_rna_change_training_size(self):
    '''
    Compare methods on the ncRNA dataset while changing the training set size.
    The dataset is from:
    Andrew V Uzilov, Joshua M Keegan, and David H Mathews.
    Detection of non-coding RNAs on the basis of predicted secondary structure
    formation free energy change. BMC Bioinformatics, 7(173), 2006.
    '''
    X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
    test_set_original = [X_test, y_test]
    pos_ratio = 0.8  # an arbitrary ratio
    X_test_new, y_test_new = SetGen.with_pos_ratio(
        test_set_original, pos_ratio, pos_label=1)
    test_set = [X_test_new, y_test_new]

    print "We compare performance while changing the training set size."
    print "Positive class ratio is %f" % pos_ratio
    print "size\tcc\tac\tms\ten"

    #for set_size in [600, 300, 400, 500, 700, 800, 900, 1000]:
    #for set_size in numpy.arange(1500, 5000, 500):
    #for set_size in [500, 600, 700, 800, 900, 1000]:
    for set_size in [800, 900, 1000, 1500, 2000, 2500, 3000]:
        cc = CC2(LogisticRegression)
        ac = AC2(LogisticRegression)
        ms = MS2(LogisticRegression)
        en = EN2(LogisticRegression)
        ests = [cc, ac, ms, en]

        X_train_sub, y_train_sub = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        train_set = [X_train_sub, y_train_sub]
        map(lambda e: e.fit(train_set), ests)

        errs = map(lambda e: self.run_for_estimator(e, test_set), ests)
        print ("%d" + "\t%.4f" * 4) % (set_size, errs[0], errs[1], errs[2], errs[3])
def _test_change_training_size(self):
    '''
    Compare several competing methods while changing the training set size,
    using the snippet dataset with a fixed positive-class ratio in the test set.
    '''
    X_train_full, y_train_full, X_test, y_test = snippet_reader.toNumpy()
    test_set_original = [X_test, y_test]
    pos_ratio = 0.7  # an arbitrary ratio
    X_test_new, y_test_new = SetGen.with_pos_ratio(
        test_set_original, pos_ratio, pos_label=1)
    test_set = [X_test_new, y_test_new]

    cc = CC(LogisticRegression)
    ac = AC(LogisticRegression)
    ms = MS(LogisticRegression)
    it = Itr(LogisticRegression)
    en = EN(LogisticRegression)
    ests = [cc, ac, ms, it, en]

    print "We compare performance while changing the training set size."
    print "Fixed positive ratio is %f" % pos_ratio
    print "size\tcc\tac\tms\tit\ten"

    for set_size in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000,
                     2000, 3000]:
        X_train_sub, y_train_sub = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        train_set = [X_train_sub, y_train_sub]
        en.find_hyperparameter(train_set)
        errs = map(lambda e: self.run_for_estimator(e, train_set, test_set), ests)
        print ("%d" + "\t%.4f" * 5) % (set_size, errs[0], errs[1], errs[2],
                                       errs[3], errs[4])
def run_training_size(self, pos_ratio):
    """
    Compare methods on the ncRNA dataset while changing the training set size.
    The dataset is from:
    Andrew V Uzilov, Joshua M Keegan, and David H Mathews.
    Detection of non-coding RNAs on the basis of predicted secondary structure
    formation free energy change. BMC Bioinformatics, 7(173), 2006.
    """
    X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
    test_set_original = [X_test, y_test]
    X_test_new, y_test_new = SetGen.with_pos_ratio(
        test_set_original, pos_ratio, pos_label=1)
    test_set = [X_test_new, y_test_new]

    print "RNA dataset"
    print "We compare performance while changing the training set size."
    print "Positive class ratio is %f" % pos_ratio
    print "size\tcc\tac\tms\tra\trc\trb\trd"

    for set_size in (numpy.arange(100, 1100, 100).tolist()
                     + [2000, 3000, 4000, 5000, 10000, 20000]):
        cc = CC2(LogisticRegression)
        ac = AC2(LogisticRegression)
        ms = MS2(LogisticRegression)
        ra = RA(LogisticRegression, ac_method="ac")
        rc = RA(LogisticRegression, ac_method="cac")
        rb = RA(LogisticRegression, ac_method="bac")
        rd = RA(LogisticRegression, ac_method="dac")
        ests = [cc, ac, ms, ra, rc, rb, rd]

        X_train_sub, y_train_sub = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        train_set = [X_train_sub, y_train_sub]
        map(lambda e: e.fit(train_set), ests)

        errs = map(lambda e: self.run_for_estimator(e, test_set), ests)
        print ("%d" + "\t%.4f" * 7) % (set_size, errs[0], errs[1], errs[2],
                                       errs[3], errs[4], errs[5], errs[6])
def compare_based2(self, clf_class, data_set, ratio):
    r = ratio
    X_train, y_train, X_test, y_test = data_set
    train_set = [X_train, y_train]
    full_test_set = [X_test, y_test]

    cc = CC2(clf_class)
    ac = AC2(clf_class)
    ms = MS2(clf_class)
    it = Itr2(clf_class, itr_count=2)
    bac = BAC(clf_class, subsample_count=400)
    ests = [cc, ac, ms, it, bac]

    #print "Training Distribution Estimators"
    map(lambda e: e.fit(train_set), ests)

    # Generate a new test set with the desired positive proportion.
    test_set = SetGen.with_pos_ratio(full_test_set, r, pos_label=1)
    errs = map(lambda e: self.run_for_estimator(e, test_set), ests)
    #print ("%.2f" + "\t%.4f" * len(ests)) % tuple([X_train.shape[0]] + errs)
    return errs
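# AC2 (and the RA/BAC variants used elsewhere in these tests) appear to be
# adjusted-count style estimators in the sense of Forman: correct the
# classify-and-count proportion using the classifier's estimated true- and
# false-positive rates, p = (p_cc - fpr) / (tpr - fpr). The sketch below
# estimates the rates on a simple held-out split; the project's estimators may
# instead use cross-validation or subsampling, so treat it as illustrative only.
def _adjusted_count_sketch(clf_class, train_set, X_test, pos_label=1):
    import numpy
    X_train, y_train = train_set
    idx = numpy.random.permutation(len(y_train))
    split = int(len(y_train) * 0.7)
    fit_idx, val_idx = idx[:split], idx[split:]

    clf = clf_class()
    clf.fit(X_train[fit_idx], y_train[fit_idx])

    # Estimate tpr/fpr on the held-out part of the training data.
    y_val, y_val_pred = y_train[val_idx], clf.predict(X_train[val_idx])
    tpr = numpy.mean(y_val_pred[y_val == pos_label] == pos_label)
    fpr = numpy.mean(y_val_pred[y_val != pos_label] == pos_label)

    # Classify-and-count on the test set, then apply the correction and clip.
    p_cc = numpy.mean(clf.predict(X_test) == pos_label)
    p_pos = (p_cc - fpr) / max(tpr - fpr, 1e-12)
    return min(max(p_pos, 0.0), 1.0)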