def test_ratio(self):
    '''
    Run self.run_ratio on the data returned by rcv1_binary_reader.toNumpy()
    for every training set size in numpy.arange(100, 200, 100).

    The size is announced on stdout before each run and a blank line is
    printed after it.
    '''
    dataset = rcv1_binary_reader.toNumpy()
    # The loop variable is the only source of set_size; a separate initial
    # assignment would be overwritten immediately.
    for set_size in numpy.arange(100, 200, 100):
        print("%s %s" % ("Training set size is", set_size))
        self.run_ratio(dataset, set_size)
        print("")
def test_ratio():
    '''
    Export one training subset and one re-balanced test set in libsvm format.

    A training subset of 100 samples and a test subset of 10000 samples are
    drawn with get_sub_set_with_size from the data returned by
    rcv1_binary_reader.toNumpy(). A new test set with a positive-class
    (label 1) proportion of 0.05 is then generated from the test subset via
    SetGen.with_pos_ratio.

    Files written: 'rcv_train_100.libsvm' and 'rcv_test_0.05.libsvm'.
    '''
    set_size = 100
    X_train_full, y_train_full, X_test_full, y_test_full = \
        rcv1_binary_reader.toNumpy()

    X_train, y_train = get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    X_test, y_test = get_sub_set_with_size(
        [X_test_full, y_test_full], 10000)
    save_libsvm(X_train, y_train, 'rcv_train_%d.libsvm' % set_size)

    # Only a single positive ratio is exported here.
    r = 0.05
    # Generate a new test set with the desired positive proportion.
    X_test_new, y_test_new = SetGen.with_pos_ratio(
        (X_test, y_test), r, pos_label=1)
    save_libsvm(X_test_new, y_test_new, 'rcv_test_%.2f.libsvm' % r)
def test_ratio(self):
    '''
    Report the accuracy of a Prior model, built on a fitted
    LogisticRegression, while changing the ratio of the positive class in
    the test set. We use a binary class dataset for ease of interpretation.

    For every ratio r in np.arange(0.05, 1.0, 0.05) one line "r accuracy"
    is printed.
    '''
    X_train_full, y_train_full, X_test, y_test = rcv1_binary_reader.toNumpy()
    set_size = 100
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    test_set_original = (X_test, y_test)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    p = Prior(clf)

    for r in np.arange(0.05, 1.0, 0.05):
        # Generate a new test set with the desired positive proportion.
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        # Entry 1 of the label distribution is used as the positive-class
        # share handed to Prior.fit (labels are keyed as -1 / 1 here).
        true_pos = DE.arrayToDist(y_test_new)[1]
        p.fit(X_train, y_train, {-1: 1 - true_pos, 1: true_pos})
        y_pred = p.predict(X_test_new)
        cm = confusion_matrix(y_test_new, y_pred)
        acc = self.accuracy(cm)
        print("%s %s" % (r, acc))
def run_training_size(self, pos_ratio):
    '''
    Compare several estimators while changing the training set size.

    pos_ratio is forwarded to SetGen.with_pos_ratio (pos_label=1) to build
    the test set that is shared by all runs. For every training set size a
    fresh set of estimators (cc, ac, ms, ra, rc, rb, rd) is fitted on a
    subset of that size, and one tab-separated row is printed: the size
    followed by the value self.run_for_estimator returns for each estimator.
    '''
    X_train_full, y_train_full, X_test, y_test = rcv1_binary_reader.toNumpy()
    X_test_new, y_test_new = SetGen.with_pos_ratio(
        [X_test, y_test], pos_ratio, pos_label=1)
    test_set = [X_test_new, y_test_new]

    print("We compare performance as chaning the training set size.")
    print("Positive class ratio is %f" % pos_ratio)
    print("size\tcc\tac\tms\tra\trc\trb\trd")

    set_sizes = (numpy.arange(60, 100, 10).tolist() +
                 numpy.arange(100, 1100, 100).tolist() +
                 [2000, 3000, 4000, 5000, 10000, 20000])
    for set_size in set_sizes:
        # Column order must match the header printed above.
        ests = [
            CC2(LogisticRegression),
            AC2(LogisticRegression),
            MS2(LogisticRegression),
            RA(LogisticRegression, ac_method='ac'),
            RA(LogisticRegression, ac_method='cac'),
            RA(LogisticRegression, ac_method='bac'),
            RA(LogisticRegression, ac_method='dac'),
        ]
        X_train_sub, y_train_sub = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        train_set = [X_train_sub, y_train_sub]

        # Explicit loop: fitting is a side effect, so it must not depend on
        # map() being evaluated eagerly.
        for est in ests:
            est.fit(train_set)
        errs = [self.run_for_estimator(est, test_set) for est in ests]

        row_format = "%d" + "\t%.4f" * len(errs)
        print(row_format % tuple([set_size] + errs))
def _test_ac_forest(self):
    '''
    Fit AC2(LogisticRegression) 500 times on sub-samples of a 200-sample
    training subset and collect entry 1 of each prediction on a test set
    generated with SetGen.with_pos_ratio(..., 0.8, pos_label=1).

    Prints the list of collected values, then their mean and median.
    '''
    X_train_full, y_train_full, X_test, y_test = rcv1_binary_reader.toNumpy()
    set_size = 200  # an arbitrary number
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    train_set = (X_train, y_train)

    r = 0.8
    # Only the features of the re-balanced test set are needed below.
    X_test_new, _y_test_new = SetGen.with_pos_ratio(
        (X_test, y_test), r, pos_label=1)

    pos = []
    for _ in range(500):
        # 0.5 is passed as the size argument -- presumably a fraction of
        # train_set; confirm against get_sub_set_with_size.
        train_sub = self.get_sub_set_with_size(train_set, 0.5)
        ac = AC2(LogisticRegression)
        ac.fit(train_sub)
        dist_est = ac.predict(X_test_new)
        pos.append(dist_est[1])

    print(pos)
    print(numpy.mean(pos))
    print(numpy.median(pos))
def _test_random_forest_based(self):
    '''
    Print a header and run self.run_size with RFWrapper on the data
    returned by rcv1_binary_reader.toNumpy(), passing 5000 as n_feature.
    '''
    rcv1_data = rcv1_binary_reader.toNumpy()
    feature_count = 5000
    for header_line in ("Dataset: RCV1, Classifier: Random Forest", ""):
        print(header_line)
    # The self.run_ratio variant of this experiment is currently disabled.
    self.run_size(RFWrapper, rcv1_data, feature_count)
def _test_rcv1_binary_chainging_size(self):
    '''
    Train a LinearSVC on training subsets of growing size
    (numpy.arange(100, 1000, 100)) and print its score on the full test set
    for each size.
    '''
    X_train, y_train, X_test, y_test = rcv1_binary_reader.toNumpy()
    full_train = (X_train, y_train)
    for n_samples in numpy.arange(100, 1000, 100):
        X_part, y_part = self.get_sub_set_with_size(full_train, n_samples)
        svm = LinearSVC()
        svm.fit(X_part, y_part)
        accuracy = svm.score(X_test, y_test)
        print("size: %d, accuracy: %f" % (n_samples, accuracy))
def _test_rcv1_binary_dataset(self):
    '''
    Fit LogisticRegression and then LinearSVC on the full training data
    returned by rcv1_binary_reader.toNumpy() and print each model's score
    on the test data.
    '''
    X_train, y_train, X_test, y_test = rcv1_binary_reader.toNumpy()
    labelled_models = (
        ("logistic score", LogisticRegression),
        ("svm score", LinearSVC),
    )
    for label, model_class in labelled_models:
        model = model_class()
        model.fit(X_train, y_train)
        print("%s %s" % (label, model.score(X_test, y_test)))
def test_ratio(self):
    '''
    Compare several competing methods while changing the ratio of the
    positive class in the test set. We use a binary class dataset for ease
    of interpretation.

    LinearSVC and RandomForestClassifier are fitted once on a 100-sample
    training subset; SVMWeights and RFWeights are re-fitted for every ratio
    with the class distribution of the generated test set. For each ratio
    r in np.arange(0.05, 1.0, 0.05) one tab-separated row is printed: r
    followed by entry 1 of DE.to_bin_dist for the predictions of SVM, SVMW,
    RF and RFW.
    '''
    X_train_full, y_train_full, X_test_full, y_test_full = \
        rcv1_binary_reader.toNumpy()
    set_size = 100
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    X_test, y_test = self.get_sub_set_with_size(
        [X_test_full, y_test_full], 10000)
    test_set_original = (X_test, y_test)

    rfw = RFWeights()
    svmw = SVMWeights()
    rf = RandomForestClassifier(n_estimators=400)
    svm = LinearSVC()
    # The random forest is fed a dense array; the SVM takes X_train as is.
    rf.fit(X_train.toarray(), y_train)
    svm.fit(X_train, y_train)

    print("Ratio\tSVM\tSVMW\tRF\tRFW")
    for r in np.arange(0.05, 1.0, 0.05):
        # Generate a new test set with the desired positive proportion.
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        true_pos = DE.to_bin_dist(y_test_new)[1]
        new_class_dist = {0: 1 - true_pos, 1: true_pos}

        rfw.fit(X_train, y_train, new_class_dist)
        svmw.fit(X_train, y_train, new_class_dist)

        # Densify once per ratio; both forest-based models use the result.
        X_test_dense = X_test_new.toarray()
        preds = [
            svm.predict(X_test_new),
            svmw.predict(X_test_new),
            rf.predict(X_test_dense),
            rfw.predict(X_test_dense),
        ]
        pos_ratios = [DE.to_bin_dist(pred)[1] for pred in preds]
        row_format = "%.2f" + "\t%.2f" * len(pos_ratios)
        print(row_format % tuple([r] + pos_ratios))
def _test_avg(self):
    '''
    Fit RA(LogisticRegression, ac_method='ac', subsample_count=200) on a
    300-sample training subset for each split_r in
    numpy.arange(0.1, 1.0, 0.1) and print split_r together with the value
    returned by self.compute_avg_error on a 10000-sample test subset.
    '''
    X_tr_all, y_tr_all, X_te_all, y_te_all = rcv1_binary_reader.toNumpy()
    train_set_size = 300
    X_tr, y_tr = self.get_sub_set_with_size(
        [X_tr_all, y_tr_all], train_set_size)
    X_te, y_te = self.get_sub_set_with_size([X_te_all, y_te_all], 10000)
    training_pair = (X_tr, y_tr)
    testing_pair = (X_te, y_te)

    base_classifier = LogisticRegression
    for ratio in numpy.arange(0.1, 1.0, 0.1):
        estimator = RA(base_classifier, ac_method='ac',
                       subsample_count=200, split_r=ratio)
        estimator.fit(training_pair)
        avg_error = self.compute_avg_error(estimator, testing_pair)
        print("%s %s" % (ratio, avg_error))
def test_ratio(self):
    '''
    Call self.run_ratio on the data returned by rcv1_binary_reader.toNumpy()
    for every training set size in numpy.arange(50, 200, 10), printing a
    blank line after each run.
    '''
    rcv1_data = rcv1_binary_reader.toNumpy()
    train_sizes = numpy.arange(50, 200, 10)
    for size in train_sizes:
        self.run_ratio(rcv1_data, size)
        print("")
def _test_random_forest_based(self):
    '''
    Print a header and call self.run_test_with with RFWrapper on the data
    returned by rcv1_binary_reader.toNumpy(), passing 5000 as n_feature and
    dense=True.
    '''
    rcv1_data = rcv1_binary_reader.toNumpy()
    feature_count = 5000
    print("RCV1 with RF")
    self.run_test_with(rcv1_data, RFWrapper, feature_count, dense=True)
def test_svm_based4(self):
    '''
    Print a header and call self.run_test_with with LinearSVC on the data
    returned by rcv1_binary_reader.toNumpy(), passing 5000 as n_feature.
    '''
    rcv1_data = rcv1_binary_reader.toNumpy()
    feature_count = 5000
    print("RCV1 with SVM")
    self.run_test_with(rcv1_data, LinearSVC, feature_count)
def _test_rf_based(self):
    '''
    Print a header, load the data via rcv1_binary_reader.toNumpy() and hand
    it to self.run_test_with together with self.compare_rf_based and 5000
    as n_feature.
    '''
    print("Compare RCV1 with RF")
    rcv1_data = rcv1_binary_reader.toNumpy()
    feature_count = 5000
    self.run_test_with(rcv1_data, self.compare_rf_based, feature_count)