示例#1
0
文件: prior.py 项目: pyongjoo/ende
    def test_ratio(self):
        '''
        Report the accuracy of the Prior method while the ratio of the
        positive class in the test set is varied. A binary class dataset is
        used for ease of interpretation.

        A LogisticRegression model is trained once on a 100-example subset of
        the training data returned by rcv1_binary_reader.toNumpy(). For every
        ratio in np.arange(0.05, 1.0, 0.05) the test set is resampled with
        SetGen.with_pos_ratio, Prior is re-fit with the class distribution of
        the resampled labels, and "<ratio> <accuracy>" is printed.
        '''
        set_size = 100

        X_train_full, y_train_full, X_test, y_test = rcv1_binary_reader.toNumpy()
        X_train, y_train = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        assert len(y_train) == set_size

        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        p = Prior(clf)

        for r in np.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                (X_test, y_test), r, pos_label=1)

            # Presumably the share of label 1 in the resampled labels
            # (depends on DE.arrayToDist) -- TODO confirm.
            true_pos = DE.arrayToDist(y_test_new)[1]

            # NOTE(review): the negative class is keyed as -1 here; confirm
            # this matches the label encoding produced by the reader.
            p.fit(X_train, y_train, {-1: 1 - true_pos, 1: true_pos})
            y_pred = p.predict(X_test_new)
            acc = self.accuracy(confusion_matrix(y_test_new, y_pred))

            # Single pre-formatted string: prints the same text on Python 2
            # and Python 3.
            print("%s %s" % (r, acc))
示例#2
0
文件: weights.py 项目: pyongjoo/ende
    def test_ratio(self):
        '''
        Compare the positive ratio predicted by several competing methods
        while the ratio of the positive class in the test set is varied. A
        binary class dataset is used for ease of interpretation.

        A RandomForestClassifier and a LinearSVC are trained once on a
        100-example training subset. For every ratio in
        np.arange(0.05, 1.0, 0.05) a 10000-example test subset is resampled
        with SetGen.with_pos_ratio, RFWeights and SVMWeights are re-fit with
        the class distribution of the resampled labels, and one tab-separated
        row is printed: the requested ratio followed by the value
        DE.to_bin_dist(...)[1] of the SVM, SVMW, RF and RFW predictions.
        '''
        set_size = 100
        test_size = 10000

        X_train_full, y_train_full, X_test_full, y_test_full = \
            rcv1_binary_reader.toNumpy()
        X_train, y_train = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        assert len(y_train) == set_size
        X_test, y_test = self.get_sub_set_with_size(
            [X_test_full, y_test_full], test_size)

        rfw = RFWeights()
        svmw = SVMWeights()

        rf = RandomForestClassifier(n_estimators=400)
        svm = LinearSVC()

        # The random forest is given a dense array; the SVM takes X_train
        # as-is.
        rf.fit(X_train.toarray(), y_train)
        svm.fit(X_train, y_train)

        print("Ratio\tSVM\tSVMW\tRF\tRFW")
        for r in np.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                (X_test, y_test), r, pos_label=1)

            true_pos = DE.to_bin_dist(y_test_new)[1]
            new_class_dist = {0: 1 - true_pos, 1: true_pos}

            # NOTE(review): rfw is fit on X_train without toarray() although
            # its predict() below receives a dense array -- confirm that
            # RFWeights.fit accepts this input.
            rfw.fit(X_train, y_train, new_class_dist)
            svmw.fit(X_train, y_train, new_class_dist)

            X_test_dense = X_test_new.toarray()
            svm_pred = svm.predict(X_test_new)
            svmw_pred = svmw.predict(X_test_new)
            rf_pred = rf.predict(X_test_dense)
            rfw_pred = rfw.predict(X_test_dense)

            # A real list (not a lazy map object) so that len() and list
            # concatenation below work on Python 3 as well.
            pos_ratios = [DE.to_bin_dist(pred)[1]
                          for pred in (svm_pred, svmw_pred, rf_pred, rfw_pred)]

            # Format first, then print: the outer parentheses keep the "%"
            # applied to the format string on both Python 2 and Python 3.
            row_format = "%.2f" + "\t%.2f" * len(pos_ratios)
            print(row_format % tuple([r] + pos_ratios))
示例#3
0
文件: mla.py 项目: pyongjoo/ende
    def test_ratio(self):
        '''
        Report the accuracy of the MLA method while the ratio of the positive
        class in the test set is varied. A binary class dataset is used for
        ease of interpretation.

        An SVMLight classifier is trained once on the full training split
        returned by sentiment_reader.toNumpy(); only the first 1000 test
        examples are kept. For every ratio in np.arange(0.05, 1.0, 0.05) the
        test set is resampled with SetGen.with_pos_ratio, MLA is re-fit with
        the class distribution of the resampled labels, and
        "<ratio> <accuracy>" is printed.
        '''
        # rcv1_binary_reader and snippet_reader were previously used here as
        # alternative data sources through the same toNumpy() call.
        X_train, y_train, X_test, y_test = sentiment_reader.toNumpy()

        # Keep only the first 1000 test examples.
        X_test = X_test[:1000]
        y_test = y_test[:1000]

        clf = SVMLight()
        clf.fit(X_train, y_train)

        mla = MLA(clf, verbose=1)

        for r in np.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                (X_test, y_test), r, pos_label=1)

            dist_dict = DE.arrayToDistDict(y_test_new)

            mla.fit(X_train, y_train, dist_dict)
            y_pred = mla.predict(X_test_new)
            acc = self.accuracy(confusion_matrix(y_test_new, y_pred))

            # Single pre-formatted string: prints the same text on Python 2
            # and Python 3.
            print("%s %s" % (r, acc))
示例#4
0
文件: comp.py 项目: pyongjoo/ende
    def compare_svm_based_repeat(self, data_set):
        '''
        Compare a plain LinearSVC with SVMWeights and MLT over resampled test
        sets, repeating each positive ratio 20 times.

        data_set is unpacked as (X_train, y_train, X_test, y_test). For every
        ratio in np.arange(0.1, 1.0, 0.1) and every repetition, the test set
        is resampled with SetGen.with_pos_ratio, SVMWeights and MLT are re-fit
        with the class distribution of the resampled labels, and the ratio,
        accuracies and F1 scores are printed.

        Returns (acc_matrix, f1_matrix, auc_matrix). Each is a list with one
        row per (ratio, repetition) pair; each row holds one value per model
        in the order [LinearSVC, SVMWeights, MLT].
        '''
        X_train, y_train, X_test, y_test = data_set

        prob_estimator = LinearSVC()
        prob_estimator.fit(X_train, y_train)

        # Estimators that are re-fit for every resampled class distribution.
        ests = [SVMWeights(), MLT(prob_estimator)]
        # Models that are evaluated; the base SVM is fit only once above.
        models = [prob_estimator] + ests

        repeat_num = 20

        acc_matrix = []
        f1_matrix = []
        auc_matrix = []

        for r in np.arange(0.1, 1.0, 0.1):
            for _ in range(repeat_num):
                # Generate a new test set with desired positive proportions.
                X_test_new, y_test_new = SetGen.with_pos_ratio(
                    [X_test, y_test], r, pos_label=1)

                class_dist = DE.arrayToDistDict(y_test_new)

                # Explicit loop: fit() is called for its side effect, which a
                # lazy map() would never trigger on Python 3.
                for est in ests:
                    est.fit(X_train, y_train, class_dist)

                y_preds = [model.predict(X_test_new) for model in models]
                cms = [confusion_matrix(y_test_new, y_pred)
                       for y_pred in y_preds]

                accs = [self.accuracy(cm) for cm in cms]
                f1s = [self.f1(cm) for cm in cms]
                auc = [self.auc(cm) for cm in cms]
                acc_matrix.append(accs)
                f1_matrix.append(f1s)
                auc_matrix.append(auc)

                # Single-argument print calls behave the same on Python 2
                # and Python 3; print("") emits the blank separator line.
                print(r)
                print(accs)
                print(f1s)
                print("")

        return acc_matrix, f1_matrix, auc_matrix
示例#5
0
文件: comp.py 项目: pyongjoo/ende
    def compare_rf_based(self, data_set):
        '''
        Compare a plain RandomForestClassifier with RFWeights, Prior and MLT
        over test sets resampled to different positive ratios.

        data_set is unpacked as (X_train, y_train, X_test, y_test). For every
        ratio in np.arange(0.2, 1.0, 0.2) the test set is resampled with
        SetGen.with_pos_ratio and RFWeights, Prior and MLT are re-fit with
        the class distribution of the resampled labels.

        Returns (acc_matrix, f1_matrix, auc_matrix). Each is a list with one
        row per ratio; each row holds one value per model in the order
        [RandomForestClassifier, RFWeights, Prior, MLT].
        '''
        X_train, y_train, X_test, y_test = data_set

        # TODO: We actually need to convert to dense array using toarray()
        # (here and for the resampled test set below). Satimage data is the
        # only exception.
        prob_estimator = RandomForestClassifier(n_estimators=200)
        prob_estimator.fit(X_train, y_train)

        # Estimators that are re-fit for every resampled class distribution.
        ests = [
            RFWeights(n_estimators=200),
            Prior(prob_estimator),
            MLT(prob_estimator),
        ]
        # Models that are evaluated; the base forest is fit only once above.
        models = [prob_estimator] + ests

        acc_matrix = []
        f1_matrix = []
        auc_matrix = []

        for r in np.arange(0.2, 1.0, 0.2):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                [X_test, y_test], r, pos_label=1)

            class_dist = DE.arrayToDistDict(y_test_new)

            # Explicit loop: fit() is called for its side effect, which a
            # lazy map() would never trigger on Python 3.
            for est in ests:
                est.fit(X_train, y_train, class_dist)

            y_preds = [model.predict(X_test_new) for model in models]
            cms = [confusion_matrix(y_test_new, y_pred) for y_pred in y_preds]

            acc_matrix.append([self.accuracy(cm) for cm in cms])
            f1_matrix.append([self.f1(cm) for cm in cms])
            auc_matrix.append([self.auc(cm) for cm in cms])

        return acc_matrix, f1_matrix, auc_matrix
示例#6
0
文件: comp.py 项目: pyongjoo/ende
    def compare_maxent_based(self, data_set):
        '''
        Compare a plain LogisticRegression with MaxentWeights, Prior and MLT
        over test sets resampled to different positive ratios.

        data_set is unpacked as (X_train, y_train, X_test, y_test). For every
        ratio in np.arange(0.2, 1.0, 0.2) the test set is resampled with
        SetGen.with_pos_ratio and MaxentWeights, Prior and MLT are re-fit
        with the class distribution of the resampled labels.

        Returns (acc_matrix, f1_matrix, auc_matrix). Each is a list with one
        row per ratio; each row holds one value per model in the order
        [LogisticRegression, MaxentWeights, Prior, MLT].
        '''
        X_train, y_train, X_test, y_test = data_set

        prob_estimator = LogisticRegression()
        prob_estimator.fit(X_train, y_train)

        # Estimators that are re-fit for every resampled class distribution.
        ests = [
            MaxentWeights(),
            Prior(prob_estimator),
            MLT(prob_estimator),
        ]
        # Models that are evaluated; the base model is fit only once above.
        models = [prob_estimator] + ests

        acc_matrix = []
        f1_matrix = []
        auc_matrix = []

        for r in np.arange(0.2, 1.0, 0.2):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                [X_test, y_test], r, pos_label=1)

            class_dist = DE.arrayToDistDict(y_test_new)

            # Explicit loop: fit() is called for its side effect, which a
            # lazy map() would never trigger on Python 3.
            for est in ests:
                est.fit(X_train, y_train, class_dist)

            y_preds = [model.predict(X_test_new) for model in models]
            cms = [confusion_matrix(y_test_new, y_pred) for y_pred in y_preds]

            acc_matrix.append([self.accuracy(cm) for cm in cms])
            f1_matrix.append([self.f1(cm) for cm in cms])
            auc_matrix.append([self.auc(cm) for cm in cms])

        return acc_matrix, f1_matrix, auc_matrix
示例#7
0
文件: en_mlt.py 项目: pyongjoo/ende
 def run_for_estimator(self, estimator, test_set):
     '''
     Return DE.rms between the test labels and the estimator's prediction.

     test_set is unpacked as an (X_test, y_test) pair; X_test is passed to
     estimator.predict and the result is compared to y_test with DE.rms.
     '''
     features, expected = test_set
     predicted = estimator.predict(features)
     return DE.rms(expected, predicted)