Exemplo n.º 1
0
    def test_binary_relevance_lr(self):
        """BinaryRelevance must match one independent LogisticRegression per label."""
        learner = BinaryRelevance(base_clf=LogisticRegression(random_state=1126))
        learner.train(Dataset(self.X_train, self.Y_train))

        pred_trn = learner.predict(self.X_train).astype(int)
        pred_tst = learner.predict(self.X_test).astype(int)
        proba_trn = learner.predict_proba(self.X_train).astype(float)
        proba_tst = learner.predict_proba(self.X_test).astype(float)

        n_labels = np.shape(self.Y_train)[1]
        for label_idx in range(n_labels):
            # Reference: a stand-alone sklearn classifier on this label column.
            ref_clf = sklearn.linear_model.LogisticRegression(random_state=1126)
            ref_clf.fit(self.X_train, self.Y_train[:, label_idx])

            assert_array_equal(ref_clf.predict(self.X_train).astype(int),
                               pred_trn[:, label_idx])
            assert_array_equal(ref_clf.predict(self.X_test).astype(int),
                               pred_tst[:, label_idx])

            assert_array_equal(ref_clf.predict_proba(self.X_train)[:, 1].astype(float),
                               proba_trn[:, label_idx])
            assert_array_equal(ref_clf.predict_proba(self.X_test)[:, 1].astype(float),
                               proba_tst[:, label_idx])

        # The 'hamming' score must equal the mean per-example label disagreement.
        expected_hamming = np.mean(np.abs(self.Y_test - pred_tst).mean(axis=1))
        self.assertEqual(
            expected_hamming,
            learner.score(Dataset(self.X_test, self.Y_test), 'hamming'))

        # Unknown criteria are rejected explicitly.
        self.assertRaises(NotImplementedError,
                          lambda: learner.score(Dataset(self.X_test, self.Y_test),
                                                criterion='not_exist'))
Exemplo n.º 2
0
    def test_binary_relevance_parallel(self):
        """Training with n_jobs=2 must yield the same predictions as n_jobs=1."""
        serial = BinaryRelevance(
            base_clf=LogisticRegression(random_state=1126), n_jobs=1)
        parallel = BinaryRelevance(
            base_clf=LogisticRegression(random_state=1126), n_jobs=2)
        for learner in (serial, parallel):
            learner.train(Dataset(self.X_train, self.Y_train))

        assert_array_equal(serial.predict(self.X_test).astype(int),
                           parallel.predict(self.X_test).astype(int))
Exemplo n.º 3
0
    def test_binary_relevance_lr(self):
        """BinaryRelevance predictions/probabilities equal per-label sklearn LR."""
        br = BinaryRelevance(base_clf=LogisticRegression(random_state=1126))
        br.train(Dataset(self.X_train, self.Y_train))

        predictions = {
            'train': br.predict(self.X_train).astype(int),
            'test': br.predict(self.X_test).astype(int),
        }
        probabilities = {
            'train': br.predict_proba(self.X_train).astype(float),
            'test': br.predict_proba(self.X_test).astype(float),
        }

        for i in range(np.shape(self.Y_train)[1]):
            single = sklearn.linear_model.LogisticRegression(random_state=1126)
            single.fit(self.X_train, self.Y_train[:, i])
            # Compare column i of BinaryRelevance against the stand-alone model
            # on both splits.
            for split, X in (('train', self.X_train), ('test', self.X_test)):
                assert_array_equal(single.predict(X).astype(int),
                                   predictions[split][:, i])
                assert_array_equal(single.predict_proba(X)[:, 1].astype(float),
                                   probabilities[split][:, i])

        # Hamming score agrees with the mean per-example disagreement rate.
        self.assertEqual(
            np.mean(np.abs(self.Y_test - predictions['test']).mean(axis=1)),
            br.score(Dataset(self.X_test, self.Y_test), 'hamming'))

        self.assertRaises(NotImplementedError,
                          lambda: br.score(Dataset(self.X_test, self.Y_test),
                                           criterion='not_exist'))
Exemplo n.º 4
0
 def test_multilabel_with_auxiliary_learner_mmr(self):
     """The 'mmr' criterion must reproduce the recorded query sequence."""
     partially_labeled = self.y[:5] + [None] * (len(self.y) - 5)
     trn_ds = Dataset(self.X, partially_labeled)
     major = BinaryRelevance(LogisticRegression(solver='liblinear',
                                                multi_class="ovr"))
     auxiliary = BinaryRelevance(SVM(gamma="auto"))
     qs = MultilabelWithAuxiliaryLearner(trn_ds,
                                         major_learner=major,
                                         auxiliary_learner=auxiliary,
                                         criterion='mmr',
                                         random_state=1126)
     observed = run_qs(trn_ds, qs, self.y, self.quota)
     expected = np.array([1258, 1461, 231, 1198, 1498, 1374, 955, 1367, 265, 144])
     assert_array_equal(observed, expected)
 def test_multilabel_with_auxiliary_learner_hlr(self):
     """The 'hlr' criterion must reproduce the recorded query sequence."""
     labels = self.y[:5] + [None] * (len(self.y) - 5)
     trn_ds = Dataset(self.X, labels)
     strategy = MultilabelWithAuxiliaryLearner(
         trn_ds,
         major_learner=BinaryRelevance(LogisticRegression()),
         auxiliary_learner=BinaryRelevance(SVM()),
         criterion='hlr',
         random_state=1126)
     observed = run_qs(trn_ds, strategy, self.y, self.quota)
     assert_array_equal(
         observed,
         np.array([701, 1403, 147, 897, 974, 1266, 870, 703, 292, 1146]))
Exemplo n.º 6
0
 def test_multilabel_with_auxiliary_learner_shlr(self):
     """The 'shlr' criterion (b=1.) must reproduce the recorded query sequence."""
     partially_labeled = self.y[:5] + [None] * (len(self.y) - 5)
     trn_ds = Dataset(self.X, partially_labeled)
     strategy = MultilabelWithAuxiliaryLearner(
         trn_ds,
         major_learner=BinaryRelevance(
             LogisticRegression(solver='liblinear', multi_class="ovr")),
         auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
         criterion='shlr',
         b=1.,
         random_state=1126)
     observed = run_qs(trn_ds, strategy, self.y, self.quota)
     assert_array_equal(
         observed,
         np.array([1258, 805, 459, 550, 783, 964, 736, 1004, 38, 750]))
Exemplo n.º 7
0
    def test_binary_relevance_parallel(self):
        """Parallel (n_jobs=2) training must match sequential (n_jobs=1)."""
        sequential = BinaryRelevance(
            base_clf=LogisticRegression(random_state=1126), n_jobs=1)
        sequential.train(Dataset(self.X_train, self.Y_train))

        threaded = BinaryRelevance(
            base_clf=LogisticRegression(random_state=1126), n_jobs=2)
        threaded.train(Dataset(self.X_train, self.Y_train))

        assert_array_equal(sequential.predict(self.X_test).astype(int),
                           threaded.predict(self.X_test).astype(int))
Exemplo n.º 8
0
    def make_query(self):
        """Pick the unlabeled example to query next and return its entry id.

        Steps (all visible below):
        1. Train a BinaryRelevance model on the labeled pool.
        2. Fit a label-count predictor (``self.logistic_regression_``, or
           ``DummyClf`` when every labeled row has the same number of positive
           labels) on each row's sorted-descending, sum-normalized per-label
           probabilities.
        3. For each pool example, set the top-``p`` most probable labels to +1
           (the rest stay -1), where ``p`` is the predicted label count.
        4. Score each example as ``sum((1 - yhat * f) / 2)`` and return a
           random entry among the maximizers.

        NOTE(review): this looks like the MMC query strategy — confirm against
        the enclosing class.
        """
        dataset = self.dataset
        labeled_pool, Y = zip(*dataset.get_labeled_entries())
        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())
        labeled_pool = np.array(labeled_pool)
        Y = np.array(Y)
        X_pool = np.array(X_pool)

        br = BinaryRelevance(self.br_base)
        br.train(Dataset(labeled_pool, Y))

        trnf = br.predict_proba(labeled_pool)
        poolf = br.predict_proba(X_pool)
        f = poolf * 2 - 1  # map probabilities [0, 1] onto scores [-1, 1]

        # Sort each row's probabilities descending and normalize rows to sum
        # to 1: these are the features for the label-count predictor.
        trnf = np.sort(trnf, axis=1)[:, ::-1]
        trnf /= np.tile(trnf.sum(axis=1).reshape(-1, 1), (1, trnf.shape[1]))
        if len(np.unique(Y.sum(axis=1))) == 1:
            # Only one distinct label count in the labeled pool: fall back to
            # a constant predictor instead of fitting a classifier.
            lr = DummyClf()
        else:
            lr = self.logistic_regression_
        lr.train(Dataset(trnf, Y.sum(axis=1)))

        # Remember which original label each sorted column came from, then
        # apply the same sort/normalize transform to the pool probabilities.
        idx_poolf = np.argsort(poolf, axis=1)[:, ::-1]
        poolf = np.sort(poolf, axis=1)[:, ::-1]
        poolf /= np.tile(poolf.sum(axis=1).reshape(-1, 1), (1, poolf.shape[1]))
        pred_num_lbl = lr.predict(poolf).astype(int)

        # Pseudo label assignment: +1 for each example's p most probable labels.
        yhat = -1 * np.ones((len(X_pool), self.n_labels), dtype=int)
        for i, p in enumerate(pred_num_lbl):
            yhat[i, idx_poolf[i, :p]] = 1

        # Disagreement between the pseudo-assignment and the real-valued
        # scores; query a random maximizer.
        score = ((1 - yhat * f) / 2).sum(axis=1)
        ask_id = self.random_state_.choice(np.where(score == np.max(score))[0])
        return unlabeled_entry_ids[ask_id]
Exemplo n.º 9
0
 def test_cost_sensitive_random_pair_encoding(self):
     """CSRPE with pairwise-F1 scoring must reproduce the recorded queries."""
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     major_model = BinaryRelevance(
         LogisticRegression(solver='liblinear', multi_class="ovr"))
     encoder_base = LogisticRegression(
         solver='liblinear', multi_class="ovr", random_state=1126)
     strategy = CostSensitiveReferencePairEncoding(
         trn_ds,
         scoring_fn=pairwise_f1_score,
         model=major_model,
         base_model=encoder_base,
         n_models=10,
         n_jobs=1,
         random_state=1126)
     observed = run_qs(trn_ds, strategy, self.y, self.quota)
     assert_array_equal(
         observed,
         np.array([149, 434, 1126, 719, 983, 564, 816, 732, 101, 1242]))
Exemplo n.º 10
0
    def make_query(self):
        """Select the unlabeled example to query next and return its entry id.

        Candidate generation: for each trade-off parameter ``b`` in
        ``self.betas``, score every pool example by
        ``uncertainty**b * label_cardinality**(1 - b)`` and collect all
        argmax indices.  Candidate ranking: retrain a fresh BinaryRelevance
        model with the candidate added under its predicted labels, measure the
        approximated error via ``_calc_approx_err``, and query the candidate
        with the smallest error (ties broken by ``self.random_state_``).
        """
        dataset = self.dataset
        X, Y = dataset.get_labeled_entries()
        unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries()
        Y = np.array(Y)
        X, X_pool = np.array(X), np.array(X_pool)

        clf = BinaryRelevance(self.base_clf, n_jobs=self.n_jobs)
        clf.train(dataset)
        real = clf.predict_real(X_pool)
        pred = clf.predict(X_pool)

        # Separation Margin: gap between the smallest positive and largest
        # negative decision value of each example; a small margin means the
        # model is uncertain, so uncertainty is its reciprocal.
        pos = np.copy(real)
        pos[real <= 0] = np.inf
        neg = np.copy(real)
        neg[real >= 0] = -np.inf
        separation_margin = pos.min(axis=1) - neg.max(axis=1)
        uncertainty = 1. / separation_margin

        # Label Cardinality Inconsistency: absolute deviation of the predicted
        # number of positive labels from the labeled-pool average.
        average_pos_lbl = Y.mean(axis=0).sum()
        label_cardinality = np.sqrt((pred.sum(axis=1) - average_pos_lbl)**2)

        # Union of argmax candidates over all trade-off parameters.
        candidate_idx_set = set()
        for b in self.betas:
            # score shape = (len(X_pool), )
            score = uncertainty**b * label_cardinality**(1. - b)
            for idx in np.where(score == np.max(score))[0]:
                candidate_idx_set.add(idx)

        candidates = list(candidate_idx_set)
        # One retraining per candidate, run in threads since the training is
        # delegated to the underlying classifiers.
        approx_err = Parallel(n_jobs=self.n_jobs, backend='threading')(
            delayed(_calc_approx_err)(BinaryRelevance(self.base_clf),
                                      Dataset(np.vstack((X, X_pool[idx])),
                                              np.vstack((Y,
                                                         pred[idx]))), X_pool)
            for idx in candidates)

        choices = np.where(np.array(approx_err) == np.min(approx_err))[0]
        ask_idx = candidates[self.random_state_.choice(choices)]

        return unlabeled_entry_ids[ask_idx]
Exemplo n.º 11
0
def main():
    """Compare six multi-label active-learning strategies on hamming loss.

    Runs 10 repetitions; in each, queries `quota` examples with every
    strategy on its own deep copy of the training set, records the test
    loss curve, then averages the curves and plots them.
    """
    test_size = 0.25  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set

    result = {'E1': [], 'E2': [], 'E3': [], 'E4': [], 'E5': [], 'E6': []}
    for i in range(10):  # repeat experiment to average out split randomness
        trn_ds, tst_ds, fully_labeled_trn_ds = split_train_test(test_size)
        # Each strategy queries its own copy of the partially labeled set.
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        trn_ds6 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = BinaryRelevance(LogisticRegression())

        quota = 150  # number of samples to query

        qs = MMC(trn_ds, br_base=LogisticRegression())
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)
        result['E1'].append(E_out_1)

        qs2 = RandomSampling(trn_ds2)
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)
        result['E2'].append(E_out_2)

        # E3-E5 use the same auxiliary-learner strategy and differ only in
        # the selection criterion.
        for key, criterion, ds in (('E3', 'hlr', trn_ds3),
                                   ('E4', 'shlr', trn_ds4),
                                   ('E5', 'mmr', trn_ds5)):
            qs_aux = MultilabelWithAuxiliaryLearner(ds,
                                                    BinaryRelevance(
                                                        LogisticRegression()),
                                                    BinaryRelevance(SVM()),
                                                    criterion=criterion)
            _, E_out_aux = run(ds, tst_ds, lbr, model, qs_aux, quota)
            result[key].append(E_out_aux)

        qs6 = BinaryMinimization(trn_ds6, LogisticRegression())
        _, E_out_6 = run(trn_ds6, tst_ds, lbr, model, qs6, quota)
        result['E6'].append(E_out_6)

    # Average the loss curves over the repetitions.
    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)
    E_out_4 = np.mean(result['E4'], axis=0)
    E_out_5 = np.mean(result['E5'], axis=0)
    E_out_6 = np.mean(result['E6'], axis=0)

    print("MMC: ", E_out_1[::5].tolist())
    print("Random: ", E_out_2[::5].tolist())
    print("MultilabelWithAuxiliaryLearner_hlr: ", E_out_3[::5].tolist())
    print("MultilabelWithAuxiliaryLearner_shlr: ", E_out_4[::5].tolist())
    print("MultilabelWithAuxiliaryLearner_mmr: ", E_out_5[::5].tolist())
    print("BinaryMinimization: ", E_out_6[::5].tolist())

    query_num = np.arange(1, quota + 1)
    fig = plt.figure(figsize=(9, 6))
    ax = plt.subplot(111)
    ax.plot(query_num, E_out_1, 'g', label='MMC')
    ax.plot(query_num, E_out_2, 'k', label='Random')
    ax.plot(query_num, E_out_3, 'r', label='AuxiliaryLearner_hlr')
    ax.plot(query_num, E_out_4, 'b', label='AuxiliaryLearner_shlr')
    ax.plot(query_num, E_out_5, 'c', label='AuxiliaryLearner_mmr')
    ax.plot(query_num, E_out_6, 'm', label='BinaryMinimization')

    # Shrink the axes so the legend fits to the right of the plot.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.75, box.height])
    plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)
    plt.xlabel('Number of Queries')
    plt.ylabel('Loss')
    plt.title('Experiment Result (Hamming Loss)')
    plt.show()
Exemplo n.º 12
0
# Indices of the labeled training examples: everything not held out for test.
# NOTE(review): assumes `labeled_entries` is the count of labeled rows and
# that test indices fall inside range(labeled_entries) — confirm upstream.
labeled_train = list(set(range(labeled_entries)).difference(test_index))

# Training set: labeled rows first, the remainder unlabeled (label None).
trn_ds = Dataset(X[train_index], Y.iloc[labeled_train].values.tolist() + [None]*(len(train_index) - len(labeled_train)))
tst_ds = Dataset(X[test_index],Y.iloc[test_index].values.tolist())


data_CV_train = data_CV_concat.iloc[train_index]

'''
MAIN FUNCTION
'''

# NOTE(review): nothing below appends to `result` — presumably filled by
# code outside this snippet.
result = {'Hamming': [],'F1': []}
    
model = BinaryRelevance(LogisticRegression())

quota = 20  # number of samples to query


#EXECUTE FROM HERE FOR ITERATIONS

# Query strategy: auxiliary-learner approach with the 'hlr'
# (hamming-loss-reduction) criterion, LR as major and SVM as auxiliary.
qs1 = MultilabelWithAuxiliaryLearner(
trn_ds,
BinaryRelevance(LogisticRegression()),
BinaryRelevance(SVM()),
criterion='hlr')

run(data_CV_train,trn_ds, qs1, quota)

# Retrain the final model on the (now partially queried) training set.
model.train(trn_ds)