Example #1
 def test_uncertainty_entropy(self):
     trn_ds = init_toyexample(self.X, self.y)
     qs = UncertaintySampling(trn_ds,
                              method='entropy',
                              model=LogisticRegression())
     model = LogisticRegression()
     qseq = run_qs(trn_ds, self.lbr, model, qs, self.quota)
     assert_array_equal(qseq, np.array([6, 7, 8, 9]))
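These snippets come from libact's test suite and omit their imports and the shared run_qs helper. The imports are presumably along the lines of from libact.base.dataset import Dataset, from libact.models import LogisticRegression, and the query strategies from libact.query_strategies; the exact paths are an assumption. Below is a minimal sketch of a run_qs helper in the four-argument form used by most of the later examples (some snippets, like this one, instead pass a labeler and a model); it is an assumed reconstruction, not the library's actual test utility.

import numpy as np

def run_qs(trn_ds, qs, truth, quota):
    # Assumed sketch: repeatedly ask the strategy for a sample,
    # reveal its true label, and record the sequence of queried indices.
    queried = []
    for _ in range(quota):
        ask_id = qs.make_query()
        trn_ds.update(ask_id, truth[ask_id])
        queried.append(ask_id)
    return np.array(queried)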
Example #2
 def test_uncertainty_sm(self):
     trn_ds = init_toyexample(self.X, self.y)
     qs = UncertaintySampling(trn_ds,
                              method='sm',
                              model=LogisticRegression(solver='liblinear',
                                                       multi_class="ovr"))
     model = LogisticRegression(solver='liblinear', multi_class="ovr")
     qseq = run_qs(trn_ds, self.lbr, model, qs, self.quota)
     assert_array_equal(qseq, np.array([6, 7, 8, 9]))
Example #3
    def test_binary_relevance_parallel(self):
        br = BinaryRelevance(base_clf=LogisticRegression(random_state=1126),
                             n_jobs=1)
        br.train(Dataset(self.X_train, self.Y_train))
        br_par = BinaryRelevance(
                base_clf=LogisticRegression(random_state=1126), n_jobs=2)
        br_par.train(Dataset(self.X_train, self.Y_train))

        assert_array_equal(br.predict(self.X_test).astype(int),
                           br_par.predict(self.X_test).astype(int))
Example #4
def main():
    # Specify the parameters here:
    # path to your binary classification dataset
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'diabetes.txt')
    test_size = 0.33    # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 10      # number of samples that are initially labeled

    # Load dataset
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
        split_train_test(dataset_filepath, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    trn_ds3 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled    # number of samples to query
    batch_size = 5

    # Compare the UncertaintySampling, RandomSampling, and KCenterGreedy strategies.
    # model is the base learner, e.g. LogisticRegression, SVM, etc.
    # qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression(), n=batch_size)
    qs = US_dev(trn_ds, method='lc', model=LogisticRegression())
    model = LogisticRegression()
    E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota, batch_size)

    # qs2 = RandomSampling(trn_ds2, n=batch_size)
    qs2 = RS_dev(trn_ds2)
    model = LogisticRegression()
    E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota, batch_size)

    qs3 = KCenterGreedy(trn_ds3, transformer=None)
    model = LogisticRegression()
    E_in_3, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota, batch_size)

    # Plot the learning curves of the three strategies.
    # The x-axis is the number of queries, and the y-axis is the corresponding
    # error rate.
    assert len(E_in_1) == len(E_in_2)
    query_num = np.arange(1, len(E_in_1) + 1)
    plt.plot(query_num, E_in_1, 'lightcoral', label='qs Ein')
    plt.plot(query_num, E_in_2, 'lightgreen', label='random Ein')
    plt.plot(query_num, E_in_3, 'lightsteelblue', label='k-center-greedy Ein')
    plt.plot(query_num, E_out_1, 'r', label='qs Eout')
    plt.plot(query_num, E_out_2, 'g', label='random Eout')
    plt.plot(query_num, E_out_3, 'b', label='k-center-greedy Eout')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
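The run helper used in this example is defined elsewhere in the script. A plausible sketch, assuming it follows libact's learning-curve demo but labels batch_size queried samples per round before retraining and recording the in-sample (E_in) and out-of-sample (E_out) error rates:

def run(trn_ds, tst_ds, lbr, model, qs, quota, batch_size):
    # Assumed sketch: query and label batch_size samples per round,
    # retrain the model, then record training and test error.
    E_in, E_out = [], []
    for _ in range(0, quota, batch_size):
        for _ in range(batch_size):
            ask_id = qs.make_query()
            X, _ = zip(*trn_ds.data)
            lb = lbr.label(X[ask_id])
            trn_ds.update(ask_id, lb)
        model.train(trn_ds)
        E_in.append(1 - model.score(trn_ds))
        E_out.append(1 - model.score(tst_ds))
    return E_in, E_out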
Example #5
 def test_QueryByCommittee(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = QueryByCommittee(trn_ds,
                           models=[
                               LogisticRegression(C=1.0),
                               LogisticRegression(C=0.01),
                               LogisticRegression(C=100)
                           ],
                           random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228]))
Example #6
 def test_QueryByCommittee(self):
     random.seed(1126)
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = QueryByCommittee(trn_ds,
                           models=[
                               LogisticRegression(C=1.0),
                               LogisticRegression(C=0.01),
                               LogisticRegression(C=100)
                           ])
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([11, 207, 101, 30, 116, 108, 83, 172, 211, 42]))
Example #7
 def test_query_by_committee_kl_divergence(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = QueryByCommittee(trn_ds,
                           disagreement='kl_divergence',
                           models=[
                               LogisticRegression(C=1.0),
                               LogisticRegression(C=0.01),
                               LogisticRegression(C=100)
                           ],
                           random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([228, 111, 162, 243, 213, 122, 110, 108, 156, 37]))
Example #8
 def test_query_by_committee_vote(self):
     #self.skipTest("In this version we randomize make queries")
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = QueryByCommittee(trn_ds,
                           disagreement='vote',
                           models=[
                               LogisticRegression(C=1.0),
                               LogisticRegression(C=0.01),
                               LogisticRegression(C=100)
                           ],
                           random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq,
                        np.array([10, 12, 11, 13, 16, 14, 17, 18, 19, 21]))
Example #9
 def test_binary_minimization(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = BinaryMinimization(trn_ds, LogisticRegression(solver='liblinear', multi_class="ovr"),
                             random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq,
             np.array([936, 924, 1211, 1286, 590, 429, 404, 962, 825, 30]))
Example #10
 def test_ActiveLearningByLearning(self):
     np.random.seed(1126)
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = ActiveLearningByLearning(trn_ds,
                                   T=self.quota,
                                   query_strategies=[
                                       UncertaintySampling(
                                           trn_ds,
                                           model=LogisticRegression()),
                                       HintSVM(trn_ds)
                                   ],
                                   model=LogisticRegression())
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([103, 220, 118, 75, 176, 50, 247, 199, 46, 55]))
Example #11
    def test_binary_relevance_lr(self):
        br = BinaryRelevance(base_clf=LogisticRegression(random_state=1126))
        br.train(Dataset(self.X_train, self.Y_train))

        br_pred_train = br.predict(self.X_train).astype(int)
        br_pred_test = br.predict(self.X_test).astype(int)

        br_pred_proba_train = br.predict_proba(self.X_train).astype(float)
        br_pred_proba_test = br.predict_proba(self.X_test).astype(float)

        for i in range(np.shape(self.Y_train)[1]):
            clf = sklearn.linear_model.LogisticRegression(random_state=1126)
            clf.fit(self.X_train, self.Y_train[:, i])

            assert_array_equal(clf.predict(self.X_train).astype(int),
                               br_pred_train[:, i])
            assert_array_equal(clf.predict(self.X_test).astype(int),
                               br_pred_test[:, i])

            assert_array_equal(clf.predict_proba(self.X_train)[:, 1].astype(float),
                               br_pred_proba_train[:, i])
            assert_array_equal(clf.predict_proba(self.X_test)[:, 1].astype(float),
                               br_pred_proba_test[:, i])

        self.assertEqual(
            np.mean(np.abs(self.Y_test - br_pred_test).mean(axis=1)),
            br.score(Dataset(self.X_test, self.Y_test), 'hamming'))

        self.assertRaises(NotImplementedError,
                lambda: br.score(Dataset(self.X_test, self.Y_test),
                                 criterion='not_exist'))
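The 'hamming' criterion checked at the end is the Hamming loss: the fraction of label positions where the prediction disagrees with the ground truth, averaged over samples, which is exactly the np.mean(...) expression the test compares against. A tiny worked illustration with hypothetical arrays (not the test's data):

import numpy as np

# Two samples, three labels each; one wrong label per sample.
Y_true = np.array([[1, 0, 1], [0, 1, 0]])
Y_pred = np.array([[1, 1, 1], [0, 1, 1]])
hamming = np.abs(Y_true - Y_pred).mean(axis=1).mean()  # (1/3 + 1/3) / 2 = 1/3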
Example #12
 def test_eer_01(self):
     ds = Dataset(self.X + self.X_pool,
                  self.y[:3] + [None for _ in range(len(self.X_pool))])
     qs = EER(ds, LogisticRegression(), loss='01', random_state=1126)
     qseq = run_qs(ds, qs, self.y_truth, self.quota)
     assert_array_equal(
         qseq, np.array([105, 16, 131, 117, 109, 148, 136, 115, 144, 121]))
Example #13
 def test_eer(self):
     ds = Dataset(self.X + self.X_pool,
                  self.y[:3] + [None for _ in range(len(self.X_pool))])
     qs = EER(ds, LogisticRegression(), random_state=1126)
     qseq = run_qs(ds, qs, self.y_truth, self.quota)
     assert_array_equal(
         qseq, np.array([131, 20, 129, 78, 22, 139, 88, 43, 141, 133]))
Example #14
    def __init__(self, *args, **kwargs):
        super(MaximumLossReductionMaximalConfidence,
              self).__init__(*args, **kwargs)

        # self.n_labels = len(self.dataset.get_labeled_entries()[0][1])
        self.n_labels = len(self.dataset.get_labeled_entries()[1][0])

        random_state = kwargs.pop('random_state', None)
        self.random_state_ = seed_random_state(random_state)

        self.logreg_param = kwargs.pop(
            'logreg_param', {
                'multi_class': 'multinomial',
                'solver': 'newton-cg',
                'random_state': random_state
            })
        self.logistic_regression_ = LogisticRegression(**self.logreg_param)

        self.br_base = kwargs.pop(
            'br_base',
            SklearnProbaAdapter(
                SVC(kernel='linear',
                    probability=True,
                    gamma="auto",
                    random_state=random_state)))
Example #15
def score_per_add_al(labeled_pool_df, base_training_df, validation_data_df):
    # type: (DataFrame, DataFrame, DataFrame) -> tuple

    gen_pool_df = labeled_pool_df.copy(deep=True)
    gen_pool_df[cn.col_names.tag] = [np.NaN] * len(
        gen_pool_df)  # clear all tags
    enriched_train_df = pd.concat([base_training_df, gen_pool_df],
                                  ignore_index=True)

    extractor = cn.Feature_Extractor(
        enriched_train_df, cn.col_names)  # build the feature extractor

    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    scoring_fun = lambda ds: run_classifier(ds.extract_labeled_dataframe(),
                                            validation_data_df)
    ex_added_list, res_list = run_active_learning(
        trn_ds, scoring_fun, lbr, qs, len(enriched_train_df))  # label all df

    return ex_added_list, res_list
Example #16
    def libact_first_try_second_run(self, enriched_train_df, extractor,
                                    ideal_df, lbr, quota, validation_data_df,
                                    return_dict):

        trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
        qs = UncertaintySampling(trn_ds,
                                 method='lc',
                                 model=LogisticRegression())
        E_out1 = []
        E_out1 = np.append(
            E_out1,
            run_classifier(trn_ds.extract_labeled_dataframe(),
                           validation_data_df).f1)
        for i in range(quota):
            if len(trn_ds.get_unlabeled_entries()) == 0:
                break  # finished labeling all examples
            ask_id = qs.make_query()
            lb = lbr.label(trn_ds.extract_sentence(ask_id))
            self.assertEqual(lb, ideal_df[cn.tag_col][ask_id])
            trn_ds.update(ask_id, lb)
            # model.train(trn_ds)
            E_out1 = np.append(
                E_out1,
                run_classifier(trn_ds.extract_labeled_dataframe(),
                               validation_data_df).f1)
        return_dict[2] = E_out1
Example #17
 def test_ActiveLearningByLearning(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = ActiveLearningByLearning(trn_ds,
                                   T=self.quota,
                                   query_strategies=[
                                       UncertaintySampling(
                                           trn_ds,
                                           model=LogisticRegression()),
                                       HintSVM(trn_ds, random_state=1126)
                                   ],
                                   model=LogisticRegression(),
                                   random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33]))
Example #18
 def test_cost_sensitive_random_pair_encoding(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     model = BinaryRelevance(LogisticRegression(solver='liblinear',
                                                multi_class="ovr"))
     base_model = LogisticRegression(
             solver='liblinear', multi_class="ovr", random_state=1126)
     qs = CostSensitiveReferencePairEncoding(
             trn_ds,
             scoring_fn=pairwise_f1_score,
             model=model,
             base_model=base_model,
             n_models=10,
             n_jobs=1,
             random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq,
             np.array([149, 434, 1126, 719, 983, 564, 816, 732, 101, 1242]))
Example #19
 def test_adaptive_active_learning(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = AdaptiveActiveLearning(trn_ds,
             base_clf=LogisticRegression(solver='liblinear', multi_class="ovr"), n_jobs=-1,
                                         random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq,
             np.array([594, 827, 1128, 419, 1223, 484, 96, 833, 37, 367]))
Example #20
 def build_query_strategy(sent_df, col_names):
     # type: (DataFrame, ColumnNames) -> QueryStrategy
     """
     Builds and returns a QueryStrategy
         using a feature extractor and a base_df
     """
     init_extractor = SynStateALHeuristic.build_feature_extractor(sent_df, col_names)
     combined_features = init_extractor.transform(sent_df, col_names)
     return VarianceReduction(TextDataset(sent_df, col_names, None, features=combined_features),
                              model=LogisticRegression(), sigma=0.1)
Example #21
 def test_UncertaintySamplingSm(self):
     random.seed(1126)
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = UncertaintySampling(trn_ds,
                              method='sm',
                              model=LogisticRegression())
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([145, 66, 82, 37, 194, 60, 191, 211, 245, 131]))
Example #22
 def build_query_strategy(sent_df, col_names):
     # type: (DataFrame, ColumnNames) -> QueryStrategy
     """
     Builds and returns a QueryStrategy
         using a feature extractor and a base_df
     """
     init_extractor = SynStateALHeuristic.build_feature_extractor(sent_df, col_names)
     combined_features = init_extractor.transform(sent_df, col_names)
     return UncertaintySampling(TextDataset(sent_df, col_names, None, features=combined_features),
                                method='lc', model=LogisticRegression())
Example #23
 def test_variance_reduction(self):
     trn_ds = Dataset(self.X,
                      np.concatenate([self.y[:2],
                                      [None] * (len(self.y) - 2)]))
     qs = VarianceReduction(
             trn_ds,
             model=LogisticRegression(solver='liblinear', multi_class="ovr"),
             sigma=0.1
         )
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq, np.array([4, 5, 2, 3]))
Example #24
 def test_density_weighted_meta_uncertainty_lc(self):
     trn_ds = Dataset(self.X[:20], np.concatenate([self.y[:6],
                                                   [None] * 14]))
     base_qs = UncertaintySampling(trn_ds,
                                   method='lc',
                                   model=LogisticRegression(
                                       solver='liblinear',
                                       multi_class="ovr"))
     similarity_metric = cosine_similarity
     clustering_method = KMeans(n_clusters=3, random_state=1126)
     qs = DensityWeightedMeta(dataset=trn_ds,
                              base_query_strategy=base_qs,
                              similarity_metric=similarity_metric,
                              clustering_method=clustering_method,
                              beta=1.0,
                              random_state=1126)
     model = LogisticRegression(solver='liblinear', multi_class="ovr")
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq,
                        np.array([13, 18, 9, 12, 8, 16, 10, 19, 15, 17]))
Example #25
 def test_query_by_committee_vote(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = QueryByCommittee(trn_ds,
                           disagreement='vote',
                           models=[
                               LogisticRegression(C=1.0,
                                                  solver="liblinear",
                                                  multi_class="ovr"),
                               LogisticRegression(C=0.01,
                                                  solver="liblinear",
                                                  multi_class="ovr"),
                               LogisticRegression(C=100,
                                                  solver="liblinear",
                                                  multi_class="ovr")
                           ],
                           random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228]))
Example #26
 def test_multilabel_with_auxiliary_learner_hlr(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = MultilabelWithAuxiliaryLearner(
         trn_ds,
         major_learner=BinaryRelevance(LogisticRegression()),
         auxiliary_learner=BinaryRelevance(SVM()),
         criterion='hlr',
         random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq,
         np.array([701, 1403, 147, 897, 974, 1266, 870, 703, 292, 1146]))
Example #27
 def test_multilabel_with_auxiliary_learner_mmr(self):
     trn_ds = Dataset(self.X,
                      self.y[:5] + [None] * (len(self.y) - 5))
     qs = MultilabelWithAuxiliaryLearner(trn_ds,
             major_learner=BinaryRelevance(LogisticRegression(solver='liblinear',
                                                              multi_class="ovr")),
             auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
             criterion='mmr',
             random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq,
             np.array([1258, 1461, 231, 1198, 1498, 1374, 955, 1367, 265, 144]))
Example #28
    def test_logistic_regression(self):
        clf = sklearn.linear_model.LogisticRegression()
        clf.fit(self.X_train, self.y_train)
        lr = LogisticRegression()
        lr.train(Dataset(self.X_train, self.y_train))

        assert_array_equal(clf.predict(self.X_train), lr.predict(self.X_train))
        assert_array_equal(clf.predict(self.X_test), lr.predict(self.X_test))
        self.assertEqual(clf.score(self.X_train, self.y_train),
                         lr.score(Dataset(self.X_train, self.y_train)))
        self.assertEqual(clf.score(self.X_test, self.y_test),
                         lr.score(Dataset(self.X_test, self.y_test)))
Example #29
    def libact_first_try_first_run(self, enriched_train_df, extractor, lbr,
                                   quota, validation_data_df, return_dict):

        trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
        qs = UncertaintySampling(trn_ds,
                                 method='lc',
                                 model=LogisticRegression())
        scoring_fun = lambda ds: run_classifier(ds.extract_labeled_dataframe(),
                                                validation_data_df).f1
        query_num, E_out1 = run_active_learning(trn_ds, scoring_fun, lbr, qs,
                                                quota)
        return_dict[1] = E_out1
Example #30
 def test_multilabel_with_auxiliary_learner_shlr(self):
     trn_ds = Dataset(self.X,
                      self.y[:5] + [None] * (len(self.y) - 5))
     qs = MultilabelWithAuxiliaryLearner(trn_ds,
             major_learner=BinaryRelevance(LogisticRegression(solver='liblinear',
                                                              multi_class="ovr")),
             auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
             criterion='shlr',
             b=1.,
             random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq,
             np.array([1258, 805, 459, 550, 783, 964, 736, 1004, 38, 750]))