def test_uncertainty_entropy(self):
    """Entropy-based uncertainty sampling queries ids 6..9 on the toy set."""
    toy_ds = init_toyexample(self.X, self.y)
    strategy = UncertaintySampling(
        toy_ds, method='entropy', model=LogisticRegression())
    learner = LogisticRegression()
    queried = run_qs(toy_ds, self.lbr, learner, strategy, self.quota)
    assert_array_equal(queried, np.array([6, 7, 8, 9]))
def test_uncertainty_sm(self):
    """Smallest-margin uncertainty sampling queries ids 6..9 on the toy set."""
    toy_ds = init_toyexample(self.X, self.y)
    strategy = UncertaintySampling(
        toy_ds,
        method='sm',
        model=LogisticRegression(solver='liblinear', multi_class="ovr"))
    learner = LogisticRegression(solver='liblinear', multi_class="ovr")
    queried = run_qs(toy_ds, self.lbr, learner, strategy, self.quota)
    assert_array_equal(queried, np.array([6, 7, 8, 9]))
def test_binary_relevance_parallel(self):
    """Parallel (n_jobs=2) BinaryRelevance must predict identically to serial."""
    serial = BinaryRelevance(
        base_clf=LogisticRegression(random_state=1126), n_jobs=1)
    serial.train(Dataset(self.X_train, self.Y_train))

    parallel = BinaryRelevance(
        base_clf=LogisticRegression(random_state=1126), n_jobs=2)
    parallel.train(Dataset(self.X_train, self.Y_train))

    assert_array_equal(serial.predict(self.X_test).astype(int),
                       parallel.predict(self.X_test).astype(int))
def main():
    """Compare three query strategies (uncertainty sampling, random sampling,
    k-center-greedy) on a binary classification dataset and plot the in-sample
    and out-of-sample error learning curves.
    """
    # Specify the parameters here:
    # path to your binary classification dataset
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'diabetes.txt')
    test_size = 0.33    # the percentage of samples in the dataset that will be
                        # randomly selected and assigned to the test set
    n_labeled = 10      # number of samples that are initially labeled

    # Load dataset
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
        split_train_test(dataset_filepath, test_size, n_labeled)
    # Independent copies so each strategy queries its own pool.
    trn_ds2 = copy.deepcopy(trn_ds)
    trn_ds3 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled    # number of samples to query
    batch_size = 5

    # Comparing UncertaintySampling strategy with RandomSampling.
    # model is the base learner, e.g. LogisticRegression, SVM ... etc.
    qs = US_dev(trn_ds, method='lc', model=LogisticRegression())
    model = LogisticRegression()
    E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota, batch_size)

    qs2 = RS_dev(trn_ds2)
    model = LogisticRegression()
    E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota, batch_size)

    qs3 = KCenterGreedy(trn_ds3, transformer=None)
    model = LogisticRegression()
    E_in_3, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota, batch_size)

    # Plot the learning curve of UncertaintySampling to RandomSampling.
    # The x-axis is the number of queries, and the y-axis is the corresponding
    # error rate.
    assert len(E_in_1) == len(E_in_2)
    query_num = np.arange(1, len(E_in_1) + 1)
    plt.plot(query_num, E_in_1, 'lightcoral', label='qs Ein')
    plt.plot(query_num, E_in_2, 'lightgreen', label='random Ein')
    plt.plot(query_num, E_in_3, 'lightsteelblue', label='k-center-greedy Ein')
    plt.plot(query_num, E_out_1, 'r', label='qs Eout')
    plt.plot(query_num, E_out_2, 'g', label='random Eout')
    plt.plot(query_num, E_out_3, 'b', label='k-center-greedy Eout')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def test_QueryByCommittee(self):
    """A seeded three-member LR committee yields a fixed query sequence."""
    pool = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    committee = QueryByCommittee(
        pool,
        models=[
            LogisticRegression(C=1.0),
            LogisticRegression(C=0.01),
            LogisticRegression(C=100),
        ],
        random_state=1126)
    queried = run_qs(pool, committee, self.y, self.quota)
    assert_array_equal(
        queried,
        np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228]))
def test_QueryByCommittee(self):
    """A three-member LR committee under a seeded global RNG yields a fixed
    query sequence (this variant relies on random.seed rather than the
    strategy's random_state argument).
    """
    random.seed(1126)
    trn_ds = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    qs = QueryByCommittee(
        trn_ds,
        models=[
            LogisticRegression(C=1.0),
            LogisticRegression(C=0.01),
            LogisticRegression(C=100),
        ])
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq, np.array([11, 207, 101, 30, 116, 108, 83, 172, 211, 42]))
def test_query_by_committee_kl_divergence(self):
    """KL-divergence disagreement with a seeded LR committee is deterministic."""
    pool = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    committee = QueryByCommittee(
        pool,
        disagreement='kl_divergence',
        models=[
            LogisticRegression(C=1.0),
            LogisticRegression(C=0.01),
            LogisticRegression(C=100),
        ],
        random_state=1126)
    queried = run_qs(pool, committee, self.y, self.quota)
    assert_array_equal(
        queried,
        np.array([228, 111, 162, 243, 213, 122, 110, 108, 156, 37]))
def test_query_by_committee_vote(self):
    """Vote-entropy disagreement with a seeded LR committee is deterministic."""
    trn_ds = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    qs = QueryByCommittee(
        trn_ds,
        disagreement='vote',
        models=[
            LogisticRegression(C=1.0),
            LogisticRegression(C=0.01),
            LogisticRegression(C=100),
        ],
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(qseq,
                       np.array([10, 12, 11, 13, 16, 14, 17, 18, 19, 21]))
def test_binary_minimization(self):
    """Seeded BinaryMinimization on a mostly-unlabeled pool is deterministic."""
    pool = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    strategy = BinaryMinimization(
        pool,
        LogisticRegression(solver='liblinear', multi_class="ovr"),
        random_state=1126)
    queried = run_qs(pool, strategy, self.y, self.quota)
    assert_array_equal(
        queried,
        np.array([936, 924, 1211, 1286, 590, 429, 404, 962, 825, 30]))
def test_ActiveLearningByLearning(self):
    """ALBL over uncertainty sampling + HintSVM, seeded via the global numpy
    RNG, produces a fixed query sequence."""
    np.random.seed(1126)
    pool = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    albl = ActiveLearningByLearning(
        pool,
        T=self.quota,
        query_strategies=[
            UncertaintySampling(pool, model=LogisticRegression()),
            HintSVM(pool),
        ],
        model=LogisticRegression())
    queried = run_qs(pool, albl, self.y, self.quota)
    assert_array_equal(
        queried, np.array([103, 220, 118, 75, 176, 50, 247, 199, 46, 55]))
def test_binary_relevance_lr(self):
    """BinaryRelevance with LR must match an independently fitted sklearn
    LogisticRegression per label, and its hamming score must equal the mean
    per-sample label disagreement."""
    br = BinaryRelevance(base_clf=LogisticRegression(random_state=1126))
    br.train(Dataset(self.X_train, self.Y_train))

    pred_train = br.predict(self.X_train).astype(int)
    pred_test = br.predict(self.X_test).astype(int)
    proba_train = br.predict_proba(self.X_train).astype(float)
    proba_test = br.predict_proba(self.X_test).astype(float)

    for label_idx in range(np.shape(self.Y_train)[1]):
        # Reference: one sklearn LR fitted on this label column alone.
        ref_clf = sklearn.linear_model.LogisticRegression(random_state=1126)
        ref_clf.fit(self.X_train, self.Y_train[:, label_idx])
        assert_array_equal(ref_clf.predict(self.X_train).astype(int),
                           pred_train[:, label_idx])
        assert_array_equal(ref_clf.predict(self.X_test).astype(int),
                           pred_test[:, label_idx])
        assert_array_equal(
            ref_clf.predict_proba(self.X_train)[:, 1].astype(float),
            proba_train[:, label_idx])
        assert_array_equal(
            ref_clf.predict_proba(self.X_test)[:, 1].astype(float),
            proba_test[:, label_idx])

    self.assertEqual(
        np.mean(np.abs(self.Y_test - pred_test).mean(axis=1)),
        br.score(Dataset(self.X_test, self.Y_test), 'hamming'))
    # An unknown scoring criterion must raise NotImplementedError.
    self.assertRaises(
        NotImplementedError,
        lambda: br.score(Dataset(self.X_test, self.Y_test),
                         criterion='not_exist'))
def test_eer_01(self):
    """Seeded EER with 0/1 loss produces a fixed query sequence."""
    pool = Dataset(self.X + self.X_pool,
                   self.y[:3] + [None for _ in range(len(self.X_pool))])
    strategy = EER(pool, LogisticRegression(), loss='01', random_state=1126)
    queried = run_qs(pool, strategy, self.y_truth, self.quota)
    assert_array_equal(
        queried,
        np.array([105, 16, 131, 117, 109, 148, 136, 115, 144, 121]))
def test_eer(self):
    """Seeded EER with its default loss produces a fixed query sequence."""
    pool = Dataset(self.X + self.X_pool,
                   self.y[:3] + [None for _ in range(len(self.X_pool))])
    strategy = EER(pool, LogisticRegression(), random_state=1126)
    queried = run_qs(pool, strategy, self.y_truth, self.quota)
    assert_array_equal(
        queried,
        np.array([131, 20, 129, 78, 22, 139, 88, 43, 141, 133]))
def __init__(self, *args, **kwargs):
    """Initialize the MMC strategy.

    The following optional keyword arguments are consumed here (any others
    are forwarded to the parent class):

    random_state : int or None
        Seed for the internal random state and the default sub-models.
    logreg_param : dict
        Constructor parameters for the label-count logistic regression
        (defaults to multinomial / newton-cg).
    br_base : probabilistic classifier
        Base learner for the binary-relevance step; defaults to a linear
        SVC wrapped to expose predict_proba.
    """
    super(MaximumLossReductionMaximalConfidence, self).__init__(*args, **kwargs)

    # Number of labels, inferred from the first labeled entry's label vector.
    self.n_labels = len(self.dataset.get_labeled_entries()[1][0])

    # NOTE(review): kwargs is popped *after* being forwarded to super();
    # this assumes the parent tolerates/ignores these keys — confirm.
    random_state = kwargs.pop('random_state', None)
    self.random_state_ = seed_random_state(random_state)

    self.logreg_param = kwargs.pop(
        'logreg_param',
        {'multi_class': 'multinomial', 'solver': 'newton-cg',
         'random_state': random_state})
    self.logistic_regression_ = LogisticRegression(**self.logreg_param)

    self.br_base = kwargs.pop(
        'br_base',
        SklearnProbaAdapter(
            SVC(kernel='linear', probability=True, gamma="auto",
                random_state=random_state)))
def score_per_add_al(labeled_pool_df, base_training_df, validation_data_df):
    # type: (DataFrame, DataFrame, DataFrame) -> tuple
    """Run uncertainty-sampling active learning over the (tag-cleared) pool
    and score the classifier on the validation set after each addition.

    Returns the (examples_added, scores) lists from run_active_learning.
    """
    gen_pool_df = labeled_pool_df.copy(deep=True)
    # Clear all tags; np.nan replaces np.NaN, which was removed in NumPy 2.0.
    gen_pool_df[cn.col_names.tag] = [np.nan] * len(gen_pool_df)
    enriched_train_df = pd.concat([base_training_df, gen_pool_df],
                                  ignore_index=True)
    # Build the feature extractor over the combined frame.
    extractor = cn.Feature_Extractor(enriched_train_df, cn.col_names)
    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    # The ideal labeler knows the true tags of the whole pool.
    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    def scoring_fun(ds):
        # Score by training a classifier on whatever is labeled so far.
        return run_classifier(ds.extract_labeled_dataframe(),
                              validation_data_df)

    ex_added_list, res_list = run_active_learning(
        trn_ds, scoring_fun, lbr, qs, len(enriched_train_df))  # label all df
    return ex_added_list, res_list
def libact_first_try_second_run(self, enriched_train_df, extractor, ideal_df,
                                lbr, quota, validation_data_df, return_dict):
    """Query up to `quota` examples with uncertainty sampling, checking each
    label against `ideal_df`, and store the f1-after-each-step array in
    return_dict[2].
    """
    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    # Accumulate f1 scores in a plain list; np.append inside the loop copies
    # the whole array every iteration (quadratic).
    scores = [run_classifier(trn_ds.extract_labeled_dataframe(),
                             validation_data_df).f1]
    for _ in range(quota):
        if len(trn_ds.get_unlabeled_entries()) == 0:
            break  # finished labeling all examples
        ask_id = qs.make_query()
        lb = lbr.label(trn_ds.extract_sentence(ask_id))
        # The labeler must agree with the known-ideal tag.
        self.assertEqual(lb, ideal_df[cn.tag_col][ask_id])
        trn_ds.update(ask_id, lb)
        scores.append(run_classifier(trn_ds.extract_labeled_dataframe(),
                                     validation_data_df).f1)
    return_dict[2] = np.asarray(scores)
def test_ActiveLearningByLearning(self):
    """ALBL seeded via random_state (strategy and HintSVM) is deterministic."""
    pool = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    albl = ActiveLearningByLearning(
        pool,
        T=self.quota,
        query_strategies=[
            UncertaintySampling(pool, model=LogisticRegression()),
            HintSVM(pool, random_state=1126),
        ],
        model=LogisticRegression(),
        random_state=1126)
    queried = run_qs(pool, albl, self.y, self.quota)
    assert_array_equal(
        queried,
        np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33]))
def test_cost_sensitive_random_pair_encoding(self):
    """Seeded CSRPE with pairwise-f1 scoring produces a fixed query sequence."""
    pool = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    major_model = BinaryRelevance(
        LogisticRegression(solver='liblinear', multi_class="ovr"))
    base_model = LogisticRegression(
        solver='liblinear', multi_class="ovr", random_state=1126)
    strategy = CostSensitiveReferencePairEncoding(
        pool,
        scoring_fn=pairwise_f1_score,
        model=major_model,
        base_model=base_model,
        n_models=10,
        n_jobs=1,
        random_state=1126)
    queried = run_qs(pool, strategy, self.y, self.quota)
    assert_array_equal(
        queried,
        np.array([149, 434, 1126, 719, 983, 564, 816, 732, 101, 1242]))
def test_adaptive_active_learning(self):
    """Seeded AdaptiveActiveLearning produces a fixed query sequence."""
    pool = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    strategy = AdaptiveActiveLearning(
        pool,
        base_clf=LogisticRegression(solver='liblinear', multi_class="ovr"),
        n_jobs=-1,
        random_state=1126)
    queried = run_qs(pool, strategy, self.y, self.quota)
    assert_array_equal(
        queried,
        np.array([594, 827, 1128, 419, 1223, 484, 96, 833, 37, 367]))
def build_query_strategy(sent_df, col_names):
    # type: (DataFrame, ColumnNames) -> QueryStrategy
    """Build a VarianceReduction QueryStrategy from a feature extractor
    fitted on `sent_df`.
    """
    extractor = SynStateALHeuristic.build_feature_extractor(sent_df,
                                                            col_names)
    features = extractor.transform(sent_df, col_names)
    text_ds = TextDataset(sent_df, col_names, None, features=features)
    return VarianceReduction(text_ds, model=LogisticRegression(), sigma=0.1)
def test_UcertaintySamplingSm(self):
    """Smallest-margin uncertainty sampling under a seeded global RNG is
    deterministic."""
    random.seed(1126)
    pool = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    strategy = UncertaintySampling(
        pool, method='sm', model=LogisticRegression())
    queried = run_qs(pool, strategy, self.y, self.quota)
    assert_array_equal(
        queried,
        np.array([145, 66, 82, 37, 194, 60, 191, 211, 245, 131]))
def build_query_strategy(sent_df, col_names):
    # type: (DataFrame, ColumnNames) -> QueryStrategy
    """Build a least-confidence UncertaintySampling QueryStrategy from a
    feature extractor fitted on `sent_df`.
    """
    extractor = SynStateALHeuristic.build_feature_extractor(sent_df,
                                                            col_names)
    features = extractor.transform(sent_df, col_names)
    text_ds = TextDataset(sent_df, col_names, None, features=features)
    return UncertaintySampling(text_ds, method='lc',
                               model=LogisticRegression())
def test_variance_reduction(self):
    """VarianceReduction with two labeled seeds queries ids 4, 5, 2, 3."""
    pool = Dataset(
        self.X,
        np.concatenate([self.y[:2], [None] * (len(self.y) - 2)]))
    strategy = VarianceReduction(
        pool,
        model=LogisticRegression(solver='liblinear', multi_class="ovr"),
        sigma=0.1)
    queried = run_qs(pool, strategy, self.y, self.quota)
    assert_array_equal(queried, np.array([4, 5, 2, 3]))
def test_density_weighted_meta_uncertainty_lc(self):
    """Seeded DensityWeightedMeta wrapping least-confidence uncertainty
    sampling produces a fixed query sequence on a 20-sample pool."""
    trn_ds = Dataset(self.X[:20],
                     np.concatenate([self.y[:6], [None] * 14]))
    base_qs = UncertaintySampling(
        trn_ds, method='lc',
        model=LogisticRegression(solver='liblinear', multi_class="ovr"))
    similarity_metric = cosine_similarity
    clustering_method = KMeans(n_clusters=3, random_state=1126)
    qs = DensityWeightedMeta(
        dataset=trn_ds,
        base_query_strategy=base_qs,
        similarity_metric=similarity_metric,
        clustering_method=clustering_method,
        beta=1.0,
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(qseq,
                       np.array([13, 18, 9, 12, 8, 16, 10, 19, 15, 17]))
def test_query_by_committee_vote(self):
    """Vote disagreement with a seeded liblinear/ovr LR committee is
    deterministic."""
    pool = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    members = [
        LogisticRegression(C=1.0, solver="liblinear", multi_class="ovr"),
        LogisticRegression(C=0.01, solver="liblinear", multi_class="ovr"),
        LogisticRegression(C=100, solver="liblinear", multi_class="ovr"),
    ]
    committee = QueryByCommittee(pool, disagreement='vote', models=members,
                                 random_state=1126)
    queried = run_qs(pool, committee, self.y, self.quota)
    assert_array_equal(
        queried,
        np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228]))
def test_multilabel_with_auxiliary_learner_hlr(self):
    """Auxiliary-learner strategy with the 'hlr' criterion is deterministic."""
    pool = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    strategy = MultilabelWithAuxiliaryLearner(
        pool,
        major_learner=BinaryRelevance(LogisticRegression()),
        auxiliary_learner=BinaryRelevance(SVM()),
        criterion='hlr',
        random_state=1126)
    queried = run_qs(pool, strategy, self.y, self.quota)
    assert_array_equal(
        queried,
        np.array([701, 1403, 147, 897, 974, 1266, 870, 703, 292, 1146]))
def test_multilabel_with_auxiliary_learner_mmr(self):
    """Auxiliary-learner strategy with the 'mmr' criterion is deterministic."""
    pool = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    strategy = MultilabelWithAuxiliaryLearner(
        pool,
        major_learner=BinaryRelevance(
            LogisticRegression(solver='liblinear', multi_class="ovr")),
        auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
        criterion='mmr',
        random_state=1126)
    queried = run_qs(pool, strategy, self.y, self.quota)
    assert_array_equal(
        queried,
        np.array([1258, 1461, 231, 1198, 1498, 1374, 955, 1367, 265, 144]))
def test_logistic_regression(self):
    """libact's LogisticRegression wrapper must mirror sklearn's estimator
    on predictions and accuracy scores."""
    sk_clf = sklearn.linear_model.LogisticRegression()
    sk_clf.fit(self.X_train, self.y_train)

    wrapped = LogisticRegression()
    wrapped.train(Dataset(self.X_train, self.y_train))

    for X in (self.X_train, self.X_test):
        assert_array_equal(sk_clf.predict(X), wrapped.predict(X))
    self.assertEqual(sk_clf.score(self.X_train, self.y_train),
                     wrapped.score(Dataset(self.X_train, self.y_train)))
    self.assertEqual(sk_clf.score(self.X_test, self.y_test),
                     wrapped.score(Dataset(self.X_test, self.y_test)))
def libact_first_try_first_run(self, enriched_train_df, extractor, lbr, quota,
                               validation_data_df, return_dict):
    """Run uncertainty-sampling active learning for `quota` queries and store
    the resulting f1-score sequence in return_dict[1].
    """
    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    def scoring_fun(ds):
        # f1 of a classifier trained on the currently-labeled examples.
        return run_classifier(ds.extract_labeled_dataframe(),
                              validation_data_df).f1

    # The query-count sequence is not needed here, only the scores.
    _, E_out1 = run_active_learning(trn_ds, scoring_fun, lbr, qs, quota)
    return_dict[1] = E_out1
def test_multilabel_with_auxiliary_learner_shlr(self):
    """Auxiliary-learner strategy with the 'shlr' criterion (b=1) is
    deterministic."""
    pool = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    strategy = MultilabelWithAuxiliaryLearner(
        pool,
        major_learner=BinaryRelevance(
            LogisticRegression(solver='liblinear', multi_class="ovr")),
        auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
        criterion='shlr',
        b=1.,
        random_state=1126)
    queried = run_qs(pool, strategy, self.y, self.quota)
    assert_array_equal(
        queried,
        np.array([1258, 805, 459, 550, 783, 964, 736, 1004, 38, 750]))