def libact_QBC(X, y, n_queries):
    """Run ``n_queries`` query-by-committee rounds with libact on ``(X, y)``.

    The pool starts with only samples 0, 50 and 100 labelled (as 0, 1 and 2
    respectively); all other labels are ``None``. Each round asks the query
    strategy for an index, obtains its label from an ``IdealLabeler`` built
    on the fully labelled dataset, writes it into the pool and retrains both
    committee members. Nothing is returned.
    """
    # Object array of None marks every sample as unlabelled to begin with.
    initial_labels = np.array([None] * len(y))
    for position, seed_label in ((0, 0), (50, 1), (100, 2)):
        initial_labels[position] = seed_label

    pool = Dataset(X, initial_labels)
    ground_truth = Dataset(X, y)

    # Two identically configured committee members.
    committee = [
        LogisticRegressionLibact(solver='liblinear',
                                 n_jobs=1,
                                 multi_class='ovr')
        for _ in range(2)
    ]
    strategy = QueryByCommittee(pool, models=committee, method='lc')
    oracle = IdealLabeler(ground_truth)

    def retrain_committee():
        for member in committee:
            member.train(pool)

    retrain_committee()

    remaining = n_queries
    while remaining > 0:
        picked = strategy.make_query()
        pool.update(picked, oracle.label(X[picked]))
        retrain_committee()
        remaining -= 1
示例#2
0
 def test_QueryByCommittee(self):
     """Check QueryByCommittee (default disagreement) against a fixed query sequence."""
     # Only the first ten samples keep their labels; the rest are None.
     unlabeled_tail = [None] * (len(self.y) - 10)
     pool = Dataset(self.X, np.concatenate([self.y[:10], unlabeled_tail]))
     committee = [LogisticRegression(C=strength)
                  for strength in (1.0, 0.01, 100)]
     strategy = QueryByCommittee(pool, models=committee, random_state=1126)
     asked = run_qs(pool, strategy, self.y, self.quota)
     expected = np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228])
     assert_array_equal(asked, expected)
示例#3
0
 def test_query_by_committee_kl_divergence(self):
     """Check QueryByCommittee with ``kl_divergence`` against a fixed query sequence."""
     # Only the first ten samples keep their labels; the rest are None.
     unlabeled_tail = [None] * (len(self.y) - 10)
     pool = Dataset(self.X, np.concatenate([self.y[:10], unlabeled_tail]))
     committee = [LogisticRegression(C=strength)
                  for strength in (1.0, 0.01, 100)]
     strategy = QueryByCommittee(pool,
                                 disagreement='kl_divergence',
                                 models=committee,
                                 random_state=1126)
     asked = run_qs(pool, strategy, self.y, self.quota)
     expected = np.array([228, 111, 162, 243, 213, 122, 110, 108, 156, 37])
     assert_array_equal(asked, expected)
示例#4
0
 def test_query_by_committee_vote(self):
     """Check QueryByCommittee with ``vote`` disagreement against a fixed query sequence."""
     # Only the first ten samples keep their labels; the rest are None.
     unlabeled_tail = [None] * (len(self.y) - 10)
     pool = Dataset(self.X, np.concatenate([self.y[:10], unlabeled_tail]))
     committee = [LogisticRegression(C=strength)
                  for strength in (1.0, 0.01, 100)]
     strategy = QueryByCommittee(pool,
                                 disagreement='vote',
                                 models=committee,
                                 random_state=1126)
     asked = run_qs(pool, strategy, self.y, self.quota)
     expected = np.array([10, 12, 11, 13, 16, 14, 17, 18, 19, 21])
     assert_array_equal(asked, expected)
示例#5
0
 def test_query_by_committee_vote(self):
     """Check ``vote`` QueryByCommittee with liblinear/ovr members against a fixed sequence."""
     # Only the first ten samples keep their labels; the rest are None.
     unlabeled_tail = [None] * (len(self.y) - 10)
     pool = Dataset(self.X, np.concatenate([self.y[:10], unlabeled_tail]))
     # Same three regularisation strengths, each pinned to liblinear + ovr.
     committee = [
         LogisticRegression(C=strength, solver="liblinear", multi_class="ovr")
         for strength in (1.0, 0.01, 100)
     ]
     strategy = QueryByCommittee(pool,
                                 disagreement='vote',
                                 models=committee,
                                 random_state=1126)
     asked = run_qs(pool, strategy, self.y, self.quota)
     expected = np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228])
     assert_array_equal(asked, expected)
示例#6
0
def initialQuerySetup(train_dataset,
                      queryStrategyID,
                      queryParams=None,
                      fixRandomState=False):
    """Build the query strategy selected by ``queryStrategyID``.

    Supported IDs and the ``queryParams`` entries each one reads:

    * 0 -- ``RandomSampling`` (reads no parameters)
    * 1 -- ``UncertaintySampling`` with ``method='sm'``; ``queryParams[0]``
      is passed as ``model``
    * 2 -- ``QueryByCommittee`` with ``disagreement='vote'``;
      ``queryParams[0]`` is passed as ``models``
    * 3 -- ``RandomBatchQuery``; ``queryParams[0]`` is passed as
      ``batch_size``
    * 4 -- ``LeastCertainBatchQuery``; ``queryParams[0]`` is ``model`` and
      ``queryParams[1]`` is ``batch_size``
    * 5 -- ``SemiSupervisedBatchQuery``; ``queryParams[0]`` is ``model`` and
      ``queryParams[1]`` is ``batch_size``

    Args:
        train_dataset: Dataset handed to the strategy constructor.
        queryStrategyID: Integer selector, see the list above.
        queryParams: Indexable holding the per-strategy arguments. It is
            indexed directly for IDs 1-5, so it must be supplied for them.
        fixRandomState: When true, strategies that accept ``random_state``
            get a fixed per-strategy seed; otherwise they get ``None``.

    Returns:
        The constructed query strategy object.

    Raises:
        ValueError: If ``queryStrategyID`` is not one of the supported IDs.
    """

    def seed(value):
        # Fixed seed only when reproducibility was requested.
        return value if fixRandomState else None

    if queryStrategyID == 0:
        return RandomSampling(train_dataset, random_state=seed(137))

    if queryStrategyID == 1:
        return UncertaintySampling(train_dataset,
                                   method='sm',
                                   model=queryParams[0])

    if queryStrategyID == 2:
        return QueryByCommittee(train_dataset,
                                models=queryParams[0],
                                disagreement='vote',
                                random_state=seed(23))

    if queryStrategyID == 3:
        return RandomBatchQuery(train_dataset,
                                batch_size=queryParams[0],
                                random_state=seed(2311))

    if queryStrategyID == 4:
        return LeastCertainBatchQuery(train_dataset,
                                      model=queryParams[0],
                                      batch_size=queryParams[1],
                                      random_state=seed(2317))

    if queryStrategyID == 5:
        return SemiSupervisedBatchQuery(train_dataset,
                                        model=queryParams[0],
                                        batch_size=queryParams[1],
                                        random_state=seed(3112))

    # Previously this fell through to ``return queryStrategy`` with the name
    # unbound, which surfaced as an UnboundLocalError.
    raise ValueError(
        "Unknown queryStrategyID %r; expected an integer from 0 to 5"
        % (queryStrategyID,))
示例#7
0
def _build_strategy_runs(trn_ds, tst_ds, model):
    """Create one run record per query strategy and score the initial model.

    The first strategy works on ``trn_ds`` itself; every later one gets its
    own ``copy.deepcopy(trn_ds)`` so the strategies do not see each other's
    labels. Each record is a dict with the keys ``label`` (console/log name),
    ``legend`` (plot legend entry), ``colour``, ``dataset``, ``strategy`` and
    ``errors`` (list of ``1 - model.score(tst_ds)`` values, one per point).
    """

    def committee():
        return [LogisticRegression(C=1.0),
                LogisticRegression(C=0.01),
                LogisticRegression(C=100)]

    specs = [
        # Uncertainty sampling queries the instances the model is least
        # certain how to label; "least confident" picks the instance whose
        # posterior probability of being positive is nearest 0.5.
        ("Uncertainty Sampling (Least confident)", 'Least Confident', 'red',
         lambda ds: UncertaintySampling(ds, method='lc',
                                        model=LogisticRegression(C=.01))),
        ("Uncertainty Sampling (Max Margin)", 'Max Margin', 'blue',
         lambda ds: USampling(ds, method='mm', model=SVM(kernel='linear'))),
        # Combination of distance-based (DIST) and diversity-based (DIV)
        # active learning.
        ("CMB Distance-Diversity Sampling", 'Distance Diversity CMB', 'green',
         lambda ds: CMBSampling(ds, model=SVM(kernel='linear'))),
        # Chooses a query at random.
        ("Random Sampling", 'Random Sampling', 'orange',
         lambda ds: RandomSampling(ds, random_state=1126)),
        # Query by committee keeps several classifiers and queries the
        # instance they disagree on. Vote entropy measures disagreement but
        # ignores the members' class distributions.
        ("QueryByCommittee (Vote Entropy)", 'Vote Entropy', 'black',
         lambda ds: QueryByCommittee(ds, disagreement='vote',
                                     models=committee(),
                                     random_state=1126)),
        # KL divergence uses the class distributions but can miss examples
        # on which the committee members disagree.
        ("QueryByCommittee (Kullback-Leibler Divergence)"
         if False else "QueryByCommittee (KL Divergence)",
         'KL Divergence', 'purple',
         lambda ds: QueryByCommittee(ds, disagreement='kl_divergence',
                                     models=committee(),
                                     random_state=1126)),
    ]

    runs = []
    for position, (label, legend, colour, build) in enumerate(specs):
        # Copy before any query has been answered so all runs start equal.
        dataset = trn_ds if position == 0 else copy.deepcopy(trn_ds)
        strategy = build(dataset)
        model.train(dataset)
        runs.append({
            "label": label,
            "legend": legend,
            "colour": colour,
            "dataset": dataset,
            "strategy": strategy,
            "errors": [1 - model.score(tst_ds)],
        })
    return runs


def _query_and_retrain(run, model, tst_ds, text_file):
    """Answer one query for ``run``, retrain ``model`` and record its error.

    The tweet text and the simulated label are each fetched once, so what is
    printed, what is written to ``text_file`` and what is stored in the
    dataset are guaranteed to be the same values.
    """
    ask_id = run["strategy"].make_query()
    print("\033[4mUsing %s :\033[0m" % run["label"])
    tweet = define_tweet_by_id(ask_id)
    print("Tweet :" + tweet, end='', flush=True)
    answer = simulate_human_decision(ask_id)
    print("Simulating human response : " + str(answer) + " \n")

    text_file.write("Using %s :\n" % run["label"])
    text_file.write("Tweet : %s \n" % str(tweet))
    text_file.write("Simulating human response : %s \n\n" % str(answer))

    run["dataset"].update(ask_id, answer)
    model.train(run["dataset"])
    run["errors"].append(1 - model.score(tst_ds))


def _compare_strategies(text_file):
    """Run the full comparison, logging to ``text_file`` and plotting live."""
    print("Loading data...")
    text_file.write("Loading data...\n")
    t0 = time.time()
    # NOTE(review): openfile_txt is defined elsewhere; if it returns an open
    # file handle it is never closed here -- confirm and close if needed.
    entries = openfile_txt(dataset_filepath)
    num_lines = sum(1 for _ in entries)
    print("Treating " + str(num_lines) + " entries...")
    text_file.write("Treating : %s entries...\n" % str(num_lines))

    # Number of queries to ask human to label
    quota = 10
    trn_ds, tst_ds = split_train_test(csv_filepath)

    # A single SVM is retrained on each strategy's dataset in turn.
    model = SVM(kernel='linear')
    runs = _build_strategy_runs(trn_ds, tst_ds, model)

    with sns.axes_style("darkgrid"):
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

    query_num = np.arange(0, 1)
    for run in runs:
        run["line"], = ax.plot(query_num, run["errors"], run["colour"])
    plt.legend(tuple(run["legend"] for run in runs), loc=1)
    # The curves hold 1 - model.score(...), so the axis is labelled as error.
    plt.ylabel('Error')
    plt.xlabel('Number of Queries')
    plt.title('Active Learning - Query choice strategies')
    plt.ylim([0, 1])
    plt.show(block=False)

    for i in range(quota):
        print("\n#################################################")
        print("Query number " + str(i) + " : ")
        print("#################################################\n")
        text_file.write("\n#################################################\n")
        text_file.write("Query number %s : " % str(i))
        text_file.write("\n#################################################\n")

        for run in runs:
            _query_and_retrain(run, model, tst_ds, text_file)

        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(run["errors"]) for run in runs) + 0.2))
        query_num = np.arange(0, i + 2)
        for run in runs:
            run["line"].set_xdata(query_num)
            run["line"].set_ydata(run["errors"])

        plt.draw()

    time_total = time.time() - t0
    print("\n\n\n#################################################\n")
    print("Execution time : %fs \n\n" % time_total)
    text_file.write("\n\n\n#################################################\n")
    text_file.write("Execution time : %fs \n" % time_total)


def main():
    """Compare six query strategies on the tweet corpus and plot their error.

    Sets the module-level path and vector globals, writes a timestamped
    ``task_<time>.txt`` log, runs ten query rounds per strategy with a
    simulated human labeller, and finally saves the plot as
    ``task_<time>.png`` after the user presses a key.
    """
    global pos_filepath, dataset_filepath, csv_filepath, vectors_list, ids_list
    dataset_filepath = "/Users/dndesign/Desktop/active_learning/vecteurs_et_infos/vectors_2015.txt"
    csv_filepath = "/Users/dndesign/Desktop/active_learning/donnees/corpus_2015_id-time-text.csv"
    pos_filepath = "/Users/dndesign/Desktop/active_learning/donnees/oriane_pos_id-time-text.csv"
    vectors_list, ids_list = get_vectors_list(dataset_filepath)

    timestr = time.strftime("%Y%m%d_%H%M%S")
    # The context manager closes the log even if a strategy raises mid-run.
    with codecs.open("task_" + str(timestr) + ".txt", "w", "utf-8") as text_file:
        _compare_strategies(text_file)

    input("Press any key to save the plot...")
    plt.savefig('task_' + str(timestr) + '.png')

    print("Done")
示例#8
0
def getQueryStrategy(query_strategy,
                     train_ds,
                     disagreement,
                     estimator_name=None):
    """Return the query strategy named by ``query_strategy``.

    Recognised names:

    * ``'uncertainty'`` / ``'random'`` -- baseline strategies, no committee.
    * ``'lr_lsvc_rf_dt'``, ``'lr_svc_rf_dt'``, ``'lr_svc_dt_xgb'``,
      ``'lr_svc_dt_xgb_rf'``, ``'lr_lsvc_dt_gpc'``, ``'lr_lsvc_dt_xgb'`` --
      ``QueryByCommittee`` with the committee spelled out in the name.
    * ``'homogeneous_committee'`` -- ``QueryByCommittee`` over the models of
      ``CommitteeModels(estimator_name)``.

    Args:
        query_strategy: Name of the strategy, see above.
        train_ds: Dataset handed to the strategy constructor.
        disagreement: Forwarded to ``QueryByCommittee`` for the named
            heterogeneous committees.
        estimator_name: Only read for ``'homogeneous_committee'``.

    Returns:
        The constructed strategy, or ``None`` (after printing a message) when
        the name is not recognised.

    Raises:
        ValueError: If ``disagreement`` is ``'kl_divergence'`` and the
            committee contains ``LinearSVC_``.
    """
    print('Initialize Query Strategy')

    def logreg():
        return la.LogisticRegression_(solver='liblinear', max_iter=1000)

    def prob_svc():
        # SVC with probability=True is used where LinearSVC cannot be.
        return la.SVC_(kernel='linear', probability=True)

    def xgb():
        return la.XGBClassifier_(objective="binary:logistic")

    # no committee but baseline query strategy
    if query_strategy == 'uncertainty':
        return UncertaintySampling(train_ds,
                                   method='lc',
                                   model=la.LogisticRegression_())
    # no committee but baseline query strategy
    if query_strategy == 'random':
        return RandomSampling(train_ds)

    if query_strategy == 'homogeneous_committee':
        committee = CommitteeModels(estimator_name)
        # NOTE(review): ``disagreement`` is not forwarded on this path, so
        # QueryByCommittee falls back to whatever its default is -- confirm
        # this is intended.
        return QueryByCommittee(train_ds, models=committee.committee['models'])

    # Builders are lazy so no model is constructed before validation passes.
    committees = {
        'lr_lsvc_rf_dt': lambda: [la.RandomForest_(), la.DecisionTree_(),
                                  logreg(), la.LinearSVC_()],
        # committee with probabilistic models (SVC instead of LinearSVC)
        'lr_svc_rf_dt': lambda: [la.RandomForest_(), la.DecisionTree_(),
                                 logreg(), prob_svc()],
        'lr_svc_dt_xgb': lambda: [logreg(), prob_svc(),
                                  la.DecisionTree_(), xgb()],
        # committee of five
        'lr_svc_dt_xgb_rf': lambda: [logreg(), prob_svc(), la.DecisionTree_(),
                                     xgb(), la.RandomForest_()],
        'lr_lsvc_dt_gpc': lambda: [logreg(), la.LinearSVC_(),
                                   la.DecisionTree_(), la.GaussianProcess_()],
        'lr_lsvc_dt_xgb': lambda: [logreg(), la.LinearSVC_(),
                                   la.DecisionTree_(), xgb()],
    }
    uses_linear_svc = ('lr_lsvc_rf_dt', 'lr_lsvc_dt_gpc', 'lr_lsvc_dt_xgb')

    build_models = committees.get(query_strategy)
    if build_models is None:
        print("Query strategy not defined!")
        return None

    if query_strategy in uses_linear_svc and disagreement == 'kl_divergence':
        # Adjacent literals instead of a backslash continuation, which used
        # to embed the source indentation in the middle of the message.
        raise ValueError(
            'when using kl_divergence lsvc cannot be in the committee as '
            'linearSVC does not provide predict_proba(). '
            'Use svc instead or change disagreement to vote!')

    return QueryByCommittee(train_ds,
                            models=build_models(),
                            disagreement=disagreement)