Пример #1
0
 def build_query_strategy(sent_df, col_names):
     # type: (DataFrame, ColumnNames) -> QueryStrategy
     """
     Assemble an ActiveLearningByLearning strategy over ``sent_df``.

     A feature extractor is built from ``sent_df``/``col_names``, its
     transformed output is attached to a TextDataset, and that dataset is
     shared by the three candidate strategies (UncertaintySampling, QUIRE,
     HintSVM) that ActiveLearningByLearning chooses between.
     """
     extractor = SynStateALHeuristic.build_feature_extractor(sent_df,
                                                             col_names)
     features = extractor.transform(sent_df, col_names)
     dataset = TextDataset(sent_df, col_names, None, features=features)

     # The uncertainty sampler and the ALBL wrapper each get their own SVM
     # instance, configured identically.
     svm_params = dict(C=100,
                       gamma=3.1,
                       kernel='rbf',
                       decision_function_shape='ovr')

     candidates = [
         UncertaintySampling(dataset, model=SVM(**svm_params)),
         QUIRE(dataset),
         HintSVM(dataset, cl=1.0, ch=1.0),
     ]
     return ActiveLearningByLearning(dataset,
                                     query_strategies=candidates,
                                     T=1000,
                                     uniform_sampler=True,
                                     model=SVM(**svm_params))
Пример #2
0
def train_for_user(user_id=None, device_type=None, n_class=None):
    """Compare three query strategies on one user's data.

    Runs 20 experiments; in each, UncertaintySampling, RandomSampling and
    ALCE are evaluated with ``run`` on their own copy of the training set.
    The per-experiment error curves are saved under ``results/``, the
    averaged curves are printed (every 5th point) and plotted.

    Args:
        user_id: forwarded to ``get_per_user_data`` and used in file names
            and in the plot title.
        device_type: forwarded as ``device``; concatenated into file names.
        n_class: forwarded to the data-processing and split helpers; used
            in file names.
    """
    user_data = waterloo_iv_processing.get_per_user_data(
        user_id=user_id,
        device=device_type,
        video_name=['sports', 'document', 'nature', 'game', 'movie'])
    X, y = processing_training_data(n_class=n_class, train_data=user_data)
    test_size = 0.2  # value handed to split_train_test as test_size
    quota = 350  # number of samples to query
    n_experiments = 20

    result = {'E1': [], 'E2': [], 'E3': []}
    for exp_idx in range(n_experiments):
        print('exp:', exp_idx)
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = split_train_test(
            X=X, y=y, test_size=test_size, n_class=n_class)
        # Each strategy works on its own copy of the training set.
        random_ds = copy.deepcopy(trn_ds)
        alce_ds = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        uncertainty_qs = UncertaintySampling(
            trn_ds, method='sm', model=SVM(decision_function_shape='ovr'))
        _, uncertainty_err = run(trn_ds, tst_ds, lbr, model, uncertainty_qs,
                                 quota, cost_matrix)
        result['E1'].append(uncertainty_err)

        random_qs = RandomSampling(random_ds)
        _, random_err = run(random_ds, tst_ds, lbr, model, random_qs, quota,
                            cost_matrix)
        result['E2'].append(random_err)

        alce_qs = ALCE(alce_ds, cost_matrix, SVR())
        _, alce_err = run(alce_ds, tst_ds, lbr, model, alce_qs, quota,
                          cost_matrix)
        result['E3'].append(alce_err)

    # Average the error curves over all experiments.
    mean_uncertainty = np.mean(result['E1'], axis=0)
    mean_random = np.mean(result['E2'], axis=0)
    mean_alce = np.mean(result['E3'], axis=0)

    # File names look like results/<device>_user_<id>_E<k>_class_<n>.txt
    name_head = 'results/' + device_type + '_user_' + str(user_id)
    name_tail = '_class_' + str(n_class) + '.txt'
    save_file(name_head + '_E1' + name_tail, result['E1'])
    save_file(name_head + '_E2' + name_tail, result['E2'])
    save_file(name_head + '_E3' + name_tail, result['E3'])

    print("Uncertainty: ", mean_uncertainty[::5].tolist())
    print("Random: ", mean_random[::5].tolist())
    print("ALCE: ", mean_alce[::5].tolist())

    query_num = np.arange(0, quota + 1)
    (uncert_line,) = plt.plot(query_num, mean_uncertainty, 'g',
                              label='Uncertainty sampling')
    (random_line,) = plt.plot(query_num, mean_random, 'k', label='Random')
    (alce_line,) = plt.plot(query_num, mean_alce, 'r', label='ALCE')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result (user ' + str(user_id) + ')')
    plt.legend(handles=[uncert_line, random_line, alce_line], loc=3)
    plt.show()
Пример #3
0
def main():
    """Average the error curves of three query strategies and plot them.

    Two repetitions are run; each evaluates UncertaintySampling,
    RandomSampling and ALCE with ``run`` on separate copies of the same
    training split, sharing one labeler and one evaluation model.
    """
    test_size = 0.25  # value handed to split_train_test
    quota = 100  # number of samples to query
    n_repeats = 2

    result = {'E1': [], 'E2': [], 'E3': []}
    for _rep in range(n_repeats):
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = \
            split_train_test(test_size)
        # Separate copies so the strategies do not see each other's labels.
        random_ds = copy.deepcopy(trn_ds)
        alce_ds = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        uncertainty_qs = UncertaintySampling(
            trn_ds, method='sm', model=SVM(decision_function_shape='ovr'))
        _, uncertainty_err = run(trn_ds, tst_ds, lbr, model, uncertainty_qs,
                                 quota, cost_matrix)
        result['E1'].append(uncertainty_err)

        random_qs = RandomSampling(random_ds)
        _, random_err = run(random_ds, tst_ds, lbr, model, random_qs, quota,
                            cost_matrix)
        result['E2'].append(random_err)

        alce_qs = ALCE(alce_ds, cost_matrix, SVR())
        _, alce_err = run(alce_ds, tst_ds, lbr, model, alce_qs, quota,
                          cost_matrix)
        result['E3'].append(alce_err)

    # (result key, line format, legend label), in plotting order.
    curves = (
        ('E1', 'g', 'Uncertainty sampling'),
        ('E2', 'k', 'Random'),
        ('E3', 'r', 'ALCE'),
    )
    mean_errors = [np.mean(result[key], axis=0) for key, _, _ in curves]

    query_num = np.arange(0, quota + 1)
    plt.figure(figsize=(10, 8))
    for (_, line_format, legend_label), errors in zip(curves, mean_errors):
        plt.plot(query_num, errors, line_format, label=legend_label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               ncol=5)
    plt.show()
Пример #4
0
    def test_ALBLTestCase(self):
        """ActiveLearningByLearning must reproduce the pinned query order.

        Only the first 10 labels are revealed; the rest are set to None.
        """
        n_labeled = 10
        hidden = [None] * (len(self.y) - n_labeled)
        dataset = Dataset(self.X,
                          np.concatenate([self.y[:n_labeled], hidden]))

        svm_params = {"kernel": "linear", "decision_function_shape": "ovr"}
        candidates = [
            UncertaintySampling(dataset, model=SVM(**svm_params)),
            QUIRE(dataset),
            RandomSampling(dataset),
        ]
        albl = ActiveLearningByLearning(dataset,
                                        T=self.quota,
                                        query_strategies=candidates,
                                        model=SVM(**svm_params),
                                        random_state=1126)

        queried = run_qs(dataset, albl, self.y, self.quota)
        expected = np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33])
        assert_array_equal(queried, expected)
Пример #5
0
 def test_hs_subsampling(self):
     """HS with an UncertaintySampling sub-sampler: check pinned queries.

     The first 10 labels are revealed and every remaining entry is
     queried; only the first and last ten queried ids are compared.
     """
     n_labeled = 10
     n_hidden = len(self.y) - n_labeled
     dataset = Dataset(self.X, self.y[:n_labeled] + [None] * n_hidden)
     uncertainty = UncertaintySampling(
         dataset, model=SVM(gamma='auto', decision_function_shape='ovr'))
     hs = HS(dataset, self.classes, subsample_qs=uncertainty,
             random_state=1126)
     queried = run_qs(dataset, hs, self.y, n_hidden)
     head_and_tail = np.concatenate([queried[:10], queried[-10:]])
     expected = np.array([120, 50, 33, 28, 78, 133, 52, 124, 102, 109,
                          81, 108, 10, 89, 126, 114, 92, 48, 25, 13])
     assert_array_equal(head_and_tail, expected)
Пример #6
0
 def test_multilabel_with_auxiliary_learner_mmr(self):
     """MultilabelWithAuxiliaryLearner ('mmr'): check the pinned queries.

     Only the first 5 labels are revealed; the rest are set to None.
     """
     n_labeled = 5
     labels = self.y[:n_labeled] + [None] * (len(self.y) - n_labeled)
     dataset = Dataset(self.X, labels)
     major = BinaryRelevance(
         LogisticRegression(solver='liblinear', multi_class="ovr"))
     auxiliary = BinaryRelevance(SVM(gamma="auto"))
     strategy = MultilabelWithAuxiliaryLearner(dataset,
                                               major_learner=major,
                                               auxiliary_learner=auxiliary,
                                               criterion='mmr',
                                               random_state=1126)
     queried = run_qs(dataset, strategy, self.y, self.quota)
     expected = np.array(
         [1258, 1461, 231, 1198, 1498, 1374, 955, 1367, 265, 144])
     assert_array_equal(queried, expected)
 def test_multilabel_with_auxiliary_learner_hlr(self):
     """MultilabelWithAuxiliaryLearner ('hlr'): check the pinned queries.

     Both learners use their default constructor arguments; only the
     first 5 labels are revealed.
     """
     n_labeled = 5
     labels = self.y[:n_labeled] + [None] * (len(self.y) - n_labeled)
     dataset = Dataset(self.X, labels)
     major = BinaryRelevance(LogisticRegression())
     auxiliary = BinaryRelevance(SVM())
     strategy = MultilabelWithAuxiliaryLearner(dataset,
                                               major_learner=major,
                                               auxiliary_learner=auxiliary,
                                               criterion='hlr',
                                               random_state=1126)
     queried = run_qs(dataset, strategy, self.y, self.quota)
     expected = np.array(
         [701, 1403, 147, 897, 974, 1266, 870, 703, 292, 1146])
     assert_array_equal(queried, expected)
Пример #8
0
    def test_svm(self):
        """SVM(gamma="auto") must match SVC(gamma="auto") on both splits.

        Predictions are compared first (train, then test), followed by the
        scores in the same order.
        """
        reference = SVC(gamma="auto")
        reference.fit(self.X_train, self.y_train)
        wrapped = SVM(gamma="auto")
        wrapped.train(Dataset(self.X_train, self.y_train))

        splits = (
            (self.X_train, self.y_train),
            (self.X_test, self.y_test),
        )
        for features, _ in splits:
            assert_array_equal(reference.predict(features),
                               wrapped.predict(features))
        for features, labels in splits:
            self.assertEqual(reference.score(features, labels),
                             wrapped.score(Dataset(features, labels)))
Пример #9
0
 def test_multilabel_with_auxiliary_learner_shlr(self):
     """MultilabelWithAuxiliaryLearner ('shlr', b=1.): check pinned queries.

     Only the first 5 labels are revealed; the rest are set to None.
     """
     n_labeled = 5
     labels = self.y[:n_labeled] + [None] * (len(self.y) - n_labeled)
     dataset = Dataset(self.X, labels)
     major = BinaryRelevance(
         LogisticRegression(solver='liblinear', multi_class="ovr"))
     auxiliary = BinaryRelevance(SVM(gamma="auto"))
     strategy = MultilabelWithAuxiliaryLearner(dataset,
                                               major_learner=major,
                                               auxiliary_learner=auxiliary,
                                               criterion='shlr',
                                               b=1.,
                                               random_state=1126)
     queried = run_qs(dataset, strategy, self.y, self.quota)
     expected = np.array(
         [1258, 805, 459, 550, 783, 964, 736, 1004, 38, 750])
     assert_array_equal(queried, expected)
Пример #10
0
    def test_uncertainty_entropy_exceptions(self):
        """UncertaintySampling must raise TypeError for three bad setups.

        The cases are: 'entropy' with SVM, 'entropy' with Perceptron, and
        the unknown method 'not_exist' with LogisticRegression.
        """
        dataset = init_toyexample(self.X, self.y)

        # (method, model class); the model is instantiated inside the
        # assertRaises block, so a TypeError from its constructor counts too.
        rejected = (
            ('entropy', SVM),
            ('entropy', Perceptron),
            ('not_exist', LogisticRegression),
        )
        for method, model_cls in rejected:
            with self.assertRaises(TypeError):
                UncertaintySampling(dataset, method=method, model=model_cls())
Пример #11
0
    def test_svm(self):
        """Default SVM must match default SVC in predictions and scores."""
        sk_model = SVC()
        sk_model.fit(self.X_train, self.y_train)

        train_set = Dataset(self.X_train, self.y_train)
        own_model = SVM()
        own_model.train(train_set)

        # Predictions: training split first, then the test split.
        for samples in (self.X_train, self.X_test):
            expected = sk_model.predict(samples)
            actual = own_model.predict(samples)
            assert_array_equal(expected, actual)

        # Scores, in the same order.
        expected_train = sk_model.score(self.X_train, self.y_train)
        actual_train = own_model.score(Dataset(self.X_train, self.y_train))
        self.assertEqual(expected_train, actual_train)

        expected_test = sk_model.score(self.X_test, self.y_test)
        actual_test = own_model.score(Dataset(self.X_test, self.y_test))
        self.assertEqual(expected_test, actual_test)
def run_active_learning():
    """Run AL_ROUNDS rounds of batch labeling for every entry of STRATEGIES.

    Python 2 code (print statements, xrange).

    Round 0 trains ``drp_model`` on each strategy's dataset and records its
    metrics. Each later round then, per strategy, selects a batch of
    unlabeled instances, labels it from ``dm.y_train``, retrains
    ``drp_model`` on that strategy's dataset and records metrics again:

    * EAL -- explains the least-certain query with LIME, selects a batch
      from the explanation region and asks ``ask_expert()`` whether to
      accept it, repeating until BATCH_SIZE instances are collected.
    * AL  -- selects the batch with "k-means-uncertain" over all unlabeled
      instances.
    * PL  -- samples the batch uniformly at random.

    A summary is printed via ``print_mcc_summary`` at the end.
    """
    logger = SimpleLogger(LOG_FILE)
    dm = DataManager()
    im = InterpretableDataManager()
    # One predictor shared by all strategies; it is retrained on the
    # current strategy's dataset at the end of every strategy iteration.
    drp_model = SVM(kernel=KERNEL, probability=True)
    # Surrogate classifier fitted on discretized certainties (EAL only).
    lime_model = svm.SVC(kernel=KERNEL, probability=True)
    # Metric containers handed to update_accs_mccs.
    # presumably one list per strategy -- TODO confirm against the helper
    accs = [[], [], []]
    mccs = [[], [], []]

    labeled_indices = dm.get_labeled_indices()
    logger.log(0, labeled_indices)

    # Round 0: baseline metrics before any query is made.
    for strategy in STRATEGIES:
        trn_ds = dm.trn_ds_list[strategy]
        drp_model.train(trn_ds)
        update_accs_mccs(accs, mccs, dm, drp_model.model.predict, strategy)

    print_last_round_mcc(0, accs, mccs)
    # NOTE(review): checks rounds, not rounds * BATCH_SIZE, against the
    # number of instances beyond the initial ones -- confirm this is enough.
    assert (AL_ROUNDS <= len(dm.y_train) - INITIAL_INSTANCES)

    # NOTE: `round` shadows the builtin of the same name inside this loop.
    for round in xrange(1, AL_ROUNDS + 1):
        print "================================================="
        print "Round", round
        print "================================================="
        for strategy in STRATEGIES:
            trn_ds = dm.trn_ds_list[strategy]
            exclusion = set()  # indices no longer eligible as queries (EAL)
            batch = set()  # indices accepted for labeling this round (EAL)

            unlabeled_indices, unlabeled_X_scaled = zip(
                *trn_ds.get_unlabeled_entries())
            # NOTE(review): drp_model was last trained at the end of the
            # previous strategy iteration, i.e. on the previous strategy's
            # dataset, not on this one -- confirm this is intended.
            certainties = get_certainties(drp_model.model, dm.X_train_scaled)
            if strategy == EAL:
                # Turn the certainties into the class labels on which the
                # surrogate model is fitted.
                threshold = get_certainty_threshold(drp_model.model,
                                                    dm.X_train_scaled,
                                                    THRESHOLD)
                y_certainty = discretize_certainties(certainties, threshold)

                lime_model.fit(dm.X_train_scaled_e, y_certainty)
                if SHOW_LIME:
                    certainties_test = get_certainties(drp_model.model,
                                                       dm.X_test_scaled)
                    y_certainty_test = discretize_certainties(
                        certainties_test, threshold)
                    print_lime_model_performance(lime_model, dm,
                                                 y_certainty_test)

                # Keep proposing batches until enough instances are accepted.
                # NOTE(review): the loop only ends once `batch` reaches
                # BATCH_SIZE; if the expert keeps declining, it relies on
                # query_least_confident coping with a growing exclusion set.
                while (len(batch) < BATCH_SIZE):
                    query_id = query_least_confident(unlabeled_indices,
                                                     certainties, exclusion)
                    query = dm.X_train_scaled[query_id]
                    query_unscaled = dm.X_train_e[query_id]
                    instance_certainty = get_certainty(drp_model.model, query)
                    print "Explaining Query with id #{:d}".format(query_id)
                    print "Certainty {:.3f}".format(instance_certainty)

                    # NOTE(review): the explainer's arguments do not change
                    # inside this while loop, yet it is rebuilt per query.
                    explainer = LimeTabularExplainer(
                        dm.X_train_e,
                        training_labels=y_certainty,
                        feature_names=dm.feature_names_e,
                        class_names=["uncertain", "certain"],
                        discretize_continuous=True,
                        discretizer="entropy")

                    # The explainer receives unscaled rows; scale them
                    # before asking the surrogate for probabilities.
                    predict_fn = lambda x: lime_model.predict_proba(
                        dm.scaler_e.transform(x)).astype(float)

                    # Widen the explanation two features at a time until it
                    # holds at least NUM_FEATURES - 2 "uncertain" conditions.
                    # `exp` and `uncertain_exp_list` from the last iteration
                    # are used below; if MAX_EXP_FEATURE <= 0 the body never
                    # runs and both names are undefined.
                    for i in xrange(0, MAX_EXP_FEATURE, 2):
                        exp = explainer.explain_instance(
                            query_unscaled,
                            predict_fn,
                            num_features=NUM_FEATURES + i)
                        uncertain_exp_list = get_uncertain_exps(exp)
                        if (len(uncertain_exp_list) >= NUM_FEATURES - 2):
                            break
                        print "INFO: looping"

                    if SHOW_LIME:
                        print_lime_model_prediction(predict_fn, query_unscaled)

                    # Candidate instances derived from the explanation.
                    exp_indices = get_indices_exp_region(
                        exp, dm, unlabeled_indices, y_certainty)
                    exp_instances = get_values_of_indices(
                        exp_indices, dm.X_train_scaled)
                    exp_certainties = get_values_of_indices(
                        exp_indices, certainties)
                    # len(batch) >= 0, so the min() always evaluates to
                    # BATCH_SIZE - len(batch): the slots still to fill.
                    batch_indices = select_batch(
                        min(BATCH_SIZE, BATCH_SIZE - len(batch)), exp_indices,
                        exp_instances, exp_certainties, "k-means-uncertain")

                    if len(batch_indices) == 0:
                        # Nothing selectable for this query: exclude it and
                        # try the next least-confident one.
                        exclusion.add(query_id)
                        continue

                    print ""
                    print_explanation_drp(uncertain_exp_list, False)
                    print ""
                    print "Instances in the batch: {}".format(
                        len(batch_indices))
                    im.describe_instances(batch_indices)
                    print ""
                    im.describe_instance(query_id)
                    print ""

                    # The whole region is excluded whether or not the expert
                    # accepts the proposed batch.
                    exclusion.update(set(exp_indices))
                    if ask_expert():
                        batch.update(set(batch_indices))
                    else:
                        print "INFO: Not including in the batch"

                logger.log(round, batch)
                print "INFO: Labeling the batch"
                label_batch(trn_ds, dm.y_train, batch)

            elif strategy == AL:  # AL + k-means-uncertain
                # Rebinds unlabeled_X_scaled; the value unpacked from
                # get_unlabeled_entries() above is not used elsewhere.
                unlabeled_X_scaled = get_values_of_indices(
                    unlabeled_indices, dm.X_train_scaled)
                unlabeled_certainties = get_values_of_indices(
                    unlabeled_indices, certainties)
                batch_indices = select_batch(BATCH_SIZE, unlabeled_indices,
                                             unlabeled_X_scaled,
                                             unlabeled_certainties,
                                             "k-means-uncertain")
                label_batch(trn_ds, dm.y_train, batch_indices)

            elif strategy == PL:  # Passive Learning
                # unlabeled_indices is the tuple produced by zip() above.
                batch_indices = random.sample(unlabeled_indices, BATCH_SIZE)
                label_batch(trn_ds, dm.y_train, batch_indices)

            # Retrain on this strategy's updated dataset, record metrics.
            drp_model.train(trn_ds)
            update_accs_mccs(accs, mccs, dm, drp_model.model.predict, strategy)

    print_mcc_summary(mccs)
Пример #13
0
def main():
    """Compare six multilabel query strategies and plot their mean curves.

    The experiment is repeated 10 times. In each repetition every strategy
    gets its own copy of the training split and is evaluated with ``run``
    against a shared labeler and a shared BinaryRelevance model. The curves
    are averaged over the repetitions, printed (every 5th point) and
    plotted.
    """
    test_size = 0.25  # value handed to split_train_test
    quota = 150  # number of samples to query
    n_repeats = 10

    def auxiliary_builder(criterion):
        # Factory for MultilabelWithAuxiliaryLearner with one criterion.
        def build(dataset):
            return MultilabelWithAuxiliaryLearner(
                dataset,
                BinaryRelevance(LogisticRegression()),
                BinaryRelevance(SVM()),
                criterion=criterion)
        return build

    # (result key, printed name, line format, legend label, factory),
    # listed in evaluation and plotting order.
    strategies = (
        ('E1', "MMC: ", 'g', 'MMC',
         lambda dataset: MMC(dataset, br_base=LogisticRegression())),
        ('E2', "Random: ", 'k', 'Random',
         lambda dataset: RandomSampling(dataset)),
        ('E3', "MultilabelWithAuxiliaryLearner_hlr: ", 'r',
         'AuxiliaryLearner_hlr', auxiliary_builder('hlr')),
        ('E4', "MultilabelWithAuxiliaryLearner_shlr: ", 'b',
         'AuxiliaryLearner_shlr', auxiliary_builder('shlr')),
        ('E5', "MultilabelWithAuxiliaryLearner_mmr: ", 'c',
         'AuxiliaryLearner_mmr', auxiliary_builder('mmr')),
        ('E6', "BinaryMinimization: ", 'm', 'BinaryMinimization',
         lambda dataset: BinaryMinimization(dataset, LogisticRegression())),
    )

    result = {'E1': [], 'E2': [], 'E3': [], 'E4': [], 'E5': [], 'E6': []}
    for _rep in range(n_repeats):  # repeat experiment
        trn_ds, tst_ds, fully_labeled_trn_ds = split_train_test(test_size)
        # The first strategy uses the split itself, the others a deep copy
        # taken before any strategy is attached.
        datasets = [trn_ds]
        datasets.extend(
            copy.deepcopy(trn_ds) for _copy in range(len(strategies) - 1))
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = BinaryRelevance(LogisticRegression())

        for (key, _, _, _, build), dataset in zip(strategies, datasets):
            qs = build(dataset)
            _, e_out = run(dataset, tst_ds, lbr, model, qs, quota)
            result[key].append(e_out)

    mean_curves = [np.mean(result[spec[0]], axis=0) for spec in strategies]

    for (_, printed_name, _, _, _), curve in zip(strategies, mean_curves):
        print(printed_name, curve[::5].tolist())

    query_num = np.arange(1, quota + 1)
    fig = plt.figure(figsize=(9, 6))
    ax = plt.subplot(111)
    for (_, _, line_format, legend_label, _), curve in zip(strategies,
                                                           mean_curves):
        ax.plot(query_num, curve, line_format, label=legend_label)

    # Narrow the axes so the legend fits to the right of the plot.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.75, box.height])
    plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)
    plt.xlabel('Number of Queries')
    plt.ylabel('Loss')
    plt.title('Experiment Result (Hamming Loss)')
    plt.show()
Пример #14
0
MAIN FUNCTION
'''

# Containers for the metrics of each iteration.
result = {'Hamming': [], 'F1': []}

# Model trained on trn_ds after the queries below.
model = BinaryRelevance(LogisticRegression())

quota = 20  # number of samples to query


# EXECUTE FROM HERE FOR ITERATIONS

major_learner = BinaryRelevance(LogisticRegression())
auxiliary_learner = BinaryRelevance(SVM())
qs1 = MultilabelWithAuxiliaryLearner(trn_ds,
                                     major_learner,
                                     auxiliary_learner,
                                     criterion='hlr')

run(data_CV_train, trn_ds, qs1, quota)

model.train(trn_ds)

# Split the labeled test entries into features and labels.
X, y = zip(*tst_ds.get_labeled_entries())

pred = model.predict(X)

# One output column per position of each prediction row.
output = pd.DataFrame()
output['UE_pred'] = [row[0] for row in pred]
output['BR_pred'] = [row[1] for row in pred]
output['FR_pred'] = [row[2] for row in pred]
Пример #15
0
def main():
    """Compare six query strategies on the tweet corpus with a live plot.

    Every strategy gets its own copy of the training set. For each of the
    ``quota`` queries, every strategy picks an instance, the simulated
    human answer is added to that strategy's dataset, the shared linear
    SVM is retrained on it and its test error (``1 - model.score``) is
    appended to the strategy's curve. Progress goes to stdout and to a
    timestamped ``task_<time>.txt`` log; the final plot is saved as
    ``task_<time>.png`` after the user confirms.

    Side effects: sets the module globals ``pos_filepath``,
    ``dataset_filepath``, ``csv_filepath``, ``vectors_list`` and
    ``ids_list``; writes the log and the image into the working directory.
    """
    global pos_filepath, dataset_filepath, csv_filepath, vectors_list, ids_list
    dataset_filepath = "/Users/dndesign/Desktop/active_learning/vecteurs_et_infos/vectors_2015.txt"
    csv_filepath = "/Users/dndesign/Desktop/active_learning/donnees/corpus_2015_id-time-text.csv"
    pos_filepath = "/Users/dndesign/Desktop/active_learning/donnees/oriane_pos_id-time-text.csv"
    vectors_list, ids_list = get_vectors_list(dataset_filepath)

    timestr = time.strftime("%Y%m%d_%H%M%S")

    # Number of queries to ask human to label
    quota = 10

    def committee():
        # Fresh three-member committee for each QueryByCommittee strategy.
        return [LogisticRegression(C=1.0),
                LogisticRegression(C=0.01),
                LogisticRegression(C=100)]

    # (heading used on the console and in the log, line colour, factory),
    # in evaluation and plotting order:
    #  - least confident / max margin: uncertainty-based sampling;
    #  - CMB: combination of distance-based and diversity-based selection;
    #  - random: picks a query at random;
    #  - query by committee with vote-entropy or KL-divergence disagreement.
    strategy_specs = [
        ("Uncertainty Sampling (Least confident)", 'red',
         lambda ds: UncertaintySampling(
             ds, method='lc', model=LogisticRegression(C=.01))),
        ("Uncertainty Sampling (Max Margin)", 'blue',
         lambda ds: USampling(ds, method='mm', model=SVM(kernel='linear'))),
        ("CMB Distance-Diversity Sampling", 'green',
         lambda ds: CMBSampling(ds, model=SVM(kernel='linear'))),
        ("Random Sampling", 'orange',
         lambda ds: RandomSampling(ds, random_state=1126)),
        ("QueryByCommittee (Vote Entropy)", 'black',
         lambda ds: QueryByCommittee(ds, disagreement='vote',
                                     models=committee(),
                                     random_state=1126)),
        ("QueryByCommittee (KL Divergence)", 'purple',
         lambda ds: QueryByCommittee(ds, disagreement='kl_divergence',
                                     models=committee(),
                                     random_state=1126)),
    ]
    legend_labels = ('Least Confident', 'Max Margin',
                     'Distance Diversity CMB', 'Random Sampling',
                     'Vote Entropy', 'KL Divergence')

    # The context manager closes the log even when a step below raises.
    with codecs.open("task_" + str(timestr) + ".txt", "w", "utf-8") as text_file:
        print("Loading data...")
        text_file.write("Loading data...\n")
        t0 = time.time()
        # NOTE(review): whatever openfile_txt returns is only iterated to
        # count entries and is never closed here -- confirm it needs no
        # cleanup.
        entries = openfile_txt(dataset_filepath)
        num_lines = sum(1 for _ in entries)
        print("Treating " + str(num_lines) + " entries...")
        text_file.write("Treating : %s entries...\n" % str(num_lines))

        trn_ds, tst_ds = split_train_test(csv_filepath)

        # Single evaluation model, retrained on whichever dataset was
        # updated last.
        model = SVM(kernel='linear')

        # One record per strategy. The first strategy works on trn_ds
        # itself; each later one gets a deep copy taken at this point, in
        # this order, exactly as the datasets were created before.
        tracks = []
        for position, (title, color, build) in enumerate(strategy_specs):
            ds = trn_ds if position == 0 else copy.deepcopy(trn_ds)
            qs = build(ds)
            model.train(ds)
            tracks.append({
                'title': title,
                'color': color,
                'ds': ds,
                'qs': qs,
                'errors': np.append([], 1 - model.score(tst_ds)),
                'line': None,
            })

        with sns.axes_style("darkgrid"):
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)

        query_num = np.arange(0, 1)
        for track in tracks:
            track['line'], = ax.plot(query_num, track['errors'],
                                     track['color'])
        plt.legend(legend_labels, loc=1)
        # The curves hold 1 - model.score(...), i.e. the error.
        plt.ylabel('Error')
        plt.xlabel('Number of Queries')
        plt.title('Active Learning - Query choice strategies')
        plt.ylim([0, 1])
        plt.show(block=False)

        for i in range(quota):
            print("\n#################################################")
            print("Query number " + str(i) + " : ")
            print("#################################################\n")
            text_file.write("\n#################################################\n")
            text_file.write("Query number %s : " % str(i))
            text_file.write("\n#################################################\n")

            for track in tracks:
                ask_id = track['qs'].make_query()
                # Look the tweet and the simulated answer up once and
                # reuse them for console, log and dataset update.
                tweet = define_tweet_by_id(ask_id)
                answer = simulate_human_decision(ask_id)

                print("\033[4mUsing %s :\033[0m" % track['title'])
                print("Tweet :" + tweet, end='', flush=True)
                print("Simulating human response : " + str(answer) + " \n")
                text_file.write("Using %s :\n" % track['title'])
                text_file.write("Tweet : %s \n" % str(tweet))
                text_file.write("Simulating human response : %s \n\n" % str(answer))

                track['ds'].update(ask_id, answer)
                model.train(track['ds'])
                track['errors'] = np.append(track['errors'],
                                            1 - model.score(tst_ds))

            # Rescale the axes and push the extended curves to the plot.
            ax.set_xlim((0, i + 1))
            ax.set_ylim((0, max(max(track['errors']) for track in tracks) + 0.2))
            query_num = np.arange(0, i + 2)
            for track in tracks:
                track['line'].set_xdata(query_num)
                track['line'].set_ydata(track['errors'])

            plt.draw()

        t2 = time.time()
        time_total = t2 - t0
        print("\n\n\n#################################################\n")
        print("Execution time : %fs \n\n" % time_total)
        text_file.write("\n\n\n#################################################\n")
        text_file.write("Execution time : %fs \n" % time_total)

    input("Press any key to save the plot...")
    plt.savefig('task_' + str(timestr) + '.png')

    print("Done")
Пример #16
0
def main():
    """Benchmark five query strategies on one dataset and plot the outcome.

    The experiment is repeated 20 times. Every repetition obtains a fresh
    split from ``split_train_test``, gives each strategy its own copy of the
    training dataset and hands it to ``run`` together with a linear-kernel
    SVM. The second value returned by ``run`` is collected per strategy,
    averaged over the repetitions and plotted against the query number.
    """
    # Experiment parameters.
    ds_name = 'australian'  # binary classification dataset next to this file
    script_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(script_dir, '%s.txt' % ds_name)
    test_size = 0.33  # share of the samples randomly assigned to the test set
    n_labeled = 10  # number of samples that are labeled at the start
    n_strategies = 5
    results = []

    def linear_svm():
        # A fresh base learner for every use, so no state is shared.
        return SVM(kernel='linear', decision_function_shape='ovr')

    for T in range(20):  # 20 independent repetitions
        print("%dth experiment" % (T + 1))

        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test(
            dataset_filepath, test_size, n_labeled)
        # One independent copy of the initial training set per strategy,
        # taken before any of them has been updated.
        datasets = [trn_ds]
        datasets.extend(
            copy.deepcopy(trn_ds) for _ in range(n_strategies - 1))
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled  # number of samples to query

        # Strategy constructors, in the order their curves are plotted below.
        # Each one is only invoked right before its own run.
        builders = (
            lambda ds: UncertaintySampling(
                ds, model=SVM(decision_function_shape='ovr')),
            RandomSampling,
            QUIRE,
            lambda ds: HintSVM(ds, cl=1.0, ch=1.0),
            lambda ds: ActiveLearningByLearning(
                ds,
                query_strategies=[
                    UncertaintySampling(ds, model=linear_svm()),
                    QUIRE(ds),
                    HintSVM(ds, cl=1.0, ch=1.0),
                ],
                T=quota,
                uniform_sampler=True,
                model=linear_svm()),
        )

        for dataset, build in zip(datasets, builders):
            strategy = build(dataset)
            model = linear_svm()
            _, errors = run(dataset, tst_ds, lbr, model, strategy, quota)
            results.append(errors.tolist())

    # ``results`` is interleaved by strategy (five entries per repetition), so
    # a stride of five starting at k selects every curve of strategy k.
    result = [
        np.mean(results[k::n_strategies], axis=0)
        for k in range(n_strategies)
    ]

    # Learning curves: x is the number of queries, y the matching error.
    query_num = np.arange(1, quota + 1)
    curve_styles = (
        ('g', 'uncertainty sampling'),
        ('k', 'random'),
        ('r', 'QUIRE'),
        ('b', 'HintSVM'),
        ('c', 'ALBL'),
    )
    for curve, (color, label) in zip(result, curve_styles):
        plt.plot(query_num, curve, color, label=label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show()
Пример #17
0
def main():
    """Run the AL, EAL and PL selection strategies and print their scores.

    Each strategy value in ``STRATEGIES`` indexes its own copy of the
    training dataset and its own accuracy/MCC history. One shared SVM is
    retrained on the relevant dataset before every measurement. After an
    initial baseline measurement, ``ROUNDS`` rounds are executed; in each
    round every strategy labels exactly one pool sample:

    * ``AL``  - the single sample returned by ``select_batch`` in
      ``"q-best"`` mode.
    * ``EAL`` - ``CANDIDATES`` samples from ``select_batch`` in
      ``"k-means-uncertain"`` mode, narrowed to those whose ``quadrant`` is
      ``"Q2"`` or ``"Q4"`` when any exist, then one picked at random.
    * ``PL``  - a random sample from the unlabeled pool.

    Finally one comma-separated ACC line and one MCC line are printed per
    strategy.

    Raises:
        ValueError: if ``STRATEGIES`` contains a value other than ``AL``,
            ``EAL`` or ``PL``.
    """
    # Only the number of training labels is needed from the training file.
    _, y_train = load_data(DATA_TRAIN)
    X_test, y_test = load_data(DATA_TEST)
    X_all, y_all = load_data(DATA_ALL)

    trn_ds_eal = make_active_learning_dataset(len(y_train), X_all, y_all)
    trn_ds_al = copy.deepcopy(trn_ds_eal)
    trn_ds_pl = copy.deepcopy(trn_ds_eal)
    svm_model = SVM(kernel=KERNEL, probability=True)

    # All three containers are indexed by the strategy value itself.
    trn_datasets = [trn_ds_al, trn_ds_eal, trn_ds_pl]
    accs_list = [[], [], []]
    mccs_list = [[], [], []]

    def record_scores(strategy, trn_ds):
        # Retrain the shared model on this strategy's data, then score it.
        svm_model.train(trn_ds)
        acc, mcc = compute_acc_mcc(svm_model.model, X_test, y_test)
        accs_list[strategy].append(acc)
        mccs_list[strategy].append(mcc)

    # Baseline measurement before any query has been made.
    for strategy in STRATEGIES:
        record_scores(strategy, trn_datasets[strategy])

    for _ in range(ROUNDS):
        for strategy in STRATEGIES:
            trn_ds = trn_datasets[strategy]
            # The model is shared, so refit it on this strategy's data
            # before computing certainties for the pool.
            svm_model.train(trn_ds)
            pool_indices, X_pool = zip(*trn_ds.get_unlabeled_entries())
            pool_indices = list(pool_indices)
            certainties = get_certainties(svm_model.model, X_pool)

            if strategy == AL:
                query_indices = select_batch(1, pool_indices, X_pool,
                                             certainties, "q-best")
                query_index = query_indices[0]

            elif strategy == EAL:
                query_indices = select_batch(CANDIDATES, pool_indices, X_pool,
                                             certainties, "k-means-uncertain")
                query_indices_q2_q4 = []
                for q in query_indices:
                    x1, x2 = X_all[q]
                    if quadrant(x1, x2) in ["Q2", "Q4"]:
                        query_indices_q2_q4.append(q)

                # Prefer Q2/Q4 candidates, but fall back to the full
                # candidate list when none of them qualifies.
                if query_indices_q2_q4:
                    query_indices = query_indices_q2_q4

                query_index = query_indices[randint(0, len(query_indices) - 1)]

            elif strategy == PL:
                query_index = choice(pool_indices)

            else:
                # Without this guard an unknown strategy would silently
                # reuse the index chosen for the previous strategy.
                raise ValueError("unknown strategy: %r" % (strategy,))

            trn_ds.update(query_index, y_all[query_index])
            record_scores(strategy, trn_ds)

    for strategy in STRATEGIES:
        strategy_name = STRATEGIY_NAMES[strategy]
        # Real lists (not lazy ``map`` objects) so the stored values stay
        # reusable on Python 3 as well.
        accs_list[strategy] = [pretty_float(x) for x in accs_list[strategy]]
        mccs_list[strategy] = [pretty_float(x) for x in mccs_list[strategy]]
        # Single-argument print() calls behave the same on Python 2 and 3.
        print("{0}_ACC,".format(strategy_name) + ",".join(accs_list[strategy]))
        print("{0}_MCC,".format(strategy_name) + ",".join(mccs_list[strategy]))