Example No. 1
    def libact_first_try_second_run(self, enriched_train_df, extractor,
                                    ideal_df, lbr, quota, validation_data_df,
                                    return_dict):
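        # Seed the learning curve with the F1 of the initial labeled set, then
        # query `quota` times: pick the least-confident example, label it via
        # `lbr`, and re-score F1 on the validation set after each update.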

        trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
        qs = UncertaintySampling(trn_ds,
                                 method='lc',
                                 model=LogisticRegression())
        E_out1 = []
        E_out1 = np.append(
            E_out1,
            run_classifier(trn_ds.extract_labeled_dataframe(),
                           validation_data_df).f1)
        for i in range(quota):
            if len(trn_ds.get_unlabeled_entries()) == 0:
                break  # finished labeling all examples
            ask_id = qs.make_query()
            lb = lbr.label(trn_ds.extract_sentence(ask_id))
            self.assertEqual(lb, ideal_df[cn.tag_col][ask_id])
            trn_ds.update(ask_id, lb)
            # model.train(trn_ds)
            E_out1 = np.append(
                E_out1,
                run_classifier(trn_ds.extract_labeled_dataframe(),
                               validation_data_df).f1)
        return_dict[2] = E_out1
Example No. 2
    def make_query(self):

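        # Work entirely on deep copies so the real dataset and model stay
        # untouched while a whole batch of provisional labels is simulated.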
        tempDataset = copy.deepcopy(self.dataset)
        tempModel = copy.deepcopy(self.model)

        queryStrat = UncertaintySampling(tempDataset,
                                         model=tempModel)  # Model is fit here
        queryIds = []

        for j in range(self.batch_size_):
            queryId = queryStrat.make_query()  # Model is also fit here
            queryIds.append(queryId)

            features = tempDataset.get_entries()[queryId][0]

            probs = tempModel.predict_proba(features.reshape(1, -1))

            # sample a provisional label from the predicted probability of
            # class 0 (hard-coded for binary labels - could be generalized)
            if self.random_state_.rand() < probs[0][0]:
                label = 0
            else:
                label = 1

            tempDataset.update(queryId, label)

            # tempModel.train(tempDataset) #This is not needed,
            # since the make_query of UncertaintySampling fits

        return queryIds
Example No. 3
 def getUncertaintyIndex(self, trn_ds, method, clf):
     print "[Trainer-Selection] Get uncertainty sampling index."
     qs = UncertaintySampling(trn_ds, method=method, model=clf)
     _, score = qs.make_query(return_score=True)
     score_sorted = sorted(score, key=lambda x:x[1], reverse=True)
     result = []
     for index in score_sorted:
         result.append(self.unlabeled_index_[index[0]])
     return result
Example No. 4
def strategies_to_try(tp):
    if tp == 'uncertainty':
        return lambda trn_ds, libact_model: UncertaintySampling(trn_ds, model=libact_model, method='lc')
    elif tp == 'random':
        return lambda trn_ds, libact_model: RandomSamplingWithRetraining(trn_ds, model=libact_model, method='lc')
    elif tp == 'positivelesscertain':
        return lambda tr_ds, libact_model: UncertaintySampling(tr_ds, model=PositiveLessCertain(libact_model), method='lc')
    else:
        raise ValueError('Wrong strategy')
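
A minimal usage sketch (hypothetical names; assumes a libact Dataset `trn_ds` and a libact-adapted `libact_model` already exist):

factory = strategies_to_try('uncertainty')
qs = factory(trn_ds, libact_model)  # strategy is built lazily, once the dataset exists
ask_id = qs.make_query()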
Example No. 5
    def __init__(self, X, y, labs, n=2):

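        # libact marks unlabeled entries with None, so negative labels are
        # mapped to None before building the Dataset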
        y = [yy if yy >= 0 else None for yy in y]

        self.dataset = Dataset(X, y)
        self.labs = labs

        self.uc = UncertaintySampling(self.dataset,
                                      method='lc',
                                      model=LinearSVC())
        self.n = n
Example No. 6
def main():
    quota = 10  # ask human to label 10 samples
    n_classes = 5
    E_out1, E_out2 = [], []

    trn_ds, tst_ds, ds = split_train_test(n_classes)
    trn_ds2 = copy.deepcopy(trn_ds)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)
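    # qs queries by least confidence; qs2 is a random baseline on a deep copy,
    # so both strategies start from the same initial labels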

    model = LogisticRegression()

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    model.train(trn_ds)
    E_out1 = np.append(E_out1, 1 - model.score(tst_ds))
    model.train(trn_ds2)
    E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, E_out1, 'g', label='qs Eout')
    p2, = ax.plot(query_num, E_out2, 'k', label='random Eout')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True,
               shadow=True, ncol=5)
    plt.show(block=False)

    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position([box.x0, box.y0 - box.height * 0.1, box.width,
                         box.height * 0.9])
    # Give each label its name (labels are from 0 to n_classes-1)
    lbr = InteractiveLabeler(label_name=[str(lbl) for lbl in range(n_classes)])

    for i in range(quota):
        ask_id = qs.make_query()
        print("asking sample from Uncertainty Sampling")
        # reshape the image to its width and height
        lb = lbr.label(trn_ds.data[ask_id][0].reshape(8, 8))
        trn_ds.update(ask_id, lb)
        model.train(trn_ds)
        E_out1 = np.append(E_out1, 1 - model.score(tst_ds))

        ask_id = qs2.make_query()
        print("asking sample from Random Sample")
        lb = lbr.label(trn_ds2.data[ask_id][0].reshape(8, 8))
        trn_ds2.update(ask_id, lb)
        model.train(trn_ds2)
        E_out2 = np.append(E_out2, 1 - model.score(tst_ds))
Example No. 7
    def test_uncertainty_entropy_exceptions(self):
        trn_ds = init_toyexample(self.X, self.y)

        with self.assertRaises(TypeError):
            qs = UncertaintySampling(trn_ds, method='entropy', model=SVM())

        with self.assertRaises(TypeError):
            qs = UncertaintySampling(trn_ds,
                                     method='entropy',
                                     model=Perceptron())

        with self.assertRaises(TypeError):
            qs = UncertaintySampling(trn_ds,
                                     method='not_exist',
                                     model=LogisticRegression())
Example No. 8
 def build_query_strategy(sent_df, col_names):
     # type: (DataFrame, ColumnNames) -> QueryStrategy
     """
     Builds and returns a QueryStrategy
         using a feature extractor and a base_df
     """
     init_extractor = SynStateALHeuristic.build_feature_extractor(
         sent_df, col_names)
     combined_features = init_extractor.transform(sent_df, col_names)
     trn_ds = TextDataset(sent_df,
                          col_names,
                          None,
                          features=combined_features)
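     # ActiveLearningByLearning treats the strategies below as arms of a
     # multi-armed bandit and adaptively reweights them over T rounds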
     return ActiveLearningByLearning(
         trn_ds,
         query_strategies=[
             UncertaintySampling(trn_ds,
                                 model=SVM(C=100,
                                           gamma=3.1,
                                           kernel='rbf',
                                           decision_function_shape='ovr')),
             QUIRE(trn_ds),
             HintSVM(trn_ds, cl=1.0, ch=1.0),
         ],
         T=1000,
         uniform_sampler=True,
         model=SVM(C=100,
                   gamma=3.1,
                   kernel='rbf',
                   decision_function_shape='ovr'))
Example No. 9
def train_for_user(user_id=None, device_type=None, n_class=None):
    test_data = waterloo_iv_processing.get_per_user_data(
        user_id=user_id,
        device=device_type,
        video_name=['sports', 'document', 'nature', 'game', 'movie'])
    X, y = processing_training_data(n_class=n_class, train_data=test_data)
    test_size = 0.2  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    quota = 350  # number of samples to query

    result = {'E1': [], 'E2': [], 'E3': []}
    for i in range(20):
        print('exp:', i)
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = split_train_test(
            X=X, y=y, test_size=test_size, n_class=n_class)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')
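        # three strategies on deep-copied datasets: uncertainty sampling,
        # random sampling, and cost-sensitive ALCE with an SVR regressor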

        qs = UncertaintySampling(trn_ds,
                                 method='sm',
                                 model=SVM(decision_function_shape='ovr'))
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota, cost_matrix)
        result['E1'].append(E_out_1)

        qs2 = RandomSampling(trn_ds2)
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota, cost_matrix)
        result['E2'].append(E_out_2)

        qs3 = ALCE(trn_ds3, cost_matrix, SVR())
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota, cost_matrix)
        result['E3'].append(E_out_3)

    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)

    save_file(
        'results/' + device_type + '_user_' + str(user_id) + '_E1_class_' +
        str(n_class) + '.txt', result['E1'])
    save_file(
        'results/' + device_type + '_user_' + str(user_id) + '_E2_class_' +
        str(n_class) + '.txt', result['E2'])
    save_file(
        'results/' + device_type + '_user_' + str(user_id) + '_E3_class_' +
        str(n_class) + '.txt', result['E3'])

    print("Uncertainty: ", E_out_1[::5].tolist())
    print("Random: ", E_out_2[::5].tolist())
    print("ALCE: ", E_out_3[::5].tolist())

    query_num = np.arange(0, quota + 1)
    uncert, = plt.plot(query_num, E_out_1, 'g', label='Uncertainty sampling')
    rd, = plt.plot(query_num, E_out_2, 'k', label='Random')
    alce, = plt.plot(query_num, E_out_3, 'r', label='ALCE')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result (user ' + str(user_id) + ')')
    plt.legend(handles=[uncert, rd, alce], loc=3)
    plt.show()
Example No. 10
def score_per_add_al(labeled_pool_df, base_training_df, validation_data_df):
    # type: (DataFrame, DataFrame, DataFrame) -> tuple

    gen_pool_df = labeled_pool_df.copy(deep=True)
    gen_pool_df[cn.col_names.tag] = [np.NaN] * len(
        gen_pool_df)  # clear all tags
    enriched_train_df = pd.concat([base_training_df, gen_pool_df],
                                  ignore_index=True)

    extractor = cn.Feature_Extractor(
        enriched_train_df, cn.col_names)  # build the feature extractor

    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    scoring_fun = lambda ds: run_classifier(ds.extract_labeled_dataframe(),
                                            validation_data_df)
    ex_added_list, res_list = run_active_learning(
        trn_ds, scoring_fun, lbr, qs, len(enriched_train_df))  # label all df

    return ex_added_list, res_list
Example No. 11
 def test_uncertainty_entropy(self):
     trn_ds = init_toyexample(self.X, self.y)
     qs = UncertaintySampling(trn_ds,
                              method='entropy',
                              model=LogisticRegression())
     model = LogisticRegression()
     qseq = run_qs(trn_ds, self.lbr, model, qs, self.quota)
     assert_array_equal(qseq, np.array([6, 7, 8, 9]))
Example No. 12
 def test_uncertainty_sm(self):
     trn_ds = init_toyexample(self.X, self.y)
     qs = UncertaintySampling(trn_ds,
                              method='sm',
                              model=LogisticRegression(solver='liblinear',
                                                       multi_class="ovr"))
     model = LogisticRegression(solver='liblinear', multi_class="ovr")
     qseq = run_qs(trn_ds, self.lbr, model, qs, self.quota)
     assert_array_equal(qseq, np.array([6, 7, 8, 9]))
Example No. 13
 def test_UncertaintySamplingSm(self):
     random.seed(1126)
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = UncertaintySampling(trn_ds,
                              method='sm',
                              model=LogisticRegression())
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([145, 66, 82, 37, 194, 60, 191, 211, 245, 131]))
Example No. 14
def libact_uncertainty(X, y, n_queries):
    y_train = np.array([None for _ in range(len(y))])
    y_train[0], y_train[50], y_train[100] = 0, 1, 2
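    # seed exactly one labeled example per class (indices 0/50/100 suggest an
    # iris-style class layout)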
    libact_train_dataset = Dataset(X, y_train)
    libact_full_dataset = Dataset(X, y)
    libact_learner = LogisticRegressionLibact(
        solver='liblinear', n_jobs=1,
        multi_class='ovr')  #SVM(gamma='auto', probability=True)
    libact_qs = UncertaintySampling(libact_train_dataset,
                                    model=libact_learner,
                                    method='lc')
    libact_labeler = IdealLabeler(libact_full_dataset)
    libact_learner.train(libact_train_dataset)

    for _ in range(n_queries):
        query_idx = libact_qs.make_query()
        query_label = libact_labeler.label(X[query_idx])
        libact_train_dataset.update(query_idx, query_label)
        libact_learner.train(libact_train_dataset)
Example No. 15
 def build_query_strategy(sent_df, col_names):
     # type: (DataFrame, ColumnNames) -> QueryStrategy
     """
     Builds and returns a QueryStrategy
         using a feature extractor and a base_df
     """
     init_extractor = SynStateALHeuristic.build_feature_extractor(sent_df, col_names)
     combined_features = init_extractor.transform(sent_df, col_names)
     return UncertaintySampling(TextDataset(sent_df, col_names, None, features=combined_features),
                                method='lc', model=LogisticRegression())
Example No. 16
 def test_hs_subsampling(self):
     ds = Dataset(self.X, self.y[:10] + [None] * (len(self.y) - 10))
     sub_qs = UncertaintySampling(
         ds, model=SVM(gamma='auto', decision_function_shape='ovr'))
     qs = HS(ds, self.classes, subsample_qs=sub_qs, random_state=1126)
     qseq = run_qs(ds, qs, self.y, len(self.y)-10)
     assert_array_equal(
         np.concatenate([qseq[:10], qseq[-10:]]),
         np.array([120, 50, 33, 28, 78, 133, 52, 124, 102, 109,
                   81, 108, 10, 89, 126, 114, 92, 48, 25, 13])
         )
Example No. 17
    def libact_first_try_first_run(self, enriched_train_df, extractor, lbr,
                                   quota, validation_data_df, return_dict):

        trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
        qs = UncertaintySampling(trn_ds,
                                 method='lc',
                                 model=LogisticRegression())
        scoring_fun = lambda ds: run_classifier(ds.extract_labeled_dataframe(),
                                                validation_data_df).f1
        query_num, E_out1 = run_active_learning(trn_ds, scoring_fun, lbr, qs,
                                                quota)
        return_dict[1] = E_out1
Example No. 18
class UncertaintySampler(object):
    def __init__(self, X, y, labs, n=2):

        y = [yy if yy >= 0 else None for yy in y]

        self.dataset = Dataset(X, y)
        self.labs = labs

        self.uc = UncertaintySampling(self.dataset,
                                      method='lc',
                                      model=LinearSVC())
        self.n = n

    def get_next(self):
        print('get_next: start', file=sys.stderr)
        out = self.uc.make_query(n=self.n)
        print('get_next: done', file=sys.stderr)
        return out

    def set_label(self, idx, label):
        print('set_label: start', file=sys.stderr)
        out = self.dataset.update(idx, label)
        print('set_label: done', file=sys.stderr)
        return out

    def get_data(self):
        X, y = zip(*self.dataset.get_entries())
        X, y = np.vstack(X), np.array(
            [yy if yy is not None else -1 for yy in y])
        return X, y

    def n_hits(self):
        labels = np.array(list(zip(*self.dataset.get_entries()))[1])
        return (labels == 1).sum()

    def n_labeled(self):
        return self.dataset.len_labeled()

    def is_labeled(self, idx):
        labels = list(zip(*self.dataset.get_entries()))[1]
        return idx in np.where(labels)[0]

    def save(self, outpath):
        """ !! This should be updated to save in same format as simple_las """
        X, y = self.get_data()

        f = h5py.File(
            '%s-%s-%s.h5' %
            (outpath, 'uncertainty', datetime.now().strftime('%Y%m%d_%H%M%S')),
            'w')
        f['X'] = X
        f['y'] = y
        f['labs'] = self.labs
        f.close()
Example No. 19
def main():
    test_size = 0.25  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set

    result = {'E1': [], 'E2': [], 'E3': []}
    for i in range(2):
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = \
            split_train_test(test_size)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        quota = 100  # number of samples to query

        qs = UncertaintySampling(trn_ds,
                                 method='sm',
                                 model=SVM(decision_function_shape='ovr'))
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota, cost_matrix)
        result['E1'].append(E_out_1)

        qs2 = RandomSampling(trn_ds2)
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota, cost_matrix)
        result['E2'].append(E_out_2)

        qs3 = ALCE(trn_ds3, cost_matrix, SVR())
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota, cost_matrix)
        result['E3'].append(E_out_3)

    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)

    #print("Uncertainty: ", E_out_1[::5].tolist())
    #print("Random: ", E_out_2[::5].tolist())
    #print("ALCE: ", E_out_3[::5].tolist())

    query_num = np.arange(0, quota + 1)
    plt.figure(figsize=(10, 8))
    plt.plot(query_num, E_out_1, 'g', label='Uncertainty sampling')
    plt.plot(query_num, E_out_2, 'k', label='Random')
    plt.plot(query_num, E_out_3, 'r', label='ALCE')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               ncol=5)
    plt.show()
Example No. 20
 def test_ActiveLearningByLearning(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = ActiveLearningByLearning(trn_ds,
                                   T=self.quota,
                                   query_strategies=[
                                       UncertaintySampling(
                                           trn_ds,
                                           model=LogisticRegression()),
                                       HintSVM(trn_ds, random_state=1126)
                                   ],
                                   model=LogisticRegression(),
                                   random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33]))
Example No. 21
def main():
    # Specify the parameters here:
    # path to your binary classification dataset
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'diabetes.txt')
    test_size = 0.33  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 10  # number of samples that are initially labeled

    # Load dataset
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
        split_train_test(dataset_filepath, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled  # number of samples to query

    # Comparing UncertaintySampling strategy with RandomSampling.
    # model is the base learner, e.g. LogisticRegression, SVM ... etc.
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    model = LogisticRegression()
    E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)

    qs2 = RandomSampling(trn_ds2)
    model = LogisticRegression()
    E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)

    # Plot the learning curve of UncertaintySampling to RandomSampling
    # The x-axis is the number of queries, and the y-axis is the corresponding
    # error rate.
    query_num = np.arange(1, quota + 1)
    plt.plot(query_num, E_in_1, 'b', label='qs Ein')
    plt.plot(query_num, E_in_2, 'r', label='random Ein')
    plt.plot(query_num, E_out_1, 'g', label='qs Eout')
    plt.plot(query_num, E_out_2, 'k', label='random Eout')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show()
Example No. 22
    def test_ALBLTestCase(self):
        trn_ds = Dataset(
            self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
        qs = ActiveLearningByLearning(
            trn_ds,
            T=self.quota,
            query_strategies=[
                UncertaintySampling(trn_ds,
                                    model=SVM(kernel="linear",
                                              decision_function_shape="ovr")),
                QUIRE(trn_ds),
                RandomSampling(trn_ds)
            ],
            model=SVM(kernel="linear", decision_function_shape="ovr"),
            random_state=1126)

        qseq = run_qs(trn_ds, qs, self.y, self.quota)
        assert_array_equal(
            qseq, np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33]))
Example No. 23
 def test_density_weighted_meta_uncertainty_lc(self):
     trn_ds = Dataset(self.X[:20], np.concatenate([self.y[:6],
                                                   [None] * 14]))
     base_qs = UncertaintySampling(trn_ds,
                                   method='lc',
                                   model=LogisticRegression(
                                       solver='liblinear',
                                       multi_class="ovr"))
     similarity_metric = cosine_similarity
     clustering_method = KMeans(n_clusters=3, random_state=1126)
     qs = DensityWeightedMeta(dataset=trn_ds,
                              base_query_strategy=base_qs,
                              similarity_metric=similarity_metric,
                              clustering_method=clustering_method,
                              beta=1.0,
                              random_state=1126)
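      # DensityWeightedMeta scales the base strategy's informativeness by each
      # candidate's average similarity to its cluster (raised to beta)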
     model = LogisticRegression(solver='liblinear', multi_class="ovr")
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq,
                        np.array([13, 18, 9, 12, 8, 16, 10, 19, 15, 17]))
Example No. 24
def initialQuerySetup(train_dataset,
                      queryStrategyID,
                      queryParams=None,
                      fixRandomState=False):

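    # queryStrategyID mapping: 0=random, 1=uncertainty ('sm'), 2=query-by-committee,
    # 3=random batch, 4=least-certain batch, 5=semi-supervised batch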
    if queryStrategyID == 0:
        queryStrategy = RandomSampling(train_dataset,
                                       random_state=137 \
                                       if fixRandomState else None)

    elif queryStrategyID == 1:
        queryStrategy = UncertaintySampling(train_dataset,
                                            method='sm',
                                            model=queryParams[0])

    elif queryStrategyID == 2:
        queryStrategy = QueryByCommittee(train_dataset,
                                         models=queryParams[0],
                                         disagreement='vote',
                                         random_state=23 \
                                         if fixRandomState else None)
    elif queryStrategyID == 3:
        queryStrategy = RandomBatchQuery(train_dataset,
                                         batch_size=queryParams[0],
                                         random_state=2311 \
                                         if fixRandomState else None)

    elif queryStrategyID == 4:
        queryStrategy = LeastCertainBatchQuery(train_dataset,
                                               model=queryParams[0],
                                               batch_size=queryParams[1],
                                               random_state=2317 \
                                               if fixRandomState else None)

    elif queryStrategyID == 5:
        queryStrategy = SemiSupervisedBatchQuery(train_dataset,
                                                 model=queryParams[0],
                                                 batch_size=queryParams[1],
                                                 random_state=3112 \
                                                 if fixRandomState else None)

    return queryStrategy
Example No. 25
def main():
    # Specify the parameters here:
    # path to your binary classification dataset
    ds_name = 'australian'
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), '%s.txt' % ds_name)
    test_size = 0.33  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 10  # number of samples that are initially labeled
    results = []

    for T in range(20):  # repeat the experiment 20 times
        print("%dth experiment" % (T + 1))

        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
            split_train_test(dataset_filepath, test_size, n_labeled)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled  # number of samples to query

        # Comparing UncertaintySampling strategy with RandomSampling.
        # model is the base learner, e.g. LogisticRegression, SVM ... etc.
        qs = UncertaintySampling(trn_ds,
                                 model=SVM(decision_function_shape='ovr'))
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)
        results.append(E_out_1.tolist())

        qs2 = RandomSampling(trn_ds2)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)
        results.append(E_out_2.tolist())

        qs3 = QUIRE(trn_ds3)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota)
        results.append(E_out_3.tolist())

        qs4 = HintSVM(trn_ds4, cl=1.0, ch=1.0)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_4 = run(trn_ds4, tst_ds, lbr, model, qs4, quota)
        results.append(E_out_4.tolist())

        qs5 = ActiveLearningByLearning(
            trn_ds5,
            query_strategies=[
                UncertaintySampling(trn_ds5,
                                    model=SVM(kernel='linear',
                                              decision_function_shape='ovr')),
                QUIRE(trn_ds5),
                HintSVM(trn_ds5, cl=1.0, ch=1.0),
            ],
            T=quota,
            uniform_sampler=True,
            model=SVM(kernel='linear', decision_function_shape='ovr'))
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_5 = run(trn_ds5, tst_ds, lbr, model, qs5, quota)
        results.append(E_out_5.tolist())

    result = []
    for i in range(5):
        _temp = []
        for j in range(i, len(results), 5):
            _temp.append(results[j])
        result.append(np.mean(_temp, axis=0))

    # Plot the learning curve of UncertaintySampling to RandomSampling
    # The x-axis is the number of queries, and the y-axis is the corresponding
    # error rate.
    query_num = np.arange(1, quota + 1)
    plt.plot(query_num, result[0], 'g', label='uncertainty sampling')
    plt.plot(query_num, result[1], 'k', label='random')
    plt.plot(query_num, result[2], 'r', label='QUIRE')
    plt.plot(query_num, result[3], 'b', label='HintSVM')
    plt.plot(query_num, result[4], 'c', label='ALBL')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show()
Example No. 26
def main():
    global pos_filepath, dataset_filepath, csv_filepath, vectors_list, ids_list
    dataset_filepath = "/Users/dndesign/Desktop/active_learning/vecteurs_et_infos/vectors_2015.txt"
    csv_filepath = "/Users/dndesign/Desktop/active_learning/donnees/corpus_2015_id-time-text.csv"
    pos_filepath = "/Users/dndesign/Desktop/active_learning/donnees/oriane_pos_id-time-text.csv"
    vectors_list, ids_list = get_vectors_list(dataset_filepath)

    timestr = time.strftime("%Y%m%d_%H%M%S")
    text_file = codecs.open("task_" + str(timestr) + ".txt", "w", "utf-8")

    print("Loading data...")
    text_file.write("Loading data...\n")
    # Open this file
    t0 = time.time()
    file = openfile_txt(dataset_filepath)
    num_lines = sum(1 for line in file)
    print("Treating " + str(num_lines) + " entries...")
    text_file.write("Treating : %s entries...\n" % str(num_lines))

    # Number of queries to ask human to label
    quota = 10
    E_out1, E_out2, E_out3, E_out4, E_out6, E_out7 = [], [], [], [], [], []
    trn_ds, tst_ds = split_train_test(csv_filepath)

    model = SVM(kernel='linear')
    # model = LogisticRegression()

    ''' UncertaintySampling (Least Confident)
     
        UncertaintySampling : it queries the instances about which 
        it is least certain how to label
        
        Least Confident : it queries the instance whose posterior 
        probability of being positive is nearest 0.5
    '''
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression(C=.01))
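    # the SVM above is the model being evaluated; the query strategy ranks
    # candidates with its own heavily regularized (C=.01) logistic model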
    model.train(trn_ds)
    E_out1 = np.append(E_out1, 1 - model.score(tst_ds))

    ''' UncertaintySampling (Max Margin) 

    '''
    trn_ds2 = copy.deepcopy(trn_ds)
    qs2 = USampling(trn_ds2, method='mm', model=SVM(kernel='linear'))
    model.train(trn_ds2)
    E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

    ''' CMB Sampling   
        Combination of active learning algorithms (distance-based (DIST), diversity-based (DIV)) 
    '''
    trn_ds3 = copy.deepcopy(trn_ds)
    qs3 = CMBSampling(trn_ds3, model=SVM(kernel='linear'))
    model.train(trn_ds3)
    E_out3 = np.append(E_out3, 1 - model.score(tst_ds))

    ''' Random Sampling   
        Random : it chooses randomly a query
    '''
    trn_ds4 = copy.deepcopy(trn_ds)
    qs4 = RandomSampling(trn_ds4, random_state=1126)
    model.train(trn_ds4)
    E_out4 = np.append(E_out4, 1 - model.score(tst_ds))

    ''' QueryByCommittee (Vote Entropy)
    
        QueryByCommittee : it keeps a committee of classifiers and queries 
        the instance that the committee members disagree, it  also examines 
        unlabeled examples and selects only those that are most informative 
        for labeling
        
        Vote Entropy : a way of measuring disagreement 
        
        Disadvantage : it does not consider the committee members’ class 
        distributions. It also misses some informative unlabeled examples 
        to label 
    '''
    trn_ds6 = copy.deepcopy(trn_ds)
    qs6 = QueryByCommittee(trn_ds6, disagreement='vote',
                              models=[LogisticRegression(C=1.0),
                                      LogisticRegression(C=0.01),
                                      LogisticRegression(C=100)],
                              random_state=1126)
    model.train(trn_ds6)
    E_out6 = np.append(E_out6, 1 - model.score(tst_ds))

    ''' QueryByCommittee (Kullback-Leibler Divergence)
    
            QueryByCommittee : it examines unlabeled examples and selects only 
            those that are most informative for labeling
            
            Disadvantage :  it misses some examples on which committee members 
            disagree
    '''
    trn_ds7 = copy.deepcopy(trn_ds)
    qs7 = QueryByCommittee(trn_ds7, disagreement='kl_divergence',
                                  models=[LogisticRegression(C=1.0),
                                          LogisticRegression(C=0.01),
                                          LogisticRegression(C=100)],
                                  random_state=1126)
    model.train(trn_ds7)
    E_out7 = np.append(E_out7, 1 - model.score(tst_ds))

    with sns.axes_style("darkgrid"):
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, E_out1, 'red')
    p2, = ax.plot(query_num, E_out2, 'blue')
    p3, = ax.plot(query_num, E_out3, 'green')
    p4, = ax.plot(query_num, E_out4, 'orange')
    p6, = ax.plot(query_num, E_out6, 'black')
    p7, = ax.plot(query_num, E_out7, 'purple')
    plt.legend(('Least Confident', 'Max Margin', 'Distance Diversity CMB', 'Random Sampling', 'Vote Entropy', 'KL Divergence'), loc=1)
    plt.ylabel('Accuracy')
    plt.xlabel('Number of Queries')
    plt.title('Active Learning - Query choice strategies')
    plt.ylim([0, 1])
    plt.show(block=False)

    for i in range(quota):
        print("\n#################################################")
        print("Query number " + str(i) + " : ")
        print("#################################################\n")
        text_file.write("\n#################################################\n")
        text_file.write("Query number %s : " % str(i))
        text_file.write("\n#################################################\n")

        ask_id = qs.make_query()
        print("\033[4mUsing Uncertainty Sampling (Least confident) :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " + str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using Uncertainty Sampling (Least confident) :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" % str(simulate_human_decision(ask_id)))
        trn_ds.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds)
        E_out1 = np.append(E_out1, 1 - model.score(tst_ds))

        ask_id = qs2.make_query()
        print("\033[4mUsing Uncertainty Sampling (Max Margin) :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " + str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using Uncertainty Sampling (Smallest Margin) :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" % str(simulate_human_decision(ask_id)))
        trn_ds2.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds2)
        E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

        ask_id = qs3.make_query()
        print("\033[4mUsing CMB Distance-Diversity Sampling :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " + str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using Uncertainty Sampling (Entropy) :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" % str(simulate_human_decision(ask_id)))
        trn_ds3.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds3)
        E_out3 = np.append(E_out3, 1 - model.score(tst_ds))

        ask_id = qs4.make_query()
        print("\033[4mUsing Random Sampling :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " + str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using Random Sampling :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" % str(simulate_human_decision(ask_id)))
        trn_ds4.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds4)
        E_out4 = np.append(E_out4, 1 - model.score(tst_ds))

        ask_id = qs6.make_query()
        print("\033[4mUsing QueryByCommittee (Vote Entropy) :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " + str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using QueryByCommittee (Vote Entropy) :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" % str(simulate_human_decision(ask_id)))
        trn_ds6.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds6)
        E_out6 = np.append(E_out6, 1 - model.score(tst_ds))

        ask_id = qs7.make_query()
        print("\033[4mUsing QueryByCommittee (KL Divergence) :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " + str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using QueryByCommittee (KL Divergence) :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" % str(simulate_human_decision(ask_id)))
        trn_ds7.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds7)
        E_out7 = np.append(E_out7, 1 - model.score(tst_ds))

        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(E_out1), max(E_out2), max(E_out3), max(E_out4), max(E_out6), max(E_out7)) + 0.2))
        query_num = np.arange(0, i + 2)
        p1.set_xdata(query_num)
        p1.set_ydata(E_out1)
        p2.set_xdata(query_num)
        p2.set_ydata(E_out2)
        p3.set_xdata(query_num)
        p3.set_ydata(E_out3)
        p4.set_xdata(query_num)
        p4.set_ydata(E_out4)
        p6.set_xdata(query_num)
        p6.set_ydata(E_out6)
        p7.set_xdata(query_num)
        p7.set_ydata(E_out7)

        plt.draw()

    t2 = time.time()
    time_total = t2 - t0
    print("\n\n\n#################################################\n")
    print("Execution time : %fs \n\n" % time_total)
    text_file.write("\n\n\n#################################################\n")
    text_file.write("Execution time : %fs \n" % time_total)
    text_file.close()
    input("Press any key to save the plot...")
    plt.savefig('task_' + str(timestr) + '.png')

    print("Done")
Example No. 27
def main():
    quota = 10  # ask human to label 10 samples
    n_classes = 5
    E_out1, E_out2 = [], []

    trn_ds, tst_ds, ds = split_train_test(n_classes)
    trn_ds2 = copy.deepcopy(trn_ds)
    # print(trn_ds.get_entries())
    # print(len(trn_ds))
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)

    model = LogisticRegression()

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    model.train(trn_ds)
    E_out1 = np.append(E_out1, 1 - model.score(tst_ds))
    model.train(trn_ds2)
    E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, E_out1, 'g', label='qs Eout')
    p2, = ax.plot(query_num, E_out2, 'k', label='random Eout')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show(block=False)

    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position(
        [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9])
    # Give each label its name (labels are from 0 to n_classes-1)
    lbr = InteractiveLabeler(label_name=[str(lbl) for lbl in range(n_classes)])

    for i in range(quota):
        ask_id = qs.make_query()
        print("asking sample from Uncertainty Sampling")
        # reshape the image to its width and height
        lb = lbr.label(trn_ds.data[ask_id][0].reshape(8, 8))
        trn_ds.update(ask_id, lb)
        model.train(trn_ds)
        E_out1 = np.append(E_out1, 1 - model.score(tst_ds))

        ask_id = qs2.make_query()
        print("asking sample from Random Sample")
        lb = lbr.label(trn_ds2.data[ask_id][0].reshape(8, 8))
        trn_ds2.update(ask_id, lb)
        model.train(trn_ds2)
        E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(E_out1), max(E_out2)) + 0.2))
        query_num = np.arange(0, i + 2)
        p1.set_xdata(query_num)
        p1.set_ydata(E_out1)
        p2.set_xdata(query_num)
        p2.set_ydata(E_out2)

        plt.draw()

    input("Press any key to continue...")
Example No. 28
def getQueryStrategy(query_strategy,
                     train_ds,
                     disagreement,
                     estimator_name=None):
    print('Initialize Query Strategy')
    # no committee but baseline query strategy
    if query_strategy == 'uncertainty':
        qs = UncertaintySampling(train_ds,
                                 method='lc',
                                 model=la.LogisticRegression_())
    # no committee but baseline query strategy
    elif query_strategy == 'random':
        qs = RandomSampling(train_ds)
    elif query_strategy == 'lr_lsvc_rf_dt':
        if disagreement == 'kl_divergence':
            raise ValueError(
                'when using kl_divergence, lsvc cannot be in the committee, '
                'as LinearSVC does not provide predict_proba(). '
                'Use svc instead or change disagreement to vote!')
        qs = QueryByCommittee(train_ds,
                              models=[
                                  la.RandomForest_(),
                                  la.DecisionTree_(),
                                  la.LogisticRegression_(solver='liblinear',
                                                         max_iter=1000),
                                  la.LinearSVC_()
                              ],
                              disagreement=disagreement)
    # committee with probabilistic models (SVC with prob=True used here instead of LinearSVC)
    elif query_strategy == 'lr_svc_rf_dt':
        qs = QueryByCommittee(train_ds,
                              models=[
                                  la.RandomForest_(),
                                  la.DecisionTree_(),
                                  la.LogisticRegression_(solver='liblinear',
                                                         max_iter=1000),
                                  la.SVC_(kernel='linear', probability=True)
                              ],
                              disagreement=disagreement)
    elif query_strategy == 'lr_svc_dt_xgb':
        qs = QueryByCommittee(
            train_ds,
            models=[
                la.LogisticRegression_(solver='liblinear', max_iter=1000),
                la.SVC_(kernel='linear', probability=True),
                la.DecisionTree_(),
                la.XGBClassifier_(objective="binary:logistic")
            ],
            disagreement=disagreement)
    # committee of five
    elif query_strategy == 'lr_svc_dt_xgb_rf':
        qs = QueryByCommittee(
            train_ds,
            models=[
                la.LogisticRegression_(solver='liblinear', max_iter=1000),
                la.SVC_(kernel='linear', probability=True),
                la.DecisionTree_(),
                la.XGBClassifier_(objective="binary:logistic"),
                la.RandomForest_()
            ],
            disagreement=disagreement)
    elif query_strategy == 'lr_lsvc_dt_gpc':
        if disagreement == 'kl_divergence':
            raise ValueError(
                'when using kl_divergence, lsvc cannot be in the committee, '
                'as LinearSVC does not provide predict_proba(). '
                'Use svc instead or change disagreement to vote!')
        qs = QueryByCommittee(train_ds,
                              models=[
                                  la.LogisticRegression_(solver='liblinear',
                                                         max_iter=1000),
                                  la.LinearSVC_(),
                                  la.DecisionTree_(),
                                  la.GaussianProcess_()
                              ],
                              disagreement=disagreement)
    elif query_strategy == 'lr_lsvc_dt_xgb':
        if disagreement == 'kl_divergence':
            raise ValueError(
                'when using kl_divergence, lsvc cannot be in the committee, '
                'as LinearSVC does not provide predict_proba(). '
                'Use svc instead or change disagreement to vote!')
        qs = QueryByCommittee(
            train_ds,
            models=[
                la.LogisticRegression_(solver='liblinear', max_iter=1000),
                la.LinearSVC_(),
                la.DecisionTree_(),
                la.XGBClassifier_(objective="binary:logistic")
            ],
            disagreement=disagreement)
    elif query_strategy == 'homogeneous_committee':
        committee = CommitteeModels(estimator_name)
        qs = QueryByCommittee(train_ds, models=committee.committee['models'])
    else:
        print("Query strategy not defined!")
        return None
    return qs
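
A quick usage sketch (hypothetical names; assumes `train_ds` is a libact Dataset and `la` is the adapter module imported above):

qs = getQueryStrategy('lr_svc_rf_dt', train_ds, disagreement='vote')
if qs is not None:
    ask_id = qs.make_query()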
Example No. 29
def main(args):

    acc_pool = []
    maxlen = 100

    # get the texts and their corresponding labels
    texts, labels = load_ptsd_data()

    # Keras example
    # # transform data into matrix of integers
    # tokenizer = Tokenizer()
    # tokenizer.fit_on_texts(texts)
    # sequences = tokenizer.texts_to_sequences(texts)
    # data = pad_sequences(sequences,
    #                      maxlen=maxlen,
    #                      padding='post', truncating='post')

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from libact.models import SklearnProbaAdapter, SklearnAdapter

    from sklearn.naive_bayes import MultinomialNB
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression

    # count words
    count_vect = CountVectorizer(max_features=5000, stop_words='english')
    features = count_vect.fit_transform(texts).todense().tolist()

    # import pdb; pdb.set_trace()
    if 0:
        # tf-idf
        tfidf_transformer = TfidfTransformer()
        features = tfidf_transformer.fit_transform(features)

    pool, pool_ideal = make_pool(
        features, labels,
        prelabeled=[1, 2, 3, 4, 5, 218, 260, 466, 532, 564]
    )

    # get the model
    if args.model.lower() in ['multinomialnb', 'nb']:
        sklearn_model = MultinomialNB
        kwargs_model = {}
    elif args.model.lower() == 'svc':
        sklearn_model = SVC
        kwargs_model = {
            'probability': True,
            # 'class_weight': {0: 1, 1: 100}
            'class_weight': 'balanced' 
        }
    elif args.model.lower() == 'logisticregression':
        sklearn_model = LogisticRegression
        kwargs_model = {}
    else:
        raise ValueError('Model not found.')

    # initialize the model through the adapter
    model = SklearnProbaAdapter(sklearn_model(**kwargs_model))

    # query strategy
    # https://libact.readthedocs.io/en/latest/libact.query_strategies.html
    # #libact-query-strategies-uncertainty-sampling-module
    #
    # least confidence (lc), it queries the instance whose posterior
    # probability of being positive is nearest 0.5 (for binary
    # classification); smallest margin (sm), it queries the instance whose
    # posterior probability gap between the most and the second probable
    # labels is minimal
    qs = UncertaintySampling(
        pool, method='lc', model=SklearnProbaAdapter(sklearn_model(**kwargs_model)))
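    # a smallest-margin variant would swap method='lc' for method='sm'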

    # The passive learning model. The model given in the query strategy is not
    # the same. Have a look at this one.
    # model = LogisticRegression()

    fig, ax = plt.subplots()
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Value')

    # Train the model on the train dataset.
    model.train(pool)

    # the accuracy of the entire pool
    acc_pool = np.append(
        acc_pool,
        model._model.score([x[0] for x in pool.get_entries()], labels)
    )

    # make plot
    query_num = np.arange(0, 1)
    p2, = ax.plot(query_num, acc_pool, 'r', label='Accuracy')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True,
               shadow=True, ncol=5)
    plt.show(block=False)

    # Give each label its name (labels are from 0 to n_classes-1)
    if args.interactive:
        lbr = InteractivePaperLabeler(label_name=["0", "1"])
    else:
        lbr = IdealLabeler(dataset=pool_ideal)

    query_i = 1

    while query_i <= args.quota:

        # make a query from the pool
        print("Asking sample from pool with Uncertainty Sampling")
        ask_id = qs.make_query()
        print("Index {} returned. True label is {}.".format(
            ask_id, pool_ideal.data[ask_id][1]))

        # get the paper
        data_point = pool.data[ask_id][0]
        lb = lbr.label(data_point)

        # update the label in the train dataset
        pool.update(ask_id, lb)

        # train the model again
        model.train(pool)

        # append the score to the model
        acc_pool = np.append(
            acc_pool,
            model._model.score([x[0] for x in pool.get_entries()], labels)
        )

        # additional evaluations
        #pred = model.predict([x[0] for x in pool.get_entries()])
		
        idx_features = pool.get_unlabeled_entries()
        features = [x[1] for x in idx_features]
        idx= [x[0] for x in idx_features]
        pred = model.predict(features)

        print(confusion_matrix(labels[idx], pred))
        print(recall_score(labels[idx], pred))

        if args.interactive:
            # update plot
            ax.set_xlim((0, query_i))
            ax.set_ylim((0, max(acc_pool) + 0.2))
            p2.set_xdata(np.arange(0, query_i + 1))
            p2.set_ydata(acc_pool)
            plt.draw()

        # update the query counter
        query_i += 1

    if not args.interactive:
        # update plot
        ax.set_xlim((0, query_i - 1))
        ax.set_ylim((0, max(acc_pool) + 0.2))
        p2.set_xdata(np.arange(0, query_i))
        p2.set_ydata(acc_pool)
        plt.draw()

    print(acc_pool)

    input("Press any key to continue...")
Example No. 30
def main(args):
    pickle_file_name = args.dataset + '_pickle.pickle'
    pickle_file_path = os.path.join(TEMP_DATA_DIR, pickle_file_name)

    seed = 2018 * args.T
    if args.dataset == 'ptsd':
        texts, lbls = load_ptsd_data()
    else:
        texts, lbls = load_drug_data(args.dataset)

    # get the texts and their corresponding labels
    textManager = TextManager()
    data, labels, word_index = textManager.sequence_maker(texts, lbls)
    max_num_words = textManager.max_num_words
    max_sequence_length = textManager.max_sequence_length

    prelabeled_index = select_prelabeled(labels, args.init_included_papers,
                                         seed)
    # [1, 2, 3, 4, 5, 218, 260, 466, 532, 564]
    print('prelabeled_index', prelabeled_index)
    pool, pool_ideal = make_pool(data, labels, prelabeled=prelabeled_index)

    if os.path.isfile(pickle_file_path):
        embedding_layer = load_pickle(pickle_file_path)
    else:
        if not os.path.exists(TEMP_DATA_DIR):
            os.makedirs(TEMP_DATA_DIR)

        embedding = Word2VecEmbedding(word_index, max_num_words,
                                      max_sequence_length)
        embedding.load_word2vec_data(GLOVE_PATH)
        embedding_layer = embedding.build_embedding()
        dump_pickle(embedding_layer, pickle_file_path)
    # get the model
    if args.model.lower() == 'lstm':
        deep_model = LSTM_Libact
        kwargs_model = {
            'backwards': True,
            'dropout': 0.4,
            'optimizer': 'rmsprop',
            'max_sequence_length': max_sequence_length,
            'embedding_layer': embedding_layer
        }
    else:
        raise ValueError('Model not found.')

    model = deep_model(**kwargs_model)

    #     # query strategy
    #     # https://libact.readthedocs.io/en/latest/libact.query_strategies.html
    #     # #libact-query-strategies-uncertainty-sampling-module
    #     #
    #     # least confidence (lc), it queries the instance whose posterior
    #     # probability of being positive is nearest 0.5 (for binary
    #     # classification); smallest margin (sm), it queries the instance whose
    #     # posterior probability gap between the most and the second probable
    #     # labels is minimal
    #     qs = UncertaintySampling(
    #         pool, method='lc', model=SklearnProbaAdapter(sklearn_model(**kwargs_model)))

    # TODO: check if 'lc' works correctly / add random as well
    qs = UncertaintySampling(pool,
                             method='lc',
                             model=deep_model(**kwargs_model))
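    # 'lc' needs continuous or probabilistic outputs, so the deep model must
    # implement libact's ContinuousModel/ProbabilisticModel interface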

    # Give each label its name (labels are from 0 to n_classes-1)
    if args.interactive:
        lbr = InteractivePaperLabeler(label_name=["0", "1"])
    else:
        lbr = IdealLabeler(dataset=pool_ideal)

    result_df = pd.DataFrame({'label': [x[1] for x in pool_ideal.data]})
    query_i = 1
    # TODO: add multiple papers to the labeled dataset in batches of batch_size
    while query_i <= args.quota:

        # make a query from the pool
        print("Asking sample from pool with Uncertainty Sampling")
        # unlabeled_entry = pool.get_unlabeled_entries()

        ask_id = qs.make_query()
        print("Index {} returned. True label is {}.".format(
            ask_id, pool_ideal.data[ask_id][1]))

        # get the paper
        data_point = pool.data[ask_id][0]
        lb = lbr.label(data_point)

        # update the label in the train dataset
        pool.update(ask_id, lb)
        # train the model again
        # to_read_mean, to_read_std = cross_validation(model,pool,split_no=3,seed =query_i)
        model.train(pool)

        idx_features = pool.get_unlabeled_entries()
        idx = [x[0] for x in idx_features]
        features = [x[1] for x in idx_features]
        pred = model.predict(features)

        c_name = str(query_i)
        result_df[c_name] = -1
        result_df.loc[idx, c_name] = pred[:, 1]

        # update the query counter
        query_i += 1

    # save the result to a file
    output_dir = os.path.join(ACTIVE_DIR, args.dataset)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    export_path = os.path.join(output_dir,
                               'sr_lstm_active{}.csv'.format(args.T))

    result_df.to_csv(export_path)
    input("Press any key to continue...")
Example No. 31
    sents = sent_tokenize(line, language='russian')
    tokenized_texts.append(sents)
    for s in sents:
        vocab[s] = line

tfidf = TfidfVectorizer()
# create the vectorizer and get the X
x = tfidf.fit_transform(itertools.chain(*tokenized_texts))
# form y by randomly filling in a few classes; avoiding this would require
# the Dataset class to support labeling a dataset from scratch
y = np.array([0, 1, 0, 1, 0, 1, 0, 1] + [None] * (x.shape[0] - 8))
# create the Dataset object that libact works with
dataset = Dataset(x, y)
# Create strategy
qs = UncertaintySampling(dataset, method='lc', model=LogisticRegression())
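# every request to '/' below asks the strategy for the next most-uncertain sentence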
# create list of sentences
texts = list(itertools.chain(*tokenized_texts))


@app.route('/')
def show_entries():
    # Ask which sample has to be labeled
    # and render the page for the first time
    ask_id = qs.make_query()
    session['ask_id'] = int(ask_id)
    return render_template('show.html',
                           text=vocab[texts[ask_id]],
                           sentence=texts[ask_id])