def make_query(self):
        """Select a batch of ``self.batch_size_`` entry ids to query.

        The selection runs on deep copies of ``self.dataset`` and
        ``self.model``, so neither of them is modified. After each pick the
        copied dataset receives a label sampled from the copied model's
        predicted probabilities, so the next pick sees the batch so far as
        if it had already been answered.

        Returns
        -------
        list
            The ids returned by the uncertainty sampler, in pick order.
        """
        scratch_ds = copy.deepcopy(self.dataset)
        scratch_model = copy.deepcopy(self.model)

        # Per the original author's notes, the sampler fits the model when it
        # is constructed and again on every make_query() call, so no explicit
        # retraining is done inside the loop.
        sampler = UncertaintySampling(scratch_ds, model=scratch_model)

        batch = []
        for _ in range(self.batch_size_):
            picked = sampler.make_query()
            batch.append(picked)

            row = scratch_ds.get_entries()[picked][0].reshape(1, -1)
            p_first = scratch_model.predict_proba(row)[0][0]

            # Hard-coded binary labels: the first probability column is
            # treated as label 0, anything else becomes label 1.
            guess = 0 if self.random_state_.rand() < p_first else 1
            scratch_ds.update(picked, guess)

        return batch
예제 #2
0
    def libact_first_try_second_run(self, enriched_train_df, extractor,
                                    ideal_df, lbr, quota, validation_data_df,
                                    return_dict):
        """Run up to ``quota`` uncertainty-sampling queries and record F1.

        The F1 of ``run_classifier`` on the currently labeled rows is recorded
        once before the first query and once after every answered query. Each
        answer from ``lbr`` is asserted to equal the tag stored in
        ``ideal_df``. The resulting score array is stored in
        ``return_dict[2]``; nothing is returned.
        """
        trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)

        def current_f1():
            # F1 on the validation data using only the rows labeled so far.
            return run_classifier(trn_ds.extract_labeled_dataframe(),
                                  validation_data_df).f1

        qs = UncertaintySampling(trn_ds,
                                 method='lc',
                                 model=LogisticRegression())

        scores = np.append([], current_f1())
        for _ in range(quota):
            if len(trn_ds.get_unlabeled_entries()) == 0:
                # Every example has a label already; nothing left to ask.
                break
            ask_id = qs.make_query()
            answer = lbr.label(trn_ds.extract_sentence(ask_id))
            self.assertEqual(answer, ideal_df[cn.tag_col][ask_id])
            trn_ds.update(ask_id, answer)
            scores = np.append(scores, current_f1())

        return_dict[2] = scores
예제 #3
0
 def getUncertaintyIndex(self, trn_ds, method, clf):
     """Return unlabeled indices ordered by the sampler's score, highest first.

     An ``UncertaintySampling`` strategy is built on ``trn_ds`` with the given
     ``method`` and ``clf``; its ``make_query(return_score=True)`` call yields
     an iterable of pairs. The pairs are sorted by their second element in
     descending order and the first element of each pair is used to index
     ``self.unlabeled_index_``.

     Returns
     -------
     list
         Entries of ``self.unlabeled_index_`` in descending score order.
     """
     # Parenthesised single-argument print produces the same output on
     # Python 2 and is valid syntax on Python 3 (the bare statement is not).
     print("[Trainer-Selection] Get uncertainty sampling index.")
     qs = UncertaintySampling(trn_ds, method=method, model=clf)
     _, score = qs.make_query(return_score=True)
     ranked = sorted(score, key=lambda pair: pair[1], reverse=True)
     return [self.unlabeled_index_[pair[0]] for pair in ranked]
예제 #4
0
파일: label_digits.py 프로젝트: ckbjimmy/py
def main():
    """Interactively compare uncertainty sampling with random sampling.

    Two copies of the same training pool are labeled side by side: one picks
    its queries with ``UncertaintySampling`` (least confident), the other
    with ``RandomSampling``. After every answer the shared
    ``LogisticRegression`` model is retrained on the corresponding pool and
    its test error (``1 - score``) is appended to that pool's error history.
    """
    n_queries = 10  # how many samples each strategy asks the human to label
    n_classes = 5

    trn_ds, tst_ds, ds = split_train_test(n_classes)
    trn_ds2 = copy.deepcopy(trn_ds)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)

    model = LogisticRegression()

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    # Baseline error of each pool before any query has been answered.
    errors = []
    for pool in (trn_ds, trn_ds2):
        model.train(pool)
        errors.append(np.append([], 1 - model.score(tst_ds)))

    xs = np.arange(0, 1)
    line_qs, = ax.plot(xs, errors[0], 'g', label='qs Eout')
    line_rnd, = ax.plot(xs, errors[1], 'k', label='random Eout')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True,
               shadow=True, ncol=5)
    plt.show(block=False)

    # Second subplot, shrunk slightly and shifted down.
    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position([box.x0, box.y0 - box.height * 0.1, box.width,
                         box.height * 0.9])

    # Label names are the class numbers 0 .. n_classes-1 as strings.
    lbr = InteractiveLabeler(label_name=list(map(str, range(n_classes))))

    arms = (
        (qs, trn_ds, "asking sample from Uncertainty Sampling"),
        (qs2, trn_ds2, "asking sample from Random Sample"),
    )
    for _ in range(n_queries):
        for k, (strategy, pool, notice) in enumerate(arms):
            ask_id = strategy.make_query()
            print(notice)
            # The feature vector is shown to the labeler as an 8x8 image.
            lb = lbr.label(pool.data[ask_id][0].reshape(8, 8))
            pool.update(ask_id, lb)
            model.train(pool)
            errors[k] = np.append(errors[k], 1 - model.score(tst_ds))
예제 #5
0
class UncertaintySampler(object):
    """Wrapper around a ``Dataset`` and a least-confident ``UncertaintySampling``.

    Labels are given as numbers where a negative value means "unlabeled";
    internally those are stored as ``None`` and converted back to ``-1`` by
    :meth:`get_data`.
    """

    def __init__(self, X, y, labs, n=2):
        """Build the dataset and the query strategy.

        Parameters
        ----------
        X : features handed to ``Dataset`` unchanged.
        y : iterable of labels; negative (or ``None``) entries are unlabeled.
        labs : stored as ``self.labs`` and written out by :meth:`save`.
        n : value passed as ``n=`` to ``make_query`` in :meth:`get_next`.
        """
        # `None >= 0` raises on Python 3, so test for None explicitly.
        y = [yy if yy is not None and yy >= 0 else None for yy in y]

        self.dataset = Dataset(X, y)
        self.labs = labs

        self.uc = UncertaintySampling(self.dataset,
                                      method='lc',
                                      model=LinearSVC())
        self.n = n

    def _labels(self):
        """Return the label of every dataset entry (``None`` = unlabeled)."""
        return [entry[1] for entry in self.dataset.get_entries()]

    def get_next(self):
        """Ask the query strategy for the next query and return its result."""
        # sys.stderr.write emits the same text as the Python 2
        # `print >> sys.stderr` statement and also works on Python 3.
        sys.stderr.write('get_next: start\n')
        out = self.uc.make_query(n=self.n)
        sys.stderr.write('get_next: done\n')
        return out

    def set_label(self, idx, label):
        """Record ``label`` for entry ``idx``; returns ``Dataset.update``'s result."""
        sys.stderr.write('set_label: start\n')
        out = self.dataset.update(idx, label)
        sys.stderr.write('set_label: done\n')
        return out

    def get_data(self):
        """Return ``(X, y)`` as arrays, with unlabeled entries encoded as -1."""
        X, y = zip(*self.dataset.get_entries())
        X, y = np.vstack(X), np.array(
            [yy if yy is not None else -1 for yy in y])
        return X, y

    def n_hits(self):
        """Return how many entries carry the label ``1``."""
        # Build the list explicitly: zip() is not subscriptable on Python 3.
        labels = np.array(self._labels())
        return (labels == 1).sum()

    def n_labeled(self):
        """Return the dataset's count of labeled entries."""
        return self.dataset.len_labeled()

    def is_labeled(self, idx):
        """Return True when entry ``idx`` has a label (including label 0).

        The previous implementation used ``np.where`` on the raw labels,
        which treats the valid label ``0`` as falsy and therefore reported
        such entries as unlabeled.
        """
        labeled_positions = [
            pos for pos, label in enumerate(self._labels())
            if label is not None
        ]
        return idx in labeled_positions

    def save(self, outpath):
        """Write X, y and labs to a timestamped HDF5 file.

        !! This should be updated to save in same format as simple_las
        """
        X, y = self.get_data()

        path = '%s-%s-%s.h5' % (
            outpath, 'uncertainty', datetime.now().strftime('%Y%m%d_%H%M%S'))
        # Mode 'a' spells out the read/write-or-create default of h5py < 3.0
        # (h5py 3 opens read-only by default); `with` closes the file even
        # when one of the writes fails.
        with h5py.File(path, 'a') as f:
            f['X'] = X
            f['y'] = y
            f['labs'] = self.labs
def libact_uncertainty(X, y, n_queries):
    """Run ``n_queries`` rounds of least-confident uncertainty sampling.

    Only the entries at positions 0, 50 and 100 start out labeled (with
    labels 0, 1 and 2); every other entry starts as ``None``. Each round
    asks the strategy for an index, obtains its label from an
    ``IdealLabeler`` built on the fully labeled dataset, stores the label in
    the training dataset and retrains the learner. Nothing is returned.
    """
    seed_labels = np.array([None] * len(y))
    for position, label in ((0, 0), (50, 1), (100, 2)):
        seed_labels[position] = label

    train_ds = Dataset(X, seed_labels)
    full_ds = Dataset(X, y)

    # Alternative learner tried by the author: SVM(gamma='auto', probability=True)
    learner = LogisticRegressionLibact(solver='liblinear',
                                       n_jobs=1,
                                       multi_class='ovr')
    strategy = UncertaintySampling(train_ds, model=learner, method='lc')
    oracle = IdealLabeler(full_ds)
    learner.train(train_ds)

    for _ in range(n_queries):
        picked = strategy.make_query()
        train_ds.update(picked, oracle.label(X[picked]))
        learner.train(train_ds)
예제 #7
0
def _retrain_and_score(model, train_ds, test_ds):
    """Retrain ``model`` on ``train_ds`` and return ``1 - model.score(test_ds)``."""
    model.train(train_ds)
    return 1 - model.score(test_ds)


def _new_arm(title, legend, color, dataset, strategy, model, tst_ds):
    """Bundle one query strategy with its dataset, labels and error history.

    The error history starts with the error measured before any query.
    """
    return {
        'title': title,        # heading used on the console and in the log
        'legend': legend,      # entry shown in the plot legend
        'color': color,        # matplotlib colour of the curve
        'dataset': dataset,
        'strategy': strategy,
        'errors': [_retrain_and_score(model, dataset, tst_ds)],
    }


def main():
    """Compare six query strategies on the tweet corpus and plot their error.

    Each strategy works on its own copy of the training set. For ``quota``
    rounds every strategy picks one entry, the answer is obtained from
    ``simulate_human_decision``, the shared linear SVM is retrained on that
    strategy's dataset and its test error (``1 - score``) is recorded. The
    progress is printed, mirrored into ``task_<timestamp>.txt`` and plotted;
    the plot is saved as ``task_<timestamp>.png`` at the end.

    Side effects: sets the module globals ``pos_filepath``,
    ``dataset_filepath``, ``csv_filepath``, ``vectors_list`` and
    ``ids_list``.
    """
    global pos_filepath, dataset_filepath, csv_filepath, vectors_list, ids_list
    dataset_filepath = "/Users/dndesign/Desktop/active_learning/vecteurs_et_infos/vectors_2015.txt"
    csv_filepath = "/Users/dndesign/Desktop/active_learning/donnees/corpus_2015_id-time-text.csv"
    pos_filepath = "/Users/dndesign/Desktop/active_learning/donnees/oriane_pos_id-time-text.csv"
    vectors_list, ids_list = get_vectors_list(dataset_filepath)

    timestr = time.strftime("%Y%m%d_%H%M%S")
    banner = "#################################################"

    # `with` guarantees the log is flushed and closed even if a query,
    # a training step or the plotting raises half-way through.
    with codecs.open("task_" + str(timestr) + ".txt", "w", "utf-8") as text_file:
        print("Loading data...")
        text_file.write("Loading data...\n")
        t0 = time.time()
        # NOTE(review): openfile_txt is defined elsewhere; if it returns an
        # open file handle it should be closed after counting - confirm.
        entries = openfile_txt(dataset_filepath)
        num_lines = sum(1 for _ in entries)
        print("Treating " + str(num_lines) + " entries...")
        text_file.write("Treating : %s entries...\n" % str(num_lines))

        # Number of queries to ask the (simulated) human to label.
        quota = 10
        trn_ds, tst_ds = split_train_test(csv_filepath)

        model = SVM(kernel='linear')
        # model = LogisticRegression()

        arms = []

        # Uncertainty sampling, least confident: queries the instance whose
        # posterior probability of being positive is nearest 0.5.
        qs = UncertaintySampling(trn_ds, method='lc',
                                 model=LogisticRegression(C=.01))
        arms.append(_new_arm("Using Uncertainty Sampling (Least confident) :",
                             'Least Confident', 'red',
                             trn_ds, qs, model, tst_ds))

        # Uncertainty sampling, max margin. The log used to call this arm
        # "Smallest Margin" while the console and legend said "Max Margin";
        # one title is now used everywhere.
        trn_ds2 = copy.deepcopy(trn_ds)
        qs2 = USampling(trn_ds2, method='mm', model=SVM(kernel='linear'))
        arms.append(_new_arm("Using Uncertainty Sampling (Max Margin) :",
                             'Max Margin', 'blue',
                             trn_ds2, qs2, model, tst_ds))

        # CMB sampling: combination of distance-based (DIST) and
        # diversity-based (DIV) active learning. The log previously
        # mislabeled this arm as "Uncertainty Sampling (Entropy)".
        trn_ds3 = copy.deepcopy(trn_ds)
        qs3 = CMBSampling(trn_ds3, model=SVM(kernel='linear'))
        arms.append(_new_arm("Using CMB Distance-Diversity Sampling :",
                             'Distance Diversity CMB', 'green',
                             trn_ds3, qs3, model, tst_ds))

        # Random sampling: picks a query at random.
        trn_ds4 = copy.deepcopy(trn_ds)
        qs4 = RandomSampling(trn_ds4, random_state=1126)
        arms.append(_new_arm("Using Random Sampling :",
                             'Random Sampling', 'orange',
                             trn_ds4, qs4, model, tst_ds))

        # Query by committee, vote entropy: queries the instance the
        # committee members disagree on most. Vote entropy ignores the
        # members' class distributions.
        trn_ds6 = copy.deepcopy(trn_ds)
        qs6 = QueryByCommittee(trn_ds6, disagreement='vote',
                               models=[LogisticRegression(C=1.0),
                                       LogisticRegression(C=0.01),
                                       LogisticRegression(C=100)],
                               random_state=1126)
        arms.append(_new_arm("Using QueryByCommittee (Vote Entropy) :",
                             'Vote Entropy', 'black',
                             trn_ds6, qs6, model, tst_ds))

        # Query by committee, Kullback-Leibler divergence.
        trn_ds7 = copy.deepcopy(trn_ds)
        qs7 = QueryByCommittee(trn_ds7, disagreement='kl_divergence',
                               models=[LogisticRegression(C=1.0),
                                       LogisticRegression(C=0.01),
                                       LogisticRegression(C=100)],
                               random_state=1126)
        arms.append(_new_arm("Using QueryByCommittee (KL Divergence) :",
                             'KL Divergence', 'purple',
                             trn_ds7, qs7, model, tst_ds))

        with sns.axes_style("darkgrid"):
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)

        query_num = np.arange(0, 1)
        for arm in arms:
            (curve,) = ax.plot(query_num, arm['errors'], arm['color'])
            arm['line'] = curve
        plt.legend(tuple(arm['legend'] for arm in arms), loc=1)
        # The curves hold 1 - score, i.e. the error, not the accuracy.
        plt.ylabel('Error')
        plt.xlabel('Number of Queries')
        plt.title('Active Learning - Query choice strategies')
        plt.ylim([0, 1])
        plt.show(block=False)

        for i in range(quota):
            print("\n" + banner)
            print("Query number " + str(i) + " : ")
            print(banner + "\n")
            text_file.write("\n" + banner + "\n")
            text_file.write("Query number %s : " % str(i))
            text_file.write("\n" + banner + "\n")

            for arm in arms:
                ask_id = arm['strategy'].make_query()
                # Look both up once so the label that is printed and logged
                # is exactly the label applied to the dataset.
                tweet = define_tweet_by_id(ask_id)
                answer = simulate_human_decision(ask_id)

                print("\033[4m" + arm['title'] + "\033[0m")
                print("Tweet :" + tweet, end='', flush=True)
                print("Simulating human response : " + str(answer) + " \n")
                text_file.write(arm['title'] + "\n")
                text_file.write("Tweet : %s \n" % str(tweet))
                text_file.write("Simulating human response : %s \n\n" % str(answer))

                arm['dataset'].update(ask_id, answer)
                arm['errors'].append(
                    _retrain_and_score(model, arm['dataset'], tst_ds))

            # Grow the axes and extend every curve by the new point.
            ax.set_xlim((0, i + 1))
            ax.set_ylim((0, max(max(arm['errors']) for arm in arms) + 0.2))
            query_num = np.arange(0, i + 2)
            for arm in arms:
                arm['line'].set_xdata(query_num)
                arm['line'].set_ydata(arm['errors'])

            plt.draw()

        time_total = time.time() - t0
        print("\n\n\n" + banner + "\n")
        print("Execution time : %fs \n\n" % time_total)
        text_file.write("\n\n\n" + banner + "\n")
        text_file.write("Execution time : %fs \n" % time_total)

    input("Press any key to save the plot...")
    plt.savefig('task_' + str(timestr) + '.png')

    print("Done")
예제 #8
0
def main():
    """Interactively compare uncertainty sampling with random sampling.

    Two copies of the same training pool are labeled side by side: one picks
    its queries with ``UncertaintySampling`` (least confident), the other
    with ``RandomSampling``. After every answer the shared
    ``LogisticRegression`` model is retrained on the corresponding pool, its
    test error (``1 - score``) is recorded, and both error curves are
    redrawn once per round.
    """
    n_queries = 10  # how many samples each strategy asks the human to label
    n_classes = 5

    trn_ds, tst_ds, ds = split_train_test(n_classes)
    trn_ds2 = copy.deepcopy(trn_ds)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)

    model = LogisticRegression()

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    # Baseline error of each pool before any query has been answered.
    errors = []
    for pool in (trn_ds, trn_ds2):
        model.train(pool)
        errors.append(np.append([], 1 - model.score(tst_ds)))

    xs = np.arange(0, 1)
    line_qs, = ax.plot(xs, errors[0], 'g', label='qs Eout')
    line_rnd, = ax.plot(xs, errors[1], 'k', label='random Eout')
    curves = (line_qs, line_rnd)
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show(block=False)

    # Second subplot, shrunk slightly and shifted down.
    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position(
        [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9])

    # Label names are the class numbers 0 .. n_classes-1 as strings.
    lbr = InteractiveLabeler(label_name=list(map(str, range(n_classes))))

    arms = (
        (qs, trn_ds, "asking sample from Uncertainty Sampling"),
        (qs2, trn_ds2, "asking sample from Random Sample"),
    )
    for round_no in range(n_queries):
        for k, (strategy, pool, notice) in enumerate(arms):
            ask_id = strategy.make_query()
            print(notice)
            # The feature vector is shown to the labeler as an 8x8 image.
            lb = lbr.label(pool.data[ask_id][0].reshape(8, 8))
            pool.update(ask_id, lb)
            model.train(pool)
            errors[k] = np.append(errors[k], 1 - model.score(tst_ds))

        # Stretch the axes to the new data and extend both curves.
        ax.set_xlim((0, round_no + 1))
        ax.set_ylim((0, max(map(max, errors)) + 0.2))
        xs = np.arange(0, round_no + 2)
        for curve, history in zip(curves, errors):
            curve.set_xdata(xs)
            curve.set_ydata(history)

        plt.draw()

    input("Press any key to continue...")
예제 #9
0
def main(args):
    """Simulate active learning with an LSTM model and uncertainty sampling.

    Loads the texts for ``args.dataset``, builds (or loads from a pickle
    cache) the embedding layer, then runs ``args.quota`` queries. After each
    query the model is retrained on the pool and its predictions for the
    still-unlabeled entries are stored in a new column of the result frame
    (``-1`` for entries that are already labeled). The frame is written to
    ``sr_lstm_active<T>.csv`` under ``ACTIVE_DIR/<dataset>``.
    """
    pickle_file_path = os.path.join(TEMP_DATA_DIR,
                                    args.dataset + '_pickle.pickle')

    seed = 2018 * args.T
    texts, lbls = (load_ptsd_data() if args.dataset == 'ptsd'
                   else load_drug_data(args.dataset))

    # Turn the texts and their labels into sequences.
    textManager = TextManager()
    data, labels, word_index = textManager.sequence_maker(texts, lbls)
    max_num_words = textManager.max_num_words
    max_sequence_length = textManager.max_sequence_length

    prelabeled_index = select_prelabeled(labels, args.init_included_papers,
                                         seed)
    # e.g. [1, 2, 3, 4, 5, 218, 260, 466, 532, 564]
    print('prelabeled_index', prelabeled_index)
    pool, pool_ideal = make_pool(data, labels, prelabeled=prelabeled_index)

    # The embedding layer is cached on disk; build it only on a cache miss.
    if not os.path.isfile(pickle_file_path):
        if not os.path.exists(TEMP_DATA_DIR):
            os.makedirs(TEMP_DATA_DIR)

        embedding = Word2VecEmbedding(word_index, max_num_words,
                                      max_sequence_length)
        embedding.load_word2vec_data(GLOVE_PATH)
        embedding_layer = embedding.build_embedding()
        dump_pickle(embedding_layer, pickle_file_path)
    else:
        embedding_layer = load_pickle(pickle_file_path)

    # Only the LSTM model is supported.
    if args.model.lower() != 'lstm':
        raise ValueError('Model not found.')
    deep_model = LSTM_Libact
    kwargs_model = dict(
        backwards=True,
        dropout=0.4,
        optimizer='rmsprop',
        max_sequence_length=max_sequence_length,
        embedding_layer=embedding_layer,
    )

    model = deep_model(**kwargs_model)

    # Query strategy, see
    # https://libact.readthedocs.io/en/latest/libact.query_strategies.html
    # Least confidence ('lc') queries the instance whose posterior
    # probability of being positive is nearest 0.5 (binary case); smallest
    # margin ('sm') queries the instance with the smallest gap between the
    # two most probable labels. The strategy gets its own model instance.
    # Todo: check if 'lc' works correctly/ add random as well
    qs = UncertaintySampling(pool,
                             method='lc',
                             model=deep_model(**kwargs_model))

    # Labels are named "0" and "1".
    lbr = (InteractivePaperLabeler(label_name=["0", "1"])
           if args.interactive else IdealLabeler(dataset=pool_ideal))

    result_df = pd.DataFrame({'label': [x[1] for x in pool_ideal.data]})

    # Todo: add multiple papers to labeled dataset with size of batch_size
    for query_i in range(1, args.quota + 1):
        print("Asking sample from pool with Uncertainty Sampling")

        ask_id = qs.make_query()
        print("Index {} returned. True label is {}.".format(
            ask_id, pool_ideal.data[ask_id][1]))

        # Fetch the paper, have it labeled and store the answer in the pool.
        lb = lbr.label(pool.data[ask_id][0])
        pool.update(ask_id, lb)

        # Retrain on the enlarged labeled set.
        model.train(pool)

        # Each unlabeled entry is used as (index, features).
        unlabeled = pool.get_unlabeled_entries()
        idx = [entry[0] for entry in unlabeled]
        features = [entry[1] for entry in unlabeled]
        pred = model.predict(features)

        # One column per query, named after the query number; column 1 of
        # the prediction is stored for the unlabeled rows, -1 elsewhere.
        c_name = str(query_i)
        result_df[c_name] = -1
        result_df.loc[idx, c_name] = pred[:, 1]

    # Write the collected predictions to disk.
    output_dir = os.path.join(ACTIVE_DIR, args.dataset)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    export_path = os.path.join(output_dir,
                               'sr_lstm_active{}.csv'.format(args.T))

    result_df.to_csv(export_path)
    input("Press any key to continue...")
def main(args):
    """Simulate active learning with a scikit-learn model on bag-of-words.

    The texts are turned into word counts, a pool with ten prelabeled
    entries is built, and ``args.quota`` uncertainty-sampling queries are
    run. After every query the model is retrained, its accuracy over the
    whole pool is appended to ``acc_pool``, and the confusion matrix and
    recall on the still-unlabeled entries are printed. The accuracy curve is
    redrawn after every query in interactive mode and once at the end
    otherwise.

    Raises
    ------
    ValueError
        If ``args.model`` is not one of the supported model names.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from libact.models import SklearnProbaAdapter

    from sklearn.naive_bayes import MultinomialNB
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression

    acc_pool = []

    # Get the texts and their corresponding labels.
    texts, labels = load_ptsd_data()

    # Bag-of-words counts. A TF-IDF re-weighting step used to follow here
    # behind an always-false `if 0:` guard; the dead branch was removed.
    count_vect = CountVectorizer(max_features=5000, stop_words='english')
    features = count_vect.fit_transform(texts).todense().tolist()

    pool, pool_ideal = make_pool(
        features, labels,
        prelabeled=[1, 2, 3, 4, 5, 218, 260, 466, 532, 564]
    )

    # Pick the model class and its keyword arguments.
    model_name = args.model.lower()
    if model_name in ['multinomialnb', 'nb']:
        sklearn_model = MultinomialNB
        kwargs_model = {}
    elif model_name == 'svc':
        sklearn_model = SVC
        kwargs_model = {
            'probability': True,
            # 'class_weight': {0: 1, 1: 100}
            'class_weight': 'balanced'
        }
    elif model_name == 'logisticregression':
        sklearn_model = LogisticRegression
        kwargs_model = {}
    else:
        raise ValueError('Model not found.')

    # Initialize the evaluated model through the adapter.
    model = SklearnProbaAdapter(sklearn_model(**kwargs_model))

    # Query strategy, see
    # https://libact.readthedocs.io/en/latest/libact.query_strategies.html
    # Least confidence ('lc') queries the instance whose posterior
    # probability of being positive is nearest 0.5 (binary case); smallest
    # margin ('sm') queries the instance with the smallest gap between the
    # two most probable labels. The strategy gets its own model instance,
    # separate from the one evaluated below.
    qs = UncertaintySampling(
        pool, method='lc',
        model=SklearnProbaAdapter(sklearn_model(**kwargs_model)))

    fig, ax = plt.subplots()
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Value')

    def pool_accuracy():
        # Accuracy of the wrapped estimator over every pool entry.
        return model._model.score([x[0] for x in pool.get_entries()], labels)

    def refresh_plot(history, n_done):
        # Fit the axes to the history and redraw the accuracy curve.
        ax.set_xlim((0, n_done))
        ax.set_ylim((0, max(history) + 0.2))
        p2.set_xdata(np.arange(0, n_done + 1))
        p2.set_ydata(history)
        plt.draw()

    # Train on the initial pool and record the starting accuracy.
    model.train(pool)
    acc_pool = np.append(acc_pool, pool_accuracy())

    # Initial plot with the single starting point.
    query_num = np.arange(0, 1)
    p2, = ax.plot(query_num, acc_pool, 'r', label='Accuracy')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True,
               shadow=True, ncol=5)
    plt.show(block=False)

    # Labels are named "0" and "1".
    if args.interactive:
        lbr = InteractivePaperLabeler(label_name=["0", "1"])
    else:
        lbr = IdealLabeler(dataset=pool_ideal)

    n_done = 0  # number of queries answered so far
    for query_i in range(1, args.quota + 1):

        # Make a query from the pool.
        print("Asking sample from pool with Uncertainty Sampling")
        ask_id = qs.make_query()
        print("Index {} returned. True label is {}.".format(
            ask_id, pool_ideal.data[ask_id][1]))

        # Get the paper, have it labeled and store the label in the pool.
        data_point = pool.data[ask_id][0]
        lb = lbr.label(data_point)
        pool.update(ask_id, lb)

        # Retrain and record the accuracy over the whole pool.
        model.train(pool)
        acc_pool = np.append(acc_pool, pool_accuracy())

        # Evaluate on the entries that are still unlabeled; each entry is
        # used as (index, features).
        idx_features = pool.get_unlabeled_entries()
        unlabeled_features = [x[1] for x in idx_features]
        idx = [x[0] for x in idx_features]
        pred = model.predict(unlabeled_features)

        print(confusion_matrix(labels[idx], pred))
        print(recall_score(labels[idx], pred))

        n_done = query_i
        if args.interactive:
            refresh_plot(acc_pool, n_done)

    if not args.interactive:
        # Non-interactive runs draw the full curve once at the end.
        refresh_plot(acc_pool, n_done)

    print(acc_pool)

    input("Press any key to continue...")
def main(args):
    """Interactively label digits and track reviewer, train and test accuracy.

    Runs ``args.quota`` uncertainty-sampling queries. Before the first query
    and after each answer, three accuracies are recorded and plotted:

    * reviewer: agreement between the labels stored in ``trn_ds`` and the
      true labels ``y_train`` on the labeled entries,
    * train: score of the underlying estimator on all training entries
      against ``y_train``,
    * test: ``model.score`` on ``tst_ds``.
    """
    acc_reviewer, acc_train, acc_test = [], [], []

    trn_ds, tst_ds, y_train = split_train_test()

    # Query strategy, see
    # https://libact.readthedocs.io/en/latest/libact.query_strategies.html
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    # The evaluated (passive) model; it is a separate instance from the one
    # given to the query strategy.
    model = LogisticRegression()

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    # All three curves are accuracies (the axis used to be labeled 'Error').
    ax.set_ylabel('Accuracy')

    def record_scores():
        # Retrain on the current labels and append one point to each history.
        model.train(trn_ds)
        oracle = y_train[get_indices_labeled_entries(trn_ds)]
        review = [label for feat, label in trn_ds.get_labeled_entries()]
        acc_reviewer.append(accuracy_score(oracle, review))
        acc_train.append(
            model.model.score([x[0] for x in trn_ds.get_entries()], y_train))
        acc_test.append(model.score(tst_ds))

    # Starting point before any query is answered.
    record_scores()

    query_num = np.arange(0, 1)
    p0, = ax.plot(query_num, acc_reviewer, 'g', label='Acc reviewer')
    # The train curve previously started from acc_reviewer by mistake.
    p1, = ax.plot(query_num, acc_train, 'b', label='Acc train')
    p2, = ax.plot(query_num, acc_test, 'r', label='Acc test')

    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show(block=False)

    # Second subplot, shrunk slightly and shifted down.
    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position(
        [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9])

    # Labels are named "0" and "1".
    lbr = InteractiveLabeler(label_name=["0", "1"])
    # lbr = InteractivePaperLabeler(label_name=["0", "1"])

    for i in range(args.quota):

        # Make a query from the pool.
        ask_id = qs.make_query()
        print("asking sample from Uncertainty Sampling")

        # The feature vector is shown to the labeler as an 8x8 image.
        data_point = trn_ds.data[ask_id][0].reshape(8, 8)
        lb = lbr.label(data_point)

        # Store the answer, then retrain and score.
        trn_ds.update(ask_id, lb)
        record_scores()

        # Fit the axes to all three curves so none of them is clipped
        # (the limit used to look at the test accuracy only).
        ax.set_xlim((0, i + 1))
        highest = max(max(acc_reviewer), max(acc_train), max(acc_test))
        ax.set_ylim((0, highest + 0.2))

        query_num = np.arange(0, i + 2)
        for curve, history in ((p0, acc_reviewer),
                               (p1, acc_train),
                               (p2, acc_test)):
            curve.set_xdata(query_num)
            curve.set_ydata(history)

        plt.draw()

    input("Press any key to continue...")