def _train_using_static_data_set(sampler, data_set, evaluation_steps):
    x_train = data_set[:, :-1]
    y_train = data_set[:, -1]
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        query_strategy=samplers[sampler],
    )
    tmp_x_train, tmp_y_train = x_train.copy(), y_train.copy()
    queried_points = 0
    training_results = {"models": []}
    for step in trange(len(evaluation_steps), disable=disable_tqdm,
                       desc=f"{sampler}-{data_set_name}"):
        query_idx, query_inst = learner.query(
            tmp_x_train, n_instances=evaluation_steps[step] - queried_points)
        # ...obtaining new labels from the pool...
        learner.teach(query_inst, tmp_y_train[query_idx])
        queried_points = evaluation_steps[step]  # i.e. += evaluation_steps[step] - queried_points
        tmp_x_train = np.delete(tmp_x_train, query_idx, axis=0)
        tmp_y_train = np.delete(tmp_y_train, query_idx, axis=0)
        lfm = DecisionTreeClassifier().fit(learner.X_training, learner.y_training)
        training_results["models"].append(lfm)
    return training_results
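# The two _train_using_* helpers (this static variant and the dynamic variant
# later in this section) rely on module-level globals that are not shown.
# A minimal sketch of the assumed setup follows; the sampler names and values
# here are hypothetical, inferred from the snippets rather than taken from the
# original source.
import numpy as np
from modAL.models import ActiveLearner
from modAL.uncertainty import (entropy_sampling, margin_sampling,
                               uncertainty_sampling)
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from tqdm import trange

samplers = {
    "uncertainty": uncertainty_sampling,
    "margin": margin_sampling,
    "entropy": entropy_sampling,
}
disable_tqdm = False          # set True to silence the progress bars
data_set_name = "example"     # only used in the progress-bar label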
def al_pool_margin(self, data, target, X_train, y_train, X_full, y_full, train_idx):
    acc = []
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            query_strategy=margin_sampling,
                            X_training=X_train, y_training=y_train)
    n_queries = self.query_number  # n_queries = 1500
    for idx in range(n_queries):
        query_idx, query_instance = learner.query(X_pool)
        learner.teach(X=X_pool[query_idx].reshape(1, -1),
                      y=y_pool[query_idx].reshape(1, ))
        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        learner_score = learner.score(data, target)
        # print('Accuracy after query no. %d: %f' % (idx + 1, learner_score))
        precision, recall, fscore, support = self.performance_measure(
            learner, X_full, y_full)
        learner_score = fscore
        acc.append(learner_score)
        print('%0.3f' % (learner_score), end=",")
    return acc
def al_pool(self, data, target, X_train, y_train, X_full, y_full, train_idx):
    acc = []
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_train,
        y_training=y_train
    )
    n_queries = self.query_number  # n_queries = 1500
    for idx in range(n_queries):
        query_idx, query_instance = learner.query(X_pool)
        learner.teach(
            X=X_pool[query_idx].reshape(1, -1),
            y=y_pool[query_idx].reshape(1, )
        )
        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        learner_score = learner.score(data, target)
        # learner.estimator
        # print('Accuracy after query no. %d: %f' % (idx + 1, learner_score))
        X_train, X_test, y_train, y_test = train_test_split(X_full, y_full,
                                                            test_size=0.30)
        y_predict = learner.predict(X_test)
        # `score` is presumably sklearn's precision_recall_fscore_support imported under an alias
        precision, recall, fscore, support = score(y_test, y_predict)
        acc.append(learner_score)
        print('%0.3f' % (learner_score), end=",")
    return acc
def al_stream(self, data, target, X_train, y_train, X_full, y_full, train_idx):
    # initializing the active learner
    acc = []
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            query_strategy=margin_sampling,
                            X_training=X_train, y_training=y_train)
    # print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))
    index = 0
    # learning until the accuracy reaches a given threshold
    while learner.score(X_full, y_full) < 0.90:
        stream_idx = np.random.choice(range(len(X_full)))
        if classifier_uncertainty(learner,
                                  X_full[stream_idx].reshape(1, -1)) >= 0.2:
            learner.teach(X_full[stream_idx].reshape(1, -1),
                          y_full[stream_idx].reshape(-1, ))
            learner_score = learner.score(X_full, y_full)
            # print('Item no. %d queried, new accuracy: %f' % (stream_idx, learner_score))
            print('%0.3f' % (learner_score), end=",")
            if index == self.query_number:
                break
            index = index + 1
            acc.append(learner_score)
    return acc
def active_learning_procedure(query_strategy, test_X, test_y, pool_X, pool_y,
                              initial_X, initial_y, estimator, epochs=50,
                              batch_size=128, n_queries=100, n_instances=10,
                              verbose=0):
    learner = ActiveLearner(estimator=estimator,
                            X_training=initial_X,
                            y_training=initial_y,
                            query_strategy=query_strategy,
                            verbose=verbose)
    perf_hist = [learner.score(test_X, test_y, verbose=verbose)]
    for index in range(n_queries):
        query_idx, query_instance = learner.query(pool_X, n_instances)
        learner.teach(pool_X[query_idx], pool_y[query_idx], epochs=epochs,
                      batch_size=batch_size, verbose=verbose)
        pool_X = np.delete(pool_X, query_idx, axis=0)
        pool_y = np.delete(pool_y, query_idx, axis=0)
        model_accuracy = learner.score(test_X, test_y, verbose=0)
        print("accuracy after query {n}: {acc:0.4f}".format(n=index + 1,
                                                            acc=model_accuracy))
        perf_hist.append(model_accuracy)
    return perf_hist
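# active_learning_procedure forwards epochs/batch_size/verbose to the
# estimator's fit (and verbose to score), which suggests a Keras-style wrapper
# in the original context. The stand-in below is a hypothetical, minimal
# sklearn-compatible classifier that simply tolerates those keywords so the
# procedure can be exercised end to end; all names and data here are assumed.
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from modAL.uncertainty import uncertainty_sampling

class EpochAwareClassifier(BaseEstimator, ClassifierMixin):
    """Logistic regression that accepts (and ignores) Keras-style fit kwargs."""

    def __init__(self):
        self._clf = LogisticRegression(max_iter=1000)

    def fit(self, X, y, epochs=1, batch_size=32, verbose=0):
        self._clf.fit(X, y)  # epochs/batch_size are irrelevant for this model
        self.classes_ = self._clf.classes_
        return self

    def predict(self, X):
        return self._clf.predict(X)

    def predict_proba(self, X):
        return self._clf.predict_proba(X)

    def score(self, X, y, verbose=0):
        return self._clf.score(X, y)

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
y = (X[:, 0] + X[:, 1] > 0).astype(int)
history = active_learning_procedure(
    uncertainty_sampling, X[400:], y[400:], X[50:400], y[50:400],
    X[:50], y[:50], EpochAwareClassifier(), epochs=5, n_queries=10)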
def al_Loop(estimator, X_train, Y_train, X, Y, X_test, Y_test, indexs):
    learner = ActiveLearner(estimator=estimator, X_training=X_train,
                            y_training=Y_train)
    X_pool = np.delete(X, indexs, axis=0)
    Y_pool = np.delete(Y, indexs, axis=0)
    index = 0
    accuracy = 0
    while len(X_pool) > 0:
        query_index, _ = learner.query(X_pool)
        x, y = X_pool[query_index].reshape(1, -1), Y_pool[query_index].reshape(1, )
        learner.teach(X=x, y=y)
        X_pool, Y_pool = np.delete(X_pool, query_index, axis=0), \
            np.delete(Y_pool, query_index)
        model_accuracy = 1 - learner.score(X_pool, Y_pool)
        print('Error after query {n}: {acc:0.4f}'.format(n=index + 1,
                                                         acc=model_accuracy))
        accuracy = model_accuracy
        predicts = learner.predict(X_test)
        corrects = (predicts == Y_test)
        accs = (sum([1 if i else 0 for i in corrects]) / len(predicts))
        accs = 1 - accs
        print(accs)
        index += 1
    return learner
def uncertainty_values(self, data, target, X_train, y_train, X_full, y_full,
                       train_idx):
    print("START: ST")
    # initializing the active learner
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            query_strategy=margin_sampling,
                            X_training=X_train, y_training=y_train)
    print('%f' % learner.score(X_full, y_full))
    index = 0
    # learning until the accuracy reaches a given threshold
    while learner.score(X_full, y_full) < 0.90:
        stream_idx = np.random.choice(range(len(X_full)))
        if classifier_uncertainty(learner,
                                  X_full[stream_idx].reshape(1, -1)) >= 0.4:
            # print the (uncertainty, margin) pair for the queried instance
            print("[ %1.3f, %1.3f]" % (
                classifier_uncertainty(learner, X_full[stream_idx].reshape(1, -1))[0],
                classifier_margin(learner, X_full[stream_idx].reshape(1, -1))[0]))
            learner.teach(X_full[stream_idx].reshape(1, -1),
                          y_full[stream_idx].reshape(-1, ))
            learner_score = learner.score(X_full, y_full)
            # print('Item no. %d queried, new accuracy: %f' % (stream_idx, learner_score))
            # print('%f' % (learner_score))
            if index == 50:
                break
            index = index + 1
    print("END: ST")
def learn(self):
    # seeding: take the first example of each class
    classes = self.short_df['grades_round'].unique()
    seed_index = []
    for i in classes:
        seed_index.append(self.short_df['grades_round'][
            self.short_df['grades_round'] == i].index[0])

    act_data = self.short_df.copy()
    accuracy_list = []
    f1_total_list = []
    kappa_total_list = []

    # initialising
    train_idx = seed_index
    X_train = self.X[train_idx]
    y_train = self.Y[train_idx]

    # generating the pool
    X_pool = np.delete(self.X, train_idx, axis=0)
    y_pool = np.delete(self.Y, train_idx)
    act_data = act_data.drop(axis=0, index=train_idx)
    act_data.reset_index(drop=True, inplace=True)

    # initializing the active learner
    learner = ActiveLearner(estimator=self.model,
                            X_training=X_train,
                            y_training=y_train,
                            query_strategy=self.query_method)

    # pool-based sampling
    n_queries = int(len(self.X) / (100 / self.percent))
    for idx in range(n_queries):
        query_idx, query_instance = learner.query(X_pool)
        learner.teach(X=X_pool[query_idx].reshape(1, -1),
                      y=y_pool[query_idx].reshape(1, ))
        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        act_data = act_data.drop(axis=0, index=query_idx)
        act_data.reset_index(drop=True, inplace=True)
        accuracy_list.append(learner.score(X_pool, y_pool))
        model_pred = learner.predict(X_pool)
        f1_total_list.append(
            f1_score(y_pool, model_pred, average="weighted",
                     labels=np.unique(model_pred)))
        kappa_total_list.append(cohen_kappa_score(y_pool, model_pred))
        # print('Accuracy after query no. %d: %f' % (idx + 1, learner.score(X_pool, y_pool)))
        # print("By just labelling ", round(n_queries * 100.0 / len(self.X), 2),
        #       "% of total data accuracy of ", round(learner.score(X_pool, y_pool), 3),
        #       " % is achieved on the unseen data")
    return accuracy_list, f1_total_list, kappa_total_list
def _train_using_dynamic_data_set(sampler, data_set, evaluation_steps):
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        query_strategy=samplers[sampler],
    )
    queried_points = 0
    training_results = {"models": []}
    tmp_x = []
    tmp_y = []
    for data_index in trange(len(data_set), disable=disable_tqdm,
                             desc=f"{sampler}-{data_set_name}"):
        x_train = data_set[data_index][:, :-1]
        y_train = data_set[data_index][:, -1]
        query_idx, query_inst = learner.query(x_train, n_instances=1)
        tmp_x.append(query_inst)
        tmp_y.append(y_train[query_idx])
        queried_points += 1
        if data_index + 1 in evaluation_steps:
            learner.teach(np.array(tmp_x).reshape((len(tmp_x), -1)),
                          np.array(tmp_y).flatten())
            tmp_x = []
            tmp_y = []
            lfm = DecisionTreeClassifier().fit(learner.X_training,
                                               learner.y_training)
            training_results["models"].append(lfm)
    return training_results
def learn(self):
    # seeding: take the first example of each class
    classes = self.short_df['grades_round'].unique()
    seed_index = []
    for i in classes:
        seed_index.append(self.short_df['grades_round'][
            self.short_df['grades_round'] == i].index[0])

    act_data = self.short_df.copy()
    accuracy_list = []
    f1_total_list = []
    kappa_total_list = []

    # initialising
    train_idx = seed_index
    X_train = self.X[train_idx]
    y_train = self.Y[train_idx]

    # generating the pool
    X_pool = np.delete(self.X, train_idx, axis=0)
    y_pool = np.delete(self.Y, train_idx)
    act_data = act_data.drop(axis=0, index=train_idx)
    act_data.reset_index(drop=True, inplace=True)

    # initializing the random learner
    learner = ActiveLearner(
        estimator=self.model,
        X_training=X_train,
        y_training=y_train,
    )

    # pool-based sampling (random baseline)
    n_queries = int(len(self.X) / (100 / self.percent))
    for idx in range(n_queries):
        query_idx = np.random.choice(range(len(X_pool)))
        learner.teach(X=X_pool[query_idx].reshape(1, -1),
                      y=y_pool[query_idx].reshape(1, ))
        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        act_data = act_data.drop(axis=0, index=query_idx)
        act_data.reset_index(drop=True, inplace=True)
        accuracy_list.append(learner.score(X_pool, y_pool))
        model_pred = learner.predict(X_pool)
        f1_total_list.append(
            f1_score(y_pool, model_pred, average="weighted",
                     labels=np.unique(model_pred)))
        kappa_total_list.append(cohen_kappa_score(y_pool, model_pred))
    return accuracy_list, f1_total_list, kappa_total_list
def modAL_uncertainty(X, y, n_queries):
    modAL_learner = ActiveLearner(LogisticRegression(solver='liblinear',
                                                     n_jobs=1,
                                                     multi_class='ovr'),
                                  X_training=X[[0, 50, 100]],
                                  y_training=y[[0, 50, 100]])
    for _ in range(n_queries):
        query_idx, query_inst = modAL_learner.query(X)
        modAL_learner.teach(X[query_idx], y[query_idx])
def run(X_initial, y_initial, n_samples_for_initial, n_queries, estimator):
    np.random.seed(0)
    start_time = time.time()

    # Isolate our examples for our labeled dataset.
    n_labeled_examples = X_initial.shape[0]
    # high is exclusive; using n_labeled_examples + 1 would allow an out-of-range index
    training_indices = np.random.randint(low=0, high=n_labeled_examples,
                                         size=n_samples_for_initial)
    X_train = X_initial[training_indices, :]
    y_train = y_initial[training_indices]

    # Isolate the non-training examples we'll be querying.
    X_pool = delete_rows_csr(X_initial, training_indices)
    y_pool = np.delete(y_initial, training_indices)

    # Pre-set our batch sampling to retrieve 3 samples at a time.
    BATCH_SIZE = 3
    preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE)

    # Specify our active learning model.
    learner = ActiveLearner(
        estimator=estimator,
        X_training=X_train,
        y_training=y_train,
        query_strategy=preset_batch
    )
    initial_accuracy = learner.score(X_initial, y_initial)
    print("Initial Accuracy: ", initial_accuracy)
    performance_history = [initial_accuracy]

    f1_score = 0
    index = 0
    while f1_score < 0.65:
        index += 1
        query_index = np.random.choice(y_pool.shape[0], size=1, replace=False)

        # Teach our ActiveLearner model the randomly sampled record.
        X, y = X_pool[query_index, :], y_pool[query_index]
        learner.teach(X=X, y=y)

        # Remove the queried instance from the unlabeled pool.
        X_pool = delete_rows_csr(X_pool, query_index)
        y_pool = np.delete(y_pool, query_index)

        # Calculate and report our model's f1_score.
        y_pred = learner.predict(X_initial)
        f1_score = metrics.f1_score(y_initial, y_pred, average='micro')
        if index % 100 == 0:
            print('F1 score after {n} training samples: {f1:0.4f}'.format(
                n=index, f1=f1_score))

        # Save our model's performance for plotting.
        performance_history.append(f1_score)

    print("--- %s seconds ---" % (time.time() - start_time))
    return index
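# run() relies on a delete_rows_csr helper that is not shown. A plausible
# implementation (assumed, not taken from the original source) follows:
# scipy CSR matrices do not support np.delete, so rows are dropped with a
# boolean mask instead.
import numpy as np
import scipy.sparse as sp

def delete_rows_csr(mat, indices):
    """Return a copy of the CSR matrix `mat` with the given rows removed."""
    if not sp.isspmatrix_csr(mat):
        raise ValueError("works only for CSR format; use .tocsr() first")
    mask = np.ones(mat.shape[0], dtype=bool)
    mask[indices] = False
    return mat[mask]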
def modAL_EER(X, y, n_queries):
    modAL_learner = ActiveLearner(LogisticRegression(solver='liblinear',
                                                     n_jobs=1,
                                                     multi_class='ovr'),
                                  query_strategy=expected_error_reduction,
                                  X_training=X[[0, 50, 100]],
                                  y_training=y[[0, 50, 100]])
    for _ in range(n_queries):
        query_idx, query_inst = modAL_learner.query(X)
        modAL_learner.teach(X[query_idx], y[query_idx])
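# Imports assumed by the two benchmark helpers above, plus a hypothetical
# timing driver. The iris dataset is assumed because of the [0, 50, 100] seed
# indices (one instance per class); expected error reduction is far more
# expensive per query than uncertainty sampling, which is presumably what this
# comparison is meant to show.
from time import perf_counter
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from modAL.models import ActiveLearner
from modAL.expected_error import expected_error_reduction

X, y = load_iris(return_X_y=True)
for name, benchmark in [("uncertainty", modAL_uncertainty), ("EER", modAL_EER)]:
    start = perf_counter()
    benchmark(X, y, n_queries=10)
    print(f"{name}: {perf_counter() - start:.2f}s")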
def al_pool_proba(self, data, target, X_train, y_train, X_full, y_full,
                  train_idx, classifier, sampling_strategy, proba):
    acc = []
    pre = []
    rec = []
    fs = []
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)
    learner = ActiveLearner(
        estimator=classifier,
        query_strategy=sampling_strategy,
        X_training=X_train,
        y_training=y_train
    )
    n_queries = self.query_number  # n_queries = 1500
    for idx in range(n_queries):
        query_idx, query_instance = learner.query(X_pool)
        labeled_y = y_pool[query_idx].reshape(1, )
        # with probability `proba` (in percent), flip the queried label to
        # simulate a noisy oracle
        rand_int = randint(0, 100)
        if rand_int <= proba:
            if y_pool[query_idx][0] == 1:
                y_pool[query_idx][0] = 0
                labeled_y = np.array((0)).reshape(1, )
            else:
                y_pool[query_idx][0] = 1
                labeled_y = np.array((1)).reshape(1, )
        learner.teach(
            X=X_pool[query_idx].reshape(1, -1),
            y=labeled_y
        )
        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        learner_score = learner.score(data, target)
        # print('Accuracy after query no. %d: %f' % (idx + 1, learner_score))
        precision, recall, fscore, support, accuracy = self.performance_measure(
            learner, X_full, y_full)
        # learner_score = fscore
        acc.append(accuracy)
        pre.append(precision)
        rec.append(recall)
        fs.append(fscore)
        print('%0.3f' % (learner_score), end=",")
    return acc, pre, rec, fs
def al_pool(data, target, X_train, y_train, X_full, y_full, train_idx):
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            X_training=X_train[:200], y_training=y_train[:200])
    n_queries = 1500
    for idx in range(n_queries):
        query_idx, query_instance = learner.query(X_pool)
        learner.teach(X=X_pool[query_idx].reshape(1, -1),
                      y=y_pool[query_idx].reshape(1, ))
        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        learner_score = learner.score(data, target)
        # print('Accuracy after query no. %d: %f' % (idx + 1, learner_score))
        print('%0.3f' % (learner_score), end=",")
def active_learner(query_stra, N_query):
    knn = KNeighborsClassifier(n_neighbors=8)
    learner = ActiveLearner(estimator=knn,
                            X_training=X_train, y_training=y_train,
                            query_strategy=query_stra)
    predictions = learner.predict(X_test)
    X_pool = X_test.values
    y_pool = y_test.values
    for index in range(N_query):
        query_index, query_instance = learner.query(X_pool)
        X, y = X_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
        learner.teach(X=X, y=y)
        X_pool, y_pool = np.delete(X_pool, query_index, axis=0), \
            np.delete(y_pool, query_index)
        model_accuracy = learner.score(X_test, y_test)
        print('Accuracy: {acc:0.4f} \n'.format(acc=model_accuracy))
        performance_history.append(model_accuracy)
def createLearner(self, method, x, y, learner, first):
    # map each supported method name to its modAL query strategy
    strategies = {
        "Entropy Sampling": entropy_sampling,
        "Margin Sampling": margin_sampling,
        "Uncertainty Sampling": uncertainty_sampling,
        "Average Confidence": avg_confidence,
        "RDS": root_distance_based_selection_strategy,
        "MST-BE": disagree_labels_edges_idx_query_strategy,
    }
    if method in strategies:
        if first:
            learner = ActiveLearner(estimator=RandomForestClassifier(),
                                    query_strategy=strategies[method],
                                    X_training=x,
                                    y_training=y.astype(int))
        else:
            learner.teach(x, y.astype(int))
    else:
        learner = "none"
    return learner
def run_model(X, y, test_size, rep_times, n_queries, estimator, fd):
    performance_history = [[] for i in range(n_queries)]
    for i in range(rep_times):
        # print('exp:', i)
        # print('exp:', i, file=fd)
        n_labeled_examples = X.shape[0]
        X_trn_all, X_tst, y_trn_all, y_tst = train_test_split(
            X, y, test_size=test_size, stratify=y)
        # the first column stores the label; strip it from the features
        X_trn_all = X_trn_all[:, 1:]
        y_tst = X_tst[:, 0]
        X_tst = X_tst[:, 1:]
        y_tst = y_tst.astype('int32')
        X_trn_min, y_trn_min, X_trn, y_trn = get_init_train(
            X_trn_all, y_trn_all)
        # print('ground truth:', y_tst, file=fd)
        learner = ActiveLearner(estimator=estimator,
                                X_training=X_trn_min, y_training=y_trn_min)
        # prediction with no query
        predictions_0 = learner.predict(X_tst)
        err_0 = error_calculation(predictions_0, y_tst)
        for j in range(n_queries):
            query_index, query_instance = learner.query(X_trn)
            X_qry, y_qry = X_trn[query_index].reshape(1, -1), \
                y_trn[query_index].reshape(1, )
            learner.teach(X=X_qry, y=y_qry)
            X_trn, y_trn = np.delete(X_trn, query_index, axis=0), \
                np.delete(y_trn, query_index)
            predictions = learner.predict(X_tst)
            err = error_calculation(predictions, y_tst)
            performance_history[j].append(err)
    avg_err = []
    sd = []
    for i in range(n_queries):
        avg_err.append(np.mean(performance_history[i]))
        sd.append(np.std(performance_history[i]) / np.sqrt(rep_times))
    return avg_err, sd
def active_learn(df1, first_item_index_of_each_category):
    train_idx = first_item_index_of_each_category
    data = df1.values[:, 1:]
    target = df1['label'].values
    X_full = df1.values[:, 1:]
    y_full = df1['label'].values
    # items from the second column onward, since the first column is the label
    X_train = df1.values[:, 1:][train_idx]
    y_train = df1['label'].values[train_idx]
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)

    # accuracy of passive learners trained on growing prefixes of the data
    for i in range(1001, 1500):
        learner = ActiveLearner(estimator=RandomForestClassifier(),
                                X_training=X_train[:i], y_training=y_train[:i])
        print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))

    print("================================")
    print("================================")
    print("================================")
    print("================================")
    print("================================")

    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            X_training=X_train[:1001],
                            y_training=y_train[:1001])
    n_queries = 502
    performance_array = []
    for idx in range(n_queries):
        query_idx, query_instance = learner.query(X_pool)
        learner.teach(X=X_pool[query_idx].reshape(1, -1),
                      y=y_pool[query_idx].reshape(1, ))
        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        learner_score = learner.score(data, target)
        # print('Accuracy after query no. %d: %f' % (idx + 1, learner_score))
        print('%f' % (learner_score))
def run_exp_music(intup):
    global X_train, X_test, y_train, y_test
    rep, i, p = intup
    X_seed, X_pool = X_train[:n_seed], X_train[n_seed:]
    y_seed, y_pool = y_train[:n_seed], y_train[n_seed:]

    # Initializing the learner
    learner = ActiveLearner(
        estimator=RandomForestClassifier(n_estimators=10),
        query_strategy=entropy_sampling,
        X_training=X_seed,
        y_training=y_seed
    )

    # Run active learning and record the history of test accuracy
    history = np.zeros(query_budget - n_seed)
    for j in range(query_budget - n_seed):
        query_idx, query_inst = learner.query(X_pool)
        learner.teach(X_pool[query_idx], y_pool[query_idx])
        history[j] = learner.score(X_test, y_test)
    return history
def run_model(X, y, test_size, rep_times, n_queries, estimator, fd):
    performance_history = [[] for i in range(n_queries)]
    for i in range(rep_times):
        print('exp:', i)
        # print('exp:', i, file=fd)
        n_labeled_examples = X.shape[0]
        X_trn_all, X_tst, y_trn_all, y_tst = train_test_split(
            X, y, test_size=test_size, stratify=y)

        # get initial training set, whose size = n_class
        X_trn_min, y_trn_min, X_trn, y_trn = get_init_train(X_trn_all, y_trn_all)
        # print('ground truth:', y_tst, file=f_2)

        learner = ActiveLearner(estimator=estimator,
                                X_training=X_trn_min, y_training=y_trn_min)

        # prediction with no query
        predictions_0 = learner.predict(X_tst)
        err_0 = error_calculation(predictions_0, y_tst)
        # print('query no.', 0, file=f_2)
        # print('predictions:', predictions_0, file=f_2)
        # print('MSE:', err_0, file=f_2)

        for j in range(n_queries):
            query_index, query_instance = learner.query(X_trn)
            X_qry, y_qry = X_trn[query_index].reshape(1, -1), \
                y_trn[query_index].reshape(1, )
            learner.teach(X=X_qry, y=y_qry)
            X_trn, y_trn = np.delete(X_trn, query_index, axis=0), \
                np.delete(y_trn, query_index)
            predictions = learner.predict(X_tst)
            err = error_calculation(predictions, y_tst)
            # print('query no.', j + 1, file=f_2)
            # print('predictions:', predictions, file=f_2)
            # print('MSE:', err, file=f_2)
            performance_history[j].append(err)

    avg_err = []
    for i in range(n_queries):
        avg_err.append(np.mean(performance_history[i]))
    return avg_err
def run_exp(intup):
    global X_train, X_test, y_train, y_test
    rep, i, p = intup

    # Make noisy data, simulate pool-based case
    X_train_noisy = utils.add_gaussian_noise(X_train, p)
    y_train_noisy = y_train  # utils.flip_labels(y_train, p)
    X_seed, X_pool = X_train_noisy[:n_seed], X_train_noisy[n_seed:]
    y_seed, y_pool = y_train_noisy[:n_seed], y_train_noisy[n_seed:]

    # Initializing the learner
    learner = ActiveLearner(
        estimator=RandomForestClassifier(n_estimators=10),
        query_strategy=entropy_sampling,
        X_training=X_seed,
        y_training=y_seed
    )

    # Run active learning and record the history of test accuracy
    history = np.zeros(query_budget - n_seed)
    for j in range(query_budget - n_seed):
        query_idx, query_inst = learner.query(X_pool)
        learner.teach(X_pool[query_idx], y_pool[query_idx])
        history[j] = learner.score(X_test, y_test)
    return history
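# run_exp and run_exp_music read several globals that are defined elsewhere in
# the original experiment script. The values below are hypothetical, and
# add_gaussian_noise is sketched from its usage; the real utils helper is not
# shown here.
import numpy as np

n_seed = 10          # size of the initial labeled seed set (assumed)
query_budget = 100   # total labeled points after active learning (assumed)

def add_gaussian_noise(X, p, seed=0):
    """Add Gaussian feature noise whose magnitude scales with p (assumed semantics)."""
    rng = np.random.default_rng(seed)
    return X + rng.normal(scale=p, size=X.shape)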
import os
import pickle
from collections import defaultdict

import numpy as np
from modAL.models import ActiveLearner
from sklearn.metrics import confusion_matrix


class ActiveLearningClassifier:
    """Base machine learning classifier using active learning with the modAL package.

    Attributes:
        amine:              A string representing the amine that the Logistic Regression model is used for predictions.
        config:             A dictionary representing the hyper-parameters of the model.
        metrics:            A dictionary to store the performance metrics locally. It has the format of
                                {'metric_name': [metric_value]}.
        verbose:            A boolean representing whether it will print out additional information to the terminal
                                or not.
        stats_path:         A Path object representing the directory of the stats dictionary if we are not running
                                multi-processing.
        result_dict:        A dictionary representing the result dictionary used during multi-thread processing.
        classifier_name:    A string representing the name of the generic classifier.
        model_name:         A string representing the name of the specific model for future plotting.
        all_data:           A numpy array representing all the data from the dataset.
        all_labels:         A numpy array representing all the labels from the dataset.
        x_t:                A numpy array representing the training data used for model training.
        y_t:                A numpy array representing the training labels used for model training.
        x_v:                A numpy array representing the testing data used for active learning.
        y_v:                A numpy array representing the testing labels used for active learning.
        learner:            An ActiveLearner to conduct active learning with. See the modAL documentation for details.
    """

    def __init__(self, amine=None, config=None, verbose=True, stats_path=None,
                 result_dict=None, classifier_name='Base Classifier',
                 model_name='Base Classifier'):
        """Initialization of the class"""
        self.amine = amine
        self.config = config
        self.metrics = defaultdict(dict)
        self.verbose = verbose
        self.stats_path = stats_path
        self.result_dict = result_dict
        self.classifier_name = classifier_name
        self.model_name = model_name

    def load_dataset(self, set_id, x_t, y_t, x_v, y_v, all_data, all_labels):
        """Load the input training and validation data and labels into the model.

        Args:
            set_id:     An integer representing the id of the random draw that we are loading.
            x_t:        A 2-D numpy array representing the training data.
            y_t:        A 2-D numpy array representing the training labels.
            x_v:        A 2-D numpy array representing the validation data.
            y_v:        A 2-D numpy array representing the validation labels.
            all_data:   A 2-D numpy array representing all the data in the active learning pool.
            all_labels: A 2-D numpy array representing all the labels in the active learning pool.
        """
        self.draw_id = set_id
        self.metrics[self.draw_id] = defaultdict(list)
        self.x_t, self.y_t, self.x_v, self.y_v = x_t, y_t, x_v, y_v
        self.all_data = all_data
        self.all_labels = all_labels
        if self.verbose:
            print(f'The training data has dimension of {self.x_t.shape}.')
            print(f'The training labels has dimension of {self.y_t.shape}.')
            print(f'The testing data has dimension of {self.x_v.shape}.')
            print(f'The testing labels has dimension of {self.y_v.shape}.')

    def train(self, warning=True):
        """Train the model by setting up the ActiveLearner."""
        self.learner = ActiveLearner(estimator=self.model,
                                     X_training=self.x_t,
                                     y_training=self.y_t)
        # Evaluate zero-point performance
        self.evaluate(warning=warning)

    def active_learning(self, num_iter=None, warning=True):
        """The active learning loop.

        This loops around the underlying model to look for the most uncertain
        point and gives the model the label to train on.

        Args:
            num_iter:   An integer that is the number of iterations. Default = None.
            warning:    A boolean that decides whether to declare the zero-division warning or not. Default = True.
        """
        num_iter = num_iter if num_iter else self.x_v.shape[0]
        for _ in range(num_iter):
            # Query the most uncertain point from the active learning pool
            query_index, query_instance = self.learner.query(self.x_v)

            # Teach our ActiveLearner model the record it has requested.
            uncertain_data = self.x_v[query_index].reshape(1, -1)
            uncertain_label = self.y_v[query_index].reshape(1, )
            self.learner.teach(X=uncertain_data, y=uncertain_label)
            self.evaluate(warning=warning)

            # Move the queried instance from the unlabeled pool into the training set.
            self.x_t = np.append(self.x_t, uncertain_data).reshape(
                -1, self.all_data.shape[1])
            self.y_t = np.append(self.y_t, uncertain_label)
            self.x_v = np.delete(self.x_v, query_index, axis=0)
            self.y_v = np.delete(self.y_v, query_index)

    def evaluate(self, warning=True, store=True):
        """Evaluation of the model.

        Args:
            warning:    A boolean that decides whether to warn about the zero-division issue or not. Default = True.
            store:      A boolean that decides whether to store the performance metrics of the model. Default = True.
        """
        # Calculate and report our model's accuracy.
        accuracy = self.learner.score(self.all_data, self.all_labels)
        self.y_preds = self.learner.predict(self.all_data)
        cm = confusion_matrix(self.all_labels, self.y_preds)

        # To prevent a nan value for precision, we set it to 1 and send out a warning message
        if cm[1][1] + cm[0][1] != 0:
            precision = cm[1][1] / (cm[1][1] + cm[0][1])
        else:
            precision = 1.0
            if warning:
                print('WARNING: zero division during precision calculation')

        recall = cm[1][1] / (cm[1][1] + cm[1][0])
        true_negative = cm[0][0] / (cm[0][0] + cm[0][1])
        bcr = 0.5 * (recall + true_negative)

        if store:
            self.store_metrics_to_model(cm, accuracy, precision, recall, bcr)

    def store_metrics_to_model(self, cm, accuracy, precision, recall, bcr):
        """Store the performance metrics.

        The metrics are specifically the confusion matrices, accuracies, precisions, recalls and balanced
        classification rates.

        Args:
            cm:         A numpy array representing the confusion matrix given our predicted labels and the actual
                            corresponding labels. It's a 2x2 matrix for the drp_chem model.
            accuracy:   A float representing the accuracy rate of the model: the rate of correctly predicted
                            reactions out of all reactions.
            precision:  A float representing the precision rate of the model: the rate of the number of actually
                            successful reactions out of all the reactions predicted to be successful.
            recall:     A float representing the recall rate of the model: the rate of the number of reactions
                            predicted to be successful out of all the actual successful reactions.
            bcr:        A float representing the balanced classification rate of the model. It's the average value
                            of the recall rate and the true negative rate.
        """
        self.metrics[self.draw_id]['confusion_matrices'].append(cm)
        self.metrics[self.draw_id]['accuracies'].append(accuracy)
        self.metrics[self.draw_id]['precisions'].append(precision)
        self.metrics[self.draw_id]['recalls'].append(recall)
        self.metrics[self.draw_id]['bcrs'].append(bcr)
        if self.verbose:
            print(cm)
            print('accuracy for model is', accuracy)
            print('precision for model is', precision)
            print('recall for model is', recall)
            print('balanced classification rate for model is', bcr)

    def find_inner_avg(self):
        """Find the average across all random draws."""
        metric_names = ['accuracies', 'precisions', 'recalls', 'bcrs']
        rand_draws = list(self.metrics.keys())
        for metric in metric_names:
            lst_of_metrics = []
            for set_id in rand_draws:
                lst_of_metrics.append(self.metrics[set_id][metric])
            self.metrics['average'][metric] = list(
                np.average(lst_of_metrics, axis=0))
        lst_of_confusion_matrices = []
        for set_id in rand_draws:
            lst_of_confusion_matrices.append(
                self.metrics[set_id]['confusion_matrices'])
        self.metrics['average']['confusion_matrices'] = lst_of_confusion_matrices

    def store_metrics_to_file(self):
        """Store the metrics results to the model's parameters dictionary.

        Use the same logic of saving the metrics for each model. Dump the cross-validation statistics to a
        pickle file.
        """
        self.find_inner_avg()
        model = self.model_name

        # Check if we are running a multi-thread or a single-thread process
        if self.result_dict:
            # Store to the existing multi-processing dictionary
            stats_dict = self.result_dict
        else:
            # Store to a simple dictionary
            if self.stats_path.exists():
                with open(self.stats_path, "rb") as f:
                    stats_dict = pickle.load(f)
            else:
                stats_dict = {}

        if model not in stats_dict:
            stats_dict[model] = defaultdict(list)
        stats_dict[model]['amine'].append(self.amine)
        stats_dict[model]['accuracies'].append(
            self.metrics['average']['accuracies'])
        stats_dict[model]['confusion_matrices'].append(
            self.metrics['average']['confusion_matrices'])
        stats_dict[model]['precisions'].append(
            self.metrics['average']['precisions'])
        stats_dict[model]['recalls'].append(self.metrics['average']['recalls'])
        stats_dict[model]['bcrs'].append(self.metrics['average']['bcrs'])

        # Save this dictionary in case we need it later
        if not self.result_dict and self.stats_path:
            with open(self.stats_path, "wb") as f:
                pickle.dump(stats_dict, f)

    def save_model(self):
        """Save the data used to train, validate and test the model to the designated folder."""
        # Set up the main destination folder for the model
        dst_root = './data/{}/{}'.format(self.classifier_name, self.model_name)
        if not os.path.exists(dst_root):
            os.makedirs(dst_root)
            print(f'No folder for {self.classifier_name} model {self.model_name} storage found')
            print(f'Created folder at {dst_root} to store the model')

        # Dump the model into the designated folder
        file_name = "{0:s}_{1:s}.pkl".format(self.model_name, self.amine)
        with open(os.path.join(dst_root, file_name), "wb") as f:
            pickle.dump(self, f)
# initial training data: 100 random pixels
initial_idx = np.random.choice(range(len(X_pool)), size=100)

# initializing the learners
n_learners = 3
learner_list = []
for _ in range(n_learners):
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_pool[initial_idx],
        y_training=y_pool[initial_idx],
        bootstrap_init=True
    )
    learner_list.append(learner)

# assembling the Committee
committee = Committee(learner_list)

# ensemble active learner from the Committee
ensemble_learner = ActiveLearner(
    estimator=committee
)

query_idx, query_instance = ensemble_learner.query(X_pool)

# ...
# ... obtain label from the Oracle ...
# ...

ensemble_learner.teach(X_pool[query_idx], y_pool[query_idx], bootstrap=True)
        opinions.append(query_idx)
        learner_id += 1

    # pick one strategy's suggestion, weighted by past performance
    opt_idx = np.random.choice(range(len(opinions)), p=weights, size=1,
                               replace=True)[0]
    # print("selected strategy: ", opt_idx)
    strategy_count[opt_idx] += 1
    x.append(opt_idx)
    selected_idx = opinions[opt_idx]
    # print(opinions)
    print("selected Index: ", selected_idx)

    for learner in learners:
        if opt_idx == 1:
            learner.teach(X=X_pool[selected_idx], y=y_pool[selected_idx])
        else:
            learner.teach(X=X_pool[selected_idx].reshape(1, -1),
                          y=y_pool[selected_idx].reshape(1, ))

    X_pool = np.delete(X_pool, selected_idx, axis=0)
    y_pool = np.delete(y_pool, selected_idx)
    euc_density = np.delete(euc_density, selected_idx)
    print("loop: ", _)
    performance_history.append(learners[opt_idx].score(train_features,
                                                       train_labels))
    # if _ > 100:
    # reweight the chosen strategy by the change in accuracy
    del_acc = performance_history[-1] - performance_history[-2]
    weights[opt_idx] += del_acc
    if weights[opt_idx] < 0:
        weights[opt_idx] = 0
def active_learn(df1, first_item_index_of_each_category):
    train_idx = first_item_index_of_each_category
    # X_train = iris['data'][train_idx]
    # y_train = iris['target'][train_idx]

    # initial training data
    data = df1.values[:, 1:]
    target = df1['label'].values
    X_full = df1.values[:, 1:]
    y_full = df1['label'].values
    # items from the second column onward, since the first column is the label
    X_train = df1.values[:, 1:][train_idx]
    y_train = df1['label'].values[train_idx]

    # with plt.style.context('seaborn-white'):
    #     pca = PCA(n_components=2).fit_transform(data)
    #     plt.figure(figsize=(7, 7))
    #     plt.scatter(x=pca[:, 0], y=pca[:, 1], c=y_train, cmap='viridis', s=50)
    #     plt.title('The iris dataset')
    #     plt.show()

    # generating the pool
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)

    # initializing the active learner
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            query_strategy=entropy_sampling,
                            X_training=X_train, y_training=y_train)
    # print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))
    print('%f' % learner.score(X_full, y_full))

    index = 0
    performance_array = []
    # learning until the accuracy reaches a given threshold
    while learner.score(X_full, y_full) < 0.90:
        stream_idx = np.random.choice(range(len(X_full)))
        if classifier_uncertainty(learner,
                                  X_full[stream_idx].reshape(1, -1)) >= 0.4:
            learner.teach(X_full[stream_idx].reshape(1, -1),
                          y_full[stream_idx].reshape(-1, ))
            learner_score = learner.score(X_full, y_full)
            # print('Item no. %d queried, new accuracy: %f' % (stream_idx, learner_score))
            print('%f' % (learner_score))
            if index == 505:
                break
            if index % 100 == 0:
                performance_array.append(learner_score)
            index = index + 1
    percentage_increase(performance_array)

    # visualizing initial prediction
    # with plt.style.context('seaborn-white'):
    #     plt.figure(figsize=(7, 7))
    #     prediction = learner.predict(data)
    #     plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    #     plt.title('Initial accuracy: %f' % learner.score(data, target))
    #     plt.show()

    # pool-based sampling
    # n_queries = 502
    # performance_array = []
    # for idx in range(n_queries):
    #     query_idx, query_instance = learner.query(X_pool)
    #     learner.teach(
    #         X=X_pool[query_idx].reshape(1, -1),
    #         y=y_pool[query_idx].reshape(1, )
    #     )
    #     # remove queried instance from pool
    #     X_pool = np.delete(X_pool, query_idx, axis=0)
    #     y_pool = np.delete(y_pool, query_idx)
    #     learner_score = learner.score(data, target)
    #     print('Accuracy after query no. %d: %f' % (idx + 1, learner_score))
    #     if (idx % 100 == 0):
    #         performance_array.append(learner_score)
    #
    # percentage_increase(performance_array)

    # plotting final prediction
    # with plt.style.context('seaborn-white'):
    #     plt.figure(figsize=(7, 7))
    #     prediction = learner.predict(data)
    #     plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    #     plt.title(
    #         'Classification accuracy after %i queries: %f' % (n_queries, learner.score(data, target)))
    #     plt.show()

    y = 0
    X_training=X_initial.reshape(-1, 1),
    y_training=y_initial.reshape(-1, 1)
)

# plotting the initial estimation
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
    x = np.linspace(0, 20, 1000)
    pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
    plt.plot(x, pred)
    plt.fill_between(x, pred.reshape(-1, ) - std, pred.reshape(-1, ) + std,
                     alpha=0.2)
    plt.scatter(X, y, c='k')
    plt.title('Initial estimation based on %d points' % n_initial)
    plt.show()

# active learning
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))

# plotting after active learning
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
    x = np.linspace(0, 20, 1000)
    pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
    plt.plot(x, pred)
    plt.fill_between(x, pred.reshape(-1, ) - std, pred.reshape(-1, ) + std,
                     alpha=0.2)
    plt.scatter(X, y, c='k')
    plt.title('Estimation after %d queries' % n_queries)
    plt.show()
def activeLearning(method, X_train, Y_train, X_test, Y_test, K):
    iterations = 101
    random.seed(0)

    # Define initial label indices to train the classifier
    if method in ["RDS", "MST-BE"]:
        idx, root_idx, X_initial, Y_initial, X_pool, Y_pool = \
            activeLearningLib_Object.get_samples(
                X_train, Y_train,
                n_clusters=int(len(np.unique(Y_train)) * 2),
                strategy=method)
        labeled_idx = np.empty(0, int)
    else:
        idx = np.asarray(random.sample(range(0, len(X_train)), k=K))
        X_initial, Y_initial = X_train[idx], Y_train[idx]
        X_pool, Y_pool = np.delete(X_train, idx, axis=0), \
            np.delete(Y_train, idx, axis=0)

    # Initialize the active learning method
    t = time.time()
    if method == "Entropy Sampling":
        learner = ActiveLearner(estimator=SVC(probability=True),
                                query_strategy=entropy_sampling,
                                X_training=X_initial, y_training=Y_initial)
    elif method == "Margin Sampling":
        learner = ActiveLearner(estimator=SVC(probability=True),
                                query_strategy=margin_sampling,
                                X_training=X_initial, y_training=Y_initial)
    elif method == "Uncertainty Sampling":
        learner = ActiveLearner(estimator=SVC(probability=True),
                                query_strategy=uncertainty_sampling,
                                X_training=X_initial, y_training=Y_initial)
    elif method == "Average Confidence":
        learner = ActiveLearner(estimator=SVC(probability=True),
                                query_strategy=avg_confidence,
                                X_training=X_initial, y_training=Y_initial)
    elif method == "RDS":
        learner = ActiveLearner(
            estimator=SVC(probability=True),
            # estimator=SupervisedOPF(distance="log_squared_euclidean", pre_computed_distance=None),
            query_strategy=root_distance_based_selection_strategy,
            X_training=X_initial, y_training=Y_initial)
    elif method == "MST-BE":
        learner = ActiveLearner(
            estimator=SVC(probability=True),
            # estimator=SupervisedOPF(distance="log_squared_euclidean", pre_computed_distance=None),
            query_strategy=disagree_labels_edges_idx_query_strategy,
            X_training=X_initial, y_training=Y_initial)
    timeToTrain = time.time() - t

    results = []
    labeledData_X = X_initial
    labeledData_Y = Y_initial

    for run in range(iterations):
        if K > len(idx):
            break
        if method in ["RDS", "MST-BE"]:
            kwargs = dict()
            if K > len(idx):
                break
            kwargs = dict(idx=idx, labeled_idx=labeled_idx, y_root=Y_initial)
            t = time.time()
            query_idx, idx = learner.query(X_pool, n_instances=K, **kwargs)
            timeToSelect = time.time() - t
            if query_idx is None or len(query_idx) < K:
                break
            labeled_idx = np.append(labeled_idx, query_idx)
            predsCorrecteds = learner.predict(X_pool[query_idx])
            counter = 0
            for (x, y) in zip(predsCorrecteds, Y_pool[query_idx].flatten()):
                if x != y:
                    counter += 1
            t = time.time()
            learner.teach(X=X_pool[query_idx], y=Y_pool[query_idx])
            timeToTrain = time.time() - t
            labeledData_X = np.vstack((labeledData_X, X_pool[query_idx]))
            labeledData_Y = np.vstack((labeledData_Y, Y_pool[query_idx]))
            t = time.time()
            # model = SupervisedOPF(distance="log_squared_euclidean", pre_computed_distance=None)
            # trained_model = model.fit(labeledData_X, labeledData_Y.flatten().astype("int"))
            preds = learner.predict(X_test.values)
            timeToTest = time.time() - t
            acc = accuracy_score(Y_test, preds)
            f1score = f1_score(Y_test, preds, average='macro')
            precision = precision_score(Y_test, preds, average='macro')
            recall = recall_score(Y_test, preds, average='macro')
            knowClasses = len(set(preds.tolist()))
            print("Run {}: Acc: {}".format(run + 1, acc))
            print("Known Classes: {}".format(knowClasses))
            print("Corrected Labels: {}".format(counter))
            print("Time to Select: {}".format(timeToSelect))
        else:
            if run == 0:
                t = time.time()
                # model = SupervisedOPF(distance="log_squared_euclidean", pre_computed_distance=None)
                # trained_model = model.fit(labeledData_X, labeledData_Y.flatten().astype("int"))
                preds = learner.predict(X_test.values)
                timeToTest = time.time() - t
                acc = accuracy_score(Y_test, preds)
                f1score = f1_score(Y_test, preds, average='macro')
                precision = precision_score(Y_test, preds, average='macro')
                recall = recall_score(Y_test, preds, average='macro')
                knowClasses = len(set(preds.tolist()))
                counter = len(Y_initial)
                timeToSelect = 0
                print("Run {}: Acc: {}".format(run + 1, acc))
                print("Known Classes: {}".format(knowClasses))
                print("Corrected Labels: {}".format(counter))
                print("Time to Select: {}".format(timeToSelect))
            else:
                try:
                    t = time.time()
                    query_idx, idx = learner.query(X_pool, n_instances=K)
                    timeToSelect = time.time() - t
                except Exception:
                    timeToSelect = 0
                    print("query failed")
                    break
                predsCorrecteds = learner.predict(X_pool[query_idx])
                counter = 0
                for (x, y) in zip(predsCorrecteds, Y_pool[query_idx].flatten()):
                    if x != y:
                        counter += 1
                t = time.time()
                learner.teach(X=X_pool[query_idx], y=Y_pool[query_idx])
                # X_pool, Y_pool = np.delete(X_pool, query_idx, axis=0), np.delete(Y_pool, query_idx, axis=0)
                timeToTrain = time.time() - t
                # t = time.time()
                # preds = learner.predict(X_test)
                # timeToTest = time.time() - t
                labeledData_X = np.vstack((labeledData_X, X_pool[query_idx]))
                labeledData_Y = np.vstack((labeledData_Y, Y_pool[query_idx]))
                t = time.time()
                # model = SupervisedOPF(distance="log_squared_euclidean", pre_computed_distance=None)
                # trained_model = model.fit(labeledData_X, labeledData_Y.flatten().astype("int"))
                preds = learner.predict(X_test.values)
                X_pool, Y_pool = np.delete(X_pool, query_idx, axis=0), \
                    np.delete(Y_pool, query_idx, axis=0)
                timeToTest = time.time() - t
                acc = accuracy_score(Y_test, preds)
                f1score = f1_score(Y_test, preds, average='macro')
                precision = precision_score(Y_test, preds, average='macro')
                recall = recall_score(Y_test, preds, average='macro')
                knowClasses = len(set(preds.tolist()))
                print("Run {}: Acc: {}".format(run + 1, acc))
                print("Known Classes: {}".format(knowClasses))
                print("Corrected Labels: {}".format(counter))
                print("Time to Select: {}".format(timeToSelect))
        results.append([
            run + 1, K, np.round(timeToTrain, 2), np.round(timeToTest, 2),
            np.round(timeToSelect, 2), np.round(acc * 100, 2),
            np.round(f1score * 100, 2), np.round(precision * 100, 2),
            np.round(recall * 100, 2), knowClasses, counter
        ])

    results_df = pd.DataFrame(results, columns=[
        "iteration", "k-value", "time-to-train", "time-to-test",
        "time-to-select", "accuracy", "f1-score", "precision", "recall",
        "knowClasses", "correctedLabels"
    ])
    return results_df
    ax.set_title("ActiveLearner class predictions (Accuracy: {score:.3f})".format(
        score=unqueried_score))
    plt.show()
'''

N_QUERIES = 20
performance_history = [unqueried_score]

# Randomly sample points from the unlabeled pool (a random baseline in place
# of the query strategy).
for index in range(N_QUERIES):
    query_index = np.random.randint(low=0, high=len(X_pool), size=1)

    # Teach our ActiveLearner model the record it has requested.
    X, y = X_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
    learner.teach(X=X, y=y)

    # Remove the queried instance from the unlabeled pool.
    X_pool, y_pool = np.delete(X_pool, query_index, axis=0), \
        np.delete(y_pool, query_index)

    # Calculate and report our model's accuracy.
    model_accuracy = learner.score(X_raw, y_raw)
    print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1,
                                                        acc=model_accuracy))

    # Save our model's performance for plotting.
    performance_history.append(model_accuracy)

# Plot our performance over time.
fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130)
# visualizing initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict(iris['data'])
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Initial accuracy: %f'
              % learner.score(iris['data'], iris['target']))
    plt.show()

print('Accuracy before active learning: %f'
      % learner.score(iris['data'], iris['target']))

# pool-based sampling
n_queries = 20
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool)
    learner.teach(
        X=X_pool[query_idx].reshape(1, -1),
        y=y_pool[query_idx].reshape(1, )
    )
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
    print('Accuracy after query no. %d: %f'
          % (idx + 1, learner.score(iris['data'], iris['target'])))

# plotting final prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict(iris['data'])
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Classification accuracy after %i queries: %f'
              % (n_queries, learner.score(iris['data'], iris['target'])))
    plt.show()
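# The iris example above assumes roughly the following setup (reconstructed
# from the snippet; the seed indices and PCA projection are assumptions).
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner

iris = load_iris()
pca = PCA(n_components=2).fit_transform(iris['data'])
initial_idx = [0, 50, 100]  # one labeled example per class (assumed)
X_pool = np.delete(iris['data'], initial_idx, axis=0)
y_pool = np.delete(iris['target'], initial_idx)
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=iris['data'][initial_idx],
                        y_training=iris['target'][initial_idx])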
# create the data to stream from
X_full = np.transpose(
    [np.tile(np.asarray(range(im.shape[0])), im.shape[1]),
     np.repeat(np.asarray(range(im.shape[1])), im.shape[0])]
)
# map the intensity values against the grid
y_full = np.asarray([im[P[0], P[1]] for P in X_full])

# assembling initial training set
n_initial = 5
initial_idx = np.random.choice(range(len(X_full)), size=n_initial,
                               replace=False)
X_train, y_train = X_full[initial_idx], y_full[initial_idx]

# initialize the learner
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train,
    y_training=y_train
)

"""
The instances are randomly selected one by one. If an instance's uncertainty
is above a threshold, the label is requested and shown to the learner. The
process continues until the learner reaches a previously defined accuracy.
"""

# learning until the accuracy reaches a given threshold
while learner.score(X_full, y_full) < 0.7:
    stream_idx = np.random.choice(range(len(X_full)))
    if classifier_uncertainty(learner, X_full[stream_idx].reshape(1, -1)) >= 0.4:
        learner.teach(X_full[stream_idx].reshape(1, -1),
                      y_full[stream_idx].reshape(-1, ))
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from modAL.models import ActiveLearner
from modAL.disagreement import max_std_sampling

np.random.seed(0)

# generating the data
X = np.random.choice(np.linspace(0, 20, 10000), size=200,
                     replace=False).reshape(-1, 1)
y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)

# assembling initial training set
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_initial, y_initial = X[initial_idx], y[initial_idx]

# defining the kernel for the Gaussian process
kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \
    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))

# initializing the active learner
regressor = ActiveLearner(estimator=GaussianProcessRegressor(kernel=kernel),
                          query_strategy=max_std_sampling,
                          X_training=X_initial.reshape(-1, 1),
                          y_training=y_initial.reshape(-1, 1))

# active learning
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
initial_idx = [
    0,
    im_height - 1,
    im_height * (im_height - 1),
    -1,
    im_width // 2 + im_height // 2 * im_height
]
X_train, y_train = X_pool[initial_idx], y_pool[initial_idx]

# create an ActiveLearner instance
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=X_train, y_training=y_train)
initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(
    im_height, im_width)

n_queries = 100
for round_idx in range(n_queries):
    query_idx, query_inst = learner.query(X_pool)
    learner.teach(X_pool[query_idx].reshape(1, -1),
                  y_pool[query_idx].reshape(-1, ))
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)

final_prediction = learner.predict_proba(X_full)[:, 1].reshape(
    im_height, im_width)

# learning with randomly selected queries instead of active learning
random_idx = initial_idx + list(
    np.random.choice(range(len(X_full)), n_queries, replace=False))
X_train, y_train = X_full[random_idx], y_full[random_idx]
random_learner = ActiveLearner(estimator=RandomForestClassifier(),
                               X_training=X_train, y_training=y_train)

with plt.style.context('seaborn-white'):
# generate the pool
# remove the initial data from the training dataset
X_pool = np.delete(X_train, initial_idx, axis=0)
y_pool = np.delete(y_train, initial_idx, axis=0)

"""
Training the ActiveLearner
"""

# initialize ActiveLearner
learner = ActiveLearner(
    estimator=classifier,
    X_training=X_initial,
    y_training=y_initial,
    verbose=0
)

# the active learning loop
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool, n_instances=200,
                                              verbose=0)
    learner.teach(
        X=X_pool[query_idx],
        y=y_pool[query_idx],
        verbose=0
    )
    # remove queried instances from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx, axis=0)

# the final accuracy score
print(learner.score(X_test, y_test, verbose=0))