Example No. 1
    def uncertainty_values(self, data, target, X_train, y_train, X_full,
                           y_full, train_idx):
        print("START: ST")
        # initializing the active learner
        learner = ActiveLearner(estimator=RandomForestClassifier(),
                                query_strategy=margin_sampling,
                                X_training=X_train,
                                y_training=y_train)
        print('%f' % learner.score(X_full, y_full))
        index = 0
        # learning until the accuracy reaches a given threshold
        while learner.score(X_full, y_full) < 0.90:
            stream_idx = np.random.choice(range(len(X_full)))
            if classifier_uncertainty(learner, X_full[stream_idx].reshape(
                    1, -1)) >= 0.4:

                print("[ %1.3f, %1.3f]" %
                      (classifier_uncertainty(
                          learner, X_full[stream_idx].reshape(1, -1))[0],
                       classifier_margin(learner, X_full[stream_idx].reshape(
                           1, -1))[0]))

                learner.teach(X_full[stream_idx].reshape(1, -1),
                              y_full[stream_idx].reshape(-1, ))
                learner_score = learner.score(X_full, y_full)
                # print('Item no. %d queried, new accuracy: %f' % (stream_idx, learner_score))
                # print('%f' % (learner_score))
                if index == 50:
                    break
                index = index + 1
        print("START: ST")
Example No. 2
def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
                               X: Union[np.ndarray, sp.csr_matrix],
                               n_instances: int = 20,
                               metric: Union[str, Callable] = 'euclidean',
                               n_jobs: Optional[int] = None,
                               **uncertainty_measure_kwargs
                               ) -> Tuple[np.ndarray, Union[np.ndarray, sp.csr_matrix]]:
    """
    Batch sampling query strategy. Selects the least sure instances for labelling.

    This strategy differs from :func:`~modAL.uncertainty.uncertainty_sampling` because, although it is supported,
    traditional active learning query strategies suffer from sub-optimal record selection when passing
    `n_instances` > 1. This sampling strategy extends the interactive uncertainty query sampling by allowing for
    batch-mode uncertainty query sampling. Furthermore, it also enforces a ranking -- that is, which records among the
    batch are most important for labeling?

    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
        https://www.sciencedirect.com/science/article/pii/S0020025516313949

    Args:
        classifier: One of modAL's supported active learning models.
        X: Set of records to be considered for our active learning model.
        n_instances: Number of records to return for labeling from `X`.
        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`
        n_jobs: If not set, :func:`~sklearn.metrics.pairwise.pairwise_distances_argmin_min` is used for calculation of
            distances between samples. Otherwise it is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
        **uncertainty_measure_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.

    Returns:
        Indices of the instances from `X` chosen to be labelled; records from `X` chosen to be labelled.
    """
    uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs)
    query_indices = ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
                                 n_instances=n_instances, metric=metric, n_jobs=n_jobs)
    return query_indices, X[query_indices]
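# A hypothetical usage sketch (not part of the original example): plug the
# strategy into an ActiveLearner and query a ranked batch from a pool. The
# names X_initial, y_initial and X_pool are placeholders; in modAL this
# strategy ships as modAL.batch.uncertainty_batch_sampling.
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner

learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    query_strategy=uncertainty_batch_sampling,
    X_training=X_initial, y_training=y_initial,
)
query_idx, query_instances = learner.query(X_pool, n_instances=20)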
Example No. 3
    def al_stream(self, data, target, X_train, y_train, X_full, y_full,
                  train_idx):
        # initializing the active learner
        acc = []
        learner = ActiveLearner(estimator=RandomForestClassifier(),
                                query_strategy=margin_sampling,
                                X_training=X_train,
                                y_training=y_train)

        # print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))
        index = 0
        # learning until the accuracy reaches a given threshold
        while learner.score(X_full, y_full) < 0.90:
            stream_idx = np.random.choice(range(len(X_full)))
            if classifier_uncertainty(learner, X_full[stream_idx].reshape(
                    1, -1)) >= 0.2:
                learner.teach(X_full[stream_idx].reshape(1, -1),
                              y_full[stream_idx].reshape(-1, ))
                learner_score = learner.score(X_full, y_full)
                # print('Item no. %d queried, new accuracy: %f' % (stream_idx, learner_score))
                print('%0.3f' % learner_score, end=",")
                acc.append(learner_score)
                if index == self.query_number:
                    break
                index = index + 1
        return acc
def active_learn(df1, first_item_index_of_each_category):
    train_idx = first_item_index_of_each_category
    # X_train = iris['data'][train_idx]
    # y_train = iris['target'][train_idx]

    # full feature matrix and label vector
    data = df1.values[:, 1:]
    target = df1['label'].values

    X_full = df1.values[:, 1:]
    y_full = df1['label'].values

    # features start at the second column; the first column holds the label
    X_train = df1.values[:, 1:][train_idx]
    y_train = df1['label'].values[train_idx]

    # with plt.style.context('seaborn-white'):
    #     pca = PCA(n_components=2).fit_transform(data)
    #     plt.figure(figsize=(7, 7))
    #     plt.scatter(x=pca[:, 0], y=pca[:, 1], c=y_train, cmap='viridis', s=50)
    #     plt.title('The iris dataset')
    #     plt.show()

    # generating the pool
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)

    # initializing the active learner
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            query_strategy=entropy_sampling,
                            X_training=X_train,
                            y_training=y_train)

    # print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))
    print('%f' % learner.score(X_full, y_full))
    index = 0
    performance_array = []
    # learning until the accuracy reaches a given threshold
    while learner.score(X_full, y_full) < 0.90:
        stream_idx = np.random.choice(range(len(X_full)))
        if classifier_uncertainty(learner, X_full[stream_idx].reshape(
                1, -1)) >= 0.4:
            learner.teach(X_full[stream_idx].reshape(1, -1),
                          y_full[stream_idx].reshape(-1, ))
            learner_score = learner.score(X_full, y_full)
            # print('Item no. %d queried, new accuracy: %f' % (stream_idx, learner_score))
            print('%f' % (learner_score))
            if index == 505:
                break
            if (index % 100 == 0):
                performance_array.append(learner_score)
            index = index + 1
    percentage_increase(performance_array)

    # visualizing initial prediction
    # with plt.style.context('seaborn-white'):
    #     plt.figure(figsize=(7, 7))
    #     prediction = learner.predict(data)
    #     plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    #     plt.title('Initial accuracy: %f' % learner.score(data, target))
    #     plt.show()

    # pool-based sampling
    # n_queries = 502
    # performance_array = []
    # for idx in range(n_queries):
    #     query_idx, query_instance = learner.query(X_pool)
    #     learner.teach(
    #         X=X_pool[query_idx].reshape(1, -1),
    #         y=y_pool[query_idx].reshape(1, )
    #     )
    #     # remove queried instance from pool
    #     X_pool = np.delete(X_pool, query_idx, axis=0)
    #     y_pool = np.delete(y_pool, query_idx)
    #     learner_score = learner.score(data, target)
    #     print('Accuracy after query no. %d: %f' % (idx + 1, learner_score))
    #     if (idx % 100 == 0):
    #         performance_array.append(learner_score)
    #
    # percentage_increase(performance_array)

    # plotting final prediction
    # with plt.style.context('seaborn-white'):
    #     plt.figure(figsize=(7, 7))
    #     prediction = learner.predict(data)
    #     plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    #     plt.title(
    #         'Classification accuracy after %i queries: %f' % (n_queries, learner.score(data,target)))
    #     plt.show()
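# percentage_increase is not defined in these snippets; a hypothetical minimal
# version, consistent with how it is called above:
def percentage_increase(scores):
    # print the relative improvement between successive recorded accuracies
    for prev, curr in zip(scores, scores[1:]):
        print('%.2f%%' % (100.0 * (curr - prev) / prev))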
# initializing the active learner (reconstructed head of this snippet: its
# opening lines were lost, so the initialization is assumed to mirror
# Example No. 6 below; X_full, y_full, X_train, y_train, im_width and
# im_height are expected to come from an image prepared the same way)
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train, y_training=y_train
)
print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))

# visualizing initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict_proba(X_full)[:, 1]
    plt.imshow(prediction.reshape(im_width, im_height))
    plt.title('Initial prediction accuracy: %f' % learner.score(X_full, y_full))
    plt.show()

"""
The instances are randomly selected one by one, if an instance's uncertainty
is above a threshold, the label is requested and shown to the learner. The
process is continued until the learner reaches a previously defined accuracy.
"""

# learning until the accuracy reaches a given threshold
while learner.score(X_full, y_full) < 0.90:
    stream_idx = np.random.choice(range(len(X_full)))
    if classifier_uncertainty(learner, X_full[stream_idx].reshape(1, -1)) >= 0.4:
        learner.teach(X_full[stream_idx].reshape(1, -1), y_full[stream_idx].reshape(-1, ))
        print('Pixel no. %d queried, new accuracy: %f' % (stream_idx, learner.score(X_full, y_full)))

# visualizing final prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict_proba(X_full)[:, 1]
    plt.imshow(prediction.reshape(im_width, im_height))
    plt.title('Final prediction accuracy: %f' % learner.score(X_full, y_full))
    plt.show()
Example No. 6
# create the data to stream from
X_full = np.transpose(
    [np.tile(np.asarray(range(im.shape[0])), im.shape[1]),
     np.repeat(np.asarray(range(im.shape[1])), im.shape[0])]
)
# map the intensity values against the grid
y_full = np.asarray([im[P[0], P[1]] for P in X_full])

# assembling initial training set
n_initial = 5
initial_idx = np.random.choice(range(len(X_full)), size=n_initial, replace=False)
X_train, y_train = X_full[initial_idx], y_full[initial_idx]

# initialize the learner
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train, y_training=y_train
)

"""
The instances are randomly selected one by one, if an instance's uncertainty
is above a threshold, the label is requested and shown to the learner. The
process is continued until the learner reaches a previously defined accuracy.
"""

# learning until the accuracy reaches a given threshold
while learner.score(X_full, y_full) < 0.7:
    stream_idx = np.random.choice(range(len(X_full)))
    if classifier_uncertainty(learner, X_full[stream_idx].reshape(1, -1)) >= 0.4:
        learner.teach(X_full[stream_idx].reshape(1, -1), y_full[stream_idx].reshape(-1, ))
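# assumed follow-up (not in the original snippet): report the final accuracy
print('Final accuracy: %f' % learner.score(X_full, y_full))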
x = 40

queries = int((x / 100) * 150)
accuracy_list = []
accuracy_list.append(committee.score(X, Y))


idx = 0  # stream pointer into the unlabelled pool
print("Accuracy after", 0, "iterations :", committee.score(X, Y))

for i in range(0, queries):

    # scan the pool (wrapping around) until a sufficiently uncertain sample is found
    for _ in range(135):
        if (classifier_uncertainty(committee, X_unlab[idx].reshape(1, -1)) >=
                0.8):
            break
        idx = (idx + 1) % (X_unlab.shape[0])

    q_id = idx  # the sample that triggered the uncertainty threshold

    X_new = X_unlab[q_id].reshape(1, -1)
    Y_new = Y_unlab[q_id].reshape(1, )

    X_unlab = np.delete(X_unlab, q_id, axis=0)
    Y_unlab = np.delete(Y_unlab, q_id, axis=0)
    committee.teach(X_new, Y_new)
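    # assumed continuation (the original snippet appears truncated here):
    # record the committee's accuracy after each query, mirroring the
    # initial measurement above
    accuracy_list.append(committee.score(X, Y))
    print("Accuracy after", i + 1, "iterations :", committee.score(X, Y))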
# creating new utility measures by linear combination and product
# linear_combination will return 1.0*classifier_uncertainty + 1.0*classifier_margin
linear_combination = make_linear_combination(
    classifier_uncertainty, classifier_margin,
    weights=[1.0, 1.0]
)
# product will return (classifier_uncertainty**0.5)*(classifier_margin**0.1)
product = make_product(
    classifier_uncertainty, classifier_margin,
    exponents=[0.5, 0.1]
)

# visualizing the different utility metrics
with plt.style.context('seaborn-white'):
    utilities = [
        (1, classifier_uncertainty(learner, X), 'Classifier uncertainty'),
        (2, classifier_margin(learner, X), 'Classifier margin'),
        (3, linear_combination(learner, X), '1.0*uncertainty + 1.0*margin'),
        (4, product(learner, X), '(uncertainty**0.5)*(margin**0.1)')
    ]

    plt.figure(figsize=(18, 14))
    for idx, utility, title in utilities:
        plt.subplot(2, 2, idx)
        plt.scatter(x=X[:, 0], y=X[:, 1], c=utility, cmap='viridis', s=50)
        plt.title(title)
        plt.colorbar()

    plt.show()
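# A hypothetical extension (not in the original snippet): the combined utility
# can be turned into a full query strategy with modAL's multi_argmax selector.
from modAL.utils.selection import multi_argmax

def custom_query_strategy(classifier, X, n_instances=1):
    # score the pool with the combined utility and pick the top instances
    utility = linear_combination(classifier, X)
    query_idx = multi_argmax(utility, n_instances=n_instances)
    return query_idx, X[query_idx]

# the custom strategy can then be passed to an ActiveLearner as query_strategy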