Example no. 1
class Review(ABC):
    """Base class for Systematic Review"""
    def __init__(self,
                 X,
                 y=None,
                 model=None,
                 query_strategy=None,
                 train_data_fn=full_sample,
                 n_instances=1,
                 n_queries=None,
                 prior_included=None,
                 prior_excluded=None,
                 log_file=None,
                 settings=None,
                 verbose=1):
        super(Review, self).__init__()

        self.X = X
        self.y = y
        self.model = model
        self.query_strategy = query_strategy
        self.train_data = train_data_fn

        self.n_instances = n_instances
        self.n_queries = n_queries
        self.log_file = log_file
        self.verbose = verbose

        # Normalize mutable defaults (None -> empty containers).
        self.prior_included = prior_included if prior_included is not None else []
        self.prior_excluded = prior_excluded if prior_excluded is not None else []

        settings = settings if settings is not None else {}
        self.fit_kwargs = settings.get('fit_kwargs', {})
        self.balance_kwargs = settings.get('balance_kwargs', {})
        self.query_kwargs = settings.get('query_kwargs', {})

        self._logger = Logger()

    @abstractmethod
    def _prior_knowledge(self):
        pass

    @abstractmethod
    def _classify(self, ind):
        """Classify the provided indices."""
        pass

    def _prior_teach(self):
        """Function called before training model."""

        pass

    def _stop_iter(self, query_i, pool):
        """Criteria for stopping iteration.

        Stop iterating if:
            - n_queries is reached
            - the pool is empty
        """

        stop_iter = False

        # if the pool is empty, always stop
        if len(pool) == 0:
            stop_iter = True

        # stop when the query limit is reached (if one is set)
        if self.n_queries is not None and query_i >= self.n_queries:
            stop_iter = True

        return stop_iter

    def review(self):

        # create the pool and training indices.
        n_samples = self.X.shape[0]
        pool_idx = np.arange(n_samples)

        # add prior knowledge
        init_idx, init_labels = self._prior_knowledge()
        self.y[init_idx] = init_labels

        # remove the initial sample from the pool
        pool_idx = np.delete(pool_idx, init_idx)

        # Initialize learner, but don't start training yet.
        self.learner = ActiveLearner(estimator=self.model,
                                     query_strategy=self.query_strategy)
        query_i = 0
        train_idx = init_idx.copy()
        query_idx = train_idx
        self._logger.add_labels(self.y)

        while not self._stop_iter(query_i - 1, pool_idx):
            self._logger.add_training_log(query_idx, self.y[query_idx])

            # Get the training data.
            X_train, y_train = self.train_data(self.X, self.y, train_idx,
                                               **self.balance_kwargs)
            #             validation_data(self.X[pool_idx], self.y[pool_idx],
            #                             self.fit_kwargs, ratio=1)

            # Train the model on the training data.
            self.learner.teach(X=X_train,
                               y=y_train,
                               only_new=True,
                               **self.fit_kwargs)

            # Make a query from the pool.
            query_idx, _ = self.learner.query(X=self.X,
                                              pool_idx=pool_idx,
                                              n_instances=min(
                                                  self.n_instances,
                                                  len(pool_idx)),
                                              query_kwargs=self.query_kwargs)

            # Log the probabilities of samples in the pool being included.
            pred_proba = self.query_kwargs.get('pred_proba', [])
            if len(pred_proba) == 0:
                pred_proba = self.learner.predict_proba(self.X[pool_idx])
            self._logger.add_proba(pool_idx, pred_proba)

            # Log the probabilities of samples that were trained.
            pred_proba_train = self.learner.predict_proba(self.X[train_idx])
            self._logger.add_proba(train_idx,
                                   pred_proba_train,
                                   logname="train_proba")

            # Classify the queried papers.
            self.y[query_idx] = self._classify(query_idx)
            self._logger.add_labels(self.y)

            # Update training/pool indices
            train_idx = np.append(train_idx, query_idx)
            pool_idx = np.delete(np.arange(n_samples), train_idx, axis=0)

            # update the query counter
            query_i += 1

            # Save the result to a file
            if self.log_file:
                self.save_logs(self.log_file)
                if self.verbose:
                    print(f"Saved results in log file: {self.log_file}")

    def save_logs(self, *args, **kwargs):
        """Save the logs to a file."""

        self._logger.save(*args, **kwargs)
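
Review is abstract, so it cannot be used directly. A minimal, hypothetical subclass sketch (the class name and the oracle logic are illustrative, not part of the original project) showing one way the two abstract hooks could be filled in:

class SimulatedReview(Review):
    """Hypothetical concrete Review: labels come from prior knowledge and an oracle."""

    def _prior_knowledge(self):
        # seed the labels with the papers known to be included/excluded
        init_idx = np.array(self.prior_included + self.prior_excluded, dtype=int)
        init_labels = np.array([1] * len(self.prior_included) +
                               [0] * len(self.prior_excluded))
        return init_idx, init_labels

    def _classify(self, ind):
        # a real reviewer would ask a human here; we simply ask on stdin
        return np.array([int(input(f"Include paper {i}? (0/1): ")) for i in ind])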
Example no. 2
import numpy as np

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

from modAL.models import ActiveLearner

np.random.seed(0)

# loading the iris dataset
iris = load_iris()

# initial training data
train_idx = [0, 50, 100]
X_train = iris['data'][train_idx]
y_train = iris['target'][train_idx]

# generating the pool
X_pool = np.delete(iris['data'], train_idx, axis=0)
y_pool = np.delete(iris['target'], train_idx)

# initializing the active learner
learner = ActiveLearner(estimator=KNeighborsClassifier(n_neighbors=3),
                        X_training=X_train,
                        y_training=y_train)

# pool-based sampling
n_queries = 20
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool)
    learner.teach(X=X_pool[query_idx].reshape(1, -1),
                  y=y_pool[query_idx].reshape(1, ))
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
Example no. 3
a3 = np.array(np.where(labels == 2))
a4 = np.array(np.where(labels == 3))
b1 = np.random.choice(a1[0, :], 20)
b2 = np.random.choice(a2[0, :], 20)
b3 = np.random.choice(a3[0, :], 20)
b4 = np.random.choice(a4[0, :], 20)
c1 = x[b1]
c2 = x[b2]
c3 = x[b3]
c4 = x[b4]
d1 = label[b1]
d2 = label[b2]
d3 = label[b3]
d4 = label[b4]
train_data = np.concatenate((c1, c2, c3, c4), axis=0)
train_label = np.concatenate((d1, d2, d3, d4), axis=0)
index = np.arange(len(train_data))
np.random.shuffle(index)
train_data, train_label = train_data[index], train_label[index]
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=train_data,
                        y_training=train_label)
unqueried_score = learner.score(x, label)
performance_history = [unqueried_score]

while learner.score(x, label) < 0.97:
    stream_idx = np.random.choice(range(len(x)))
    if classifier_uncertainty(learner, x[stream_idx].reshape(1, -1)) >= 0.4:
        # teach the instance whose uncertainty triggered the query
        learner.teach(x[stream_idx].reshape(1, -1), label[stream_idx].reshape(-1, ))
        new_score = learner.score(x, label)
        performance_history.append(new_score)
        print('Data no. %d queried, new accuracy: %f' % (stream_idx, new_score))
Example no. 4
# remove the initial data from the training dataset
X_pool = np.delete(X_train, initial_idx, axis=0)
names_pool = np.delete(name, initial_idx, axis=0)
y_pool = np.delete(y_train, initial_idx, axis=0)
#print(np.shape(X_pool), 'X_pool')
print(y_pool[:20], 'y_pool')

#### Active Learner

# QUERY strategy 1
# initialize ActiveLearner

if args.query_strategy == "uncertainty":
    learner = ActiveLearner(
        estimator=net,
        query_strategy=modAL.uncertainty.uncertainty_sampling,
        X_training=X_initial,
        y_training=y_initial,
    )

# QUERY strategy 2
#### Yet another query strategy ####
elif args.query_strategy == "margin":
    learner = ActiveLearner(
        estimator=net,
        query_strategy=modAL.uncertainty.margin_sampling,
        X_training=X_initial,
        y_training=y_initial,
    )

######################################################
# QUERY strategy 3
Example no. 5
    def al_rank(self,
                data,
                target,
                X_train,
                y_train,
                X_full,
                y_full,
                train_idx,
                N_RAW_SAMPLES=80):

        BATCH_SIZE = 3
        preset_batch = partial(uncertainty_batch_sampling,
                               n_instances=BATCH_SIZE)

        learner = ActiveLearner(estimator=RandomForestClassifier(),
                                X_training=X_train,
                                y_training=y_train,
                                query_strategy=preset_batch)

        # N_RAW_SAMPLES = 80
        N_QUERIES = N_RAW_SAMPLES // BATCH_SIZE
        unqueried_score = learner.score(X_full, y_full)
        performance_history = [unqueried_score]

        # Isolate our examples for our labeled dataset.
        n_labeled_examples = X_full.shape[0]
        # np.random.randint's `high` is exclusive, so use n_labeled_examples
        training_indices = np.random.randint(low=0,
                                             high=n_labeled_examples,
                                             size=3)

        X_train = X_full[training_indices]
        y_train = y_full[training_indices]

        # Isolate the non-training examples we'll be querying.
        X_pool = np.delete(X_full, training_indices, axis=0)
        y_pool = np.delete(y_full, training_indices, axis=0)

        acc = []
        for index in range(N_QUERIES):
            query_index, query_instance = learner.query(X_pool)

            # Teach our ActiveLearner model the record it has requested.
            X, y = X_pool[query_index], y_pool[query_index]
            learner.teach(X=X, y=y)

            # Remove the queried instance from the unlabeled pool.
            X_pool = np.delete(X_pool, query_index, axis=0)
            y_pool = np.delete(y_pool, query_index)

            # Calculate and report our model's accuracy.
            model_accuracy = learner.score(X_full, y_full)
            print('Accuracy after query {n}: {acc:0.4f}'.format(
                n=index + 1, acc=model_accuracy))
            acc.append(model_accuracy)
            # Save our model's performance for plotting.
            performance_history.append(model_accuracy)
        # acc = []
        # X_pool = np.delete(data, train_idx, axis=0)
        # y_pool = np.delete(target, train_idx)
        # learner = ActiveLearner(
        #     estimator=RandomForestClassifier(),
        #     X_training=X_train, y_training=y_train
        # )
        #
        # n_queries = self.query_number
        # # n_queries = 1500
        # for idx in range(n_queries):
        #     query_idx, query_instance = learner.query(X_pool)
        #     learner.teach(
        #         X=X_pool[query_idx].reshape(1, -1),
        #         y=y_pool[query_idx].reshape(1, )
        #     )
        #     # remove queried instance from pool
        #     X_pool = np.delete(X_pool, query_idx, axis=0)
        #     y_pool = np.delete(y_pool, query_idx)
        #     learner_score = learner.score(data, target)
        #     # print('Accuracy after query no. %d: %f' % (idx + 1, learner_wscore))
        #     acc.append(learner_score)
        #     print('%0.3f' % (learner_score), end=",")
        return acc
Example no. 6
class CustomActiveLearner(BaseModel):
    def __init__(self, X_train, y_train, X_test, y_test, epochs=10,
                 batch_size=128, lr=1e-3, n_initial=100, n_queries=100,
                 query_strategy=uncertainty_sampling, estimator=None):
        super().__init__(X_train, y_train, X_test, y_test, epochs, batch_size, lr)

        self.X_train = X_train
        self.y_train = y_train

        self.X_test = X_test
        self.y_test = y_test

        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr

        self.n_initial = n_initial
        self.n_queries = n_queries
        self.query_strategy = query_strategy

        initial_idx = np.random.choice(range(len(self.X_train)), size=self.n_initial, replace=False)
        self.__X_initial = self.X_train[initial_idx]
        self.__y_initial = self.y_train[initial_idx]
        
        self.__X_pool = np.delete(self.X_train, initial_idx, axis=0)
        self.__y_pool = np.delete(self.y_train, initial_idx, axis=0)

        self.learner = ActiveLearner(
            estimator=DL.LeNet(self.lr),
            query_strategy=self.query_strategy,
            X_training=self.__X_initial, y_training=self.__y_initial,
            verbose=1
        )

        BaseModel.estimator = self.learner        

    def train(self):
        performances = [self.evaluate(self.X_test, self.y_test)]
        for idx in range(self.n_queries):
            try:
                query_idx, query_instance = self.learner.query(self.__X_pool, verbose=0)
            except Exception:
                # stop querying once the pool is exhausted
                break

            placeholder = st.empty()
            with plt.style.context('seaborn-white'):
                plt.figure(figsize=(10, 5))
                plt.subplot(1, 2, 1)
                plt.title('Digit to label')
                plt.imshow(query_instance.reshape(8, 8))
                plt.subplot(1, 2, 2)
                plt.title('Accuracy of your model')
                plt.plot(range(idx+1), performances)
                plt.scatter(range(idx+1), performances)
                plt.xlabel('number of queries')
                plt.ylabel('accuracy')
                
                plt.savefig('../buf.png', format='png')

                with placeholder.container():
                    st.image('../buf.png', use_column_width=True)
                    time.sleep(0.5)
                plt.close()
            placeholder.empty()
            
            self.learner.teach(
                X=self.__X_pool[query_idx], y=self.__y_pool[query_idx], epochs=self.epochs, batch_size=self.batch_size, verbose=0
            )
            self.__X_pool = np.delete(self.__X_pool, query_idx, axis=0)
            self.__y_pool = np.delete(self.__y_pool, query_idx, axis=0)

            model_accuracy = self.evaluate(self.X_test, self.y_test)
            performances.append(model_accuracy)
                        
            # with st.beta_container():
            #     info = 'Accuracy after query {n}: {acc:0.4f}'.format(n=idx + 1, acc=model_accuracy)
            #     st.write(info)

        return performances

    def predict(self, X):
        y_prob = super().estimator.predict(X)
        y_classes = y_prob.argmax(axis=-1)

        return y_classes

    def evaluate(self, X_test, y_test):
        from sklearn.metrics import accuracy_score
        
        y_classes = self.predict(X_test)
        return accuracy_score(y_test.argmax(axis=-1), y_classes)
Example no. 7
X_initial = X_train[initial_idx]
y_initial = y_train[initial_idx]

# generate the pool
# remove the initial data from the training dataset
X_pool = np.delete(X_train, initial_idx, axis=0)
y_pool = np.delete(y_train, initial_idx, axis=0)

"""
Training the ActiveLearner
"""

# initialize ActiveLearner
learner = ActiveLearner(
    estimator=classifier,
    X_training=X_initial, y_training=y_initial,
    verbose=0
)

# the active learning loop
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool, n_instances=200, verbose=0)
    learner.teach(
        X=X_pool[query_idx], y=y_pool[query_idx],
        verbose=0
    )
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx, axis=0)
Example no. 8
def main():
    """
    Run an active learning experiment.

    Sample command:
    ```
    python training/run_modAL_experiment.py --al_epochs_init=10 --al_epochs_incr=5 --al_n_iter=10 --al_samples_per_iter=100 --data_class=DroughtWatch --model_class=ResnetClassifier --batch_size=64 --n_train_images=1000 --n_validation_images=1000 --pretrained=True --wandb
    ```
    """

    # generic setup steps from run_experiment
    # ---------------------------------------

    parser = _setup_parser()
    args = parser.parse_args()
    data_class = _import_class(f"active_learning.data.{args.data_class}")
    model_class = _import_class(f"active_learning.models.{args.model_class}")
    data = data_class(args)
    model = model_class(data_config=data.config(), args=args)

    if args.loss not in ("ctc", "transformer"):
        lit_model_class = lit_models.BaseLitModel

    if args.loss == "ctc":
        lit_model_class = lit_models.CTCLitModel

    if args.loss == "transformer":
        lit_model_class = lit_models.TransformerLitModel

    if args.load_checkpoint is not None:
        lit_model = lit_model_class.load_from_checkpoint(args.load_checkpoint, args=args, model=model)
    else:
        lit_model = lit_model_class(args=args, model=model)

    # modAL specific experiment setup
    # -------------------------------

    # initialize wandb with pytorch model
    if args.wandb:
        wandb.init(config=args)
        wandb.watch(model, log_freq=100)

    # evaluate query strategy from args parameter
    if args.al_query_strategy in ["uncertainty_sampling", "margin_sampling", "entropy_sampling"]:
        query_strategy = _import_class(f"modAL.uncertainty.{args.al_query_strategy}")
    else:
        query_strategy = _import_class(f"active_learning.sampling.{args.al_query_strategy}")

    # cpu vs. gpu: ignore --gpu args param, instead just set gpu based on availability
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # initialize train, validation and pool datasets
    data.setup()

    X_initial = np.moveaxis(
        data.data_train.data, 3, 1
    )  # shape change: (i, channels, h, w) instead of (i, h, w, channels)
    y_initial = data.data_train.targets
    if args.reduced_develop_train_size:
        print("NOTE: Reduced initial train set size for development activated")
        X_initial = X_initial[:100, :, :, :]
        y_initial = y_initial[:100]

    X_val = np.moveaxis(data.data_val.data, 3, 1)  # shape change
    y_val = data.data_val.targets
    X_pool = np.moveaxis(data.data_unlabelled.data, 3, 1)  # shape change
    y_pool = data.data_unlabelled.targets

    # initialize skorch classifier
    classifier = NeuralNetClassifier(
        model,
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=torch.optim.Adam,
        train_split=predefined_split(Dataset(X_val, y_val)),
        verbose=1,
        device=device,
    )

    lit_model.summarize(mode="full")

    # initialize modal active learner
    print("Initializing model with base training set")
    learner = ActiveLearner(
        estimator=classifier,
        X_training=X_initial,
        y_training=y_initial,
        epochs=args.al_epochs_init,
        query_strategy=query_strategy,
    )

    _log_skorch_history(
        history=learner.estimator.history,
        al_iter=0,
        epoch_start=0,
        train_acc=learner.score(learner.X_training, learner.y_training),
        train_size=len(learner.y_training),
        wandb_logging=args.wandb,
    )

    # active learning loop
    for idx in range(args.al_n_iter):

        print("Active learning query no. %d" % (idx + 1))
        query_idx, _ = learner.query(X_pool, n_instances=args.al_samples_per_iter)
        learner.teach(
            X=X_pool[query_idx], y=y_pool[query_idx], only_new=args.al_incr_onlynew, epochs=args.al_epochs_incr
        )

        _log_skorch_history(
            history=learner.estimator.history,
            al_iter=idx + 1,
            epoch_start=args.al_epochs_init + idx * args.al_epochs_incr,
            train_acc=learner.score(learner.X_training, learner.y_training),
            train_size=len(learner.y_training),
            wandb_logging=args.wandb,
        )

        # remove queried instances from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx, axis=0)
Example no. 9

# inspect the initial labels
y_initial


# ## Initialize learner



learner = ActiveLearner(
    estimator=svm.SVC(kernel='linear', gamma='scale', C=2, probability=True),
    query_strategy=uncertainty_sampling,
    X_training=X_initial, y_training=y_initial
)


# inspect the wrapped estimator
learner.estimator




# import pickle
# pickle.dump(learner.estimator, open('models/model0.sav','wb'))
Example no. 10
# create the data to stream from
X_full = np.transpose(
    [np.tile(np.asarray(range(im.shape[0])), im.shape[1]),
     np.repeat(np.asarray(range(im.shape[1])), im.shape[0])]
)
# map the intensity values against the grid
y_full = np.asarray([im[P[0], P[1]] for P in X_full])

# assembling initial training set
n_initial = 5
initial_idx = np.random.choice(range(len(X_full)), size=n_initial, replace=False)
X_train, y_train = X_full[initial_idx], y_full[initial_idx]

# initialize the learner
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train, y_training=y_train
)
print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))

# visualizing initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict_proba(X_full)[:, 1]
    plt.imshow(prediction.reshape(im_width, im_height))
    plt.title('Initial prediction accuracy: %f' % learner.score(X_full, y_full))
    plt.show()

"""
The instances are randomly selected one by one, if an instance's uncertainty
is above a threshold, the label is requested and shown to the learner. The
process is continued until the learner reaches a previously defined accuracy.
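
A minimal sketch of that stream-based loop, assuming the learner, X_full and y_full defined above; the 0.4 uncertainty threshold and the 0.90 target accuracy are illustrative choices:

from modAL.uncertainty import classifier_uncertainty

# stream-based sampling: ask for a label only when the learner is uncertain
while learner.score(X_full, y_full) < 0.90:
    stream_idx = np.random.choice(range(len(X_full)))
    x = X_full[stream_idx].reshape(1, -1)
    if classifier_uncertainty(learner, x) >= 0.4:
        learner.teach(x, y_full[stream_idx].reshape(-1, ))
        print('Pixel no. %d queried, new accuracy: %f'
              % (stream_idx, learner.score(X_full, y_full)))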
Example no. 11
    def __init__(self,
                 X,
                 y=None,
                 model=None,
                 query_strategy=max_sampling,
                 train_data_fn=full_sample,
                 n_instances=1,
                 n_queries=1,
                 prior_included=None,
                 prior_excluded=None,
                 log_file=None,
                 fit_kwargs=None,
                 balance_kwargs=None,
                 query_kwargs=None,
                 logger=None,
                 verbose=1):
        super(BaseReview, self).__init__()

        self.X = X
        self.y = y
        if y is None:
            self.y = np.full(X.shape[0], NOT_AVAILABLE)

        # Default to Naive Bayes model
        if model is None:
            print("Warning: using naive Bayes model as default."
                  "If you experience bad performance, read the documentation"
                  " in order to implement a RNN based solution.")
            from asreview.models import create_nb_model
            model = create_nb_model()

        self.model = model
        self.query_strategy = query_strategy
        self.train_data = train_data_fn

        self.n_instances = n_instances
        self.n_queries = n_queries
        self.log_file = log_file
        self.verbose = verbose

        # Normalize mutable defaults (None -> empty containers).
        self.prior_included = prior_included if prior_included is not None else []
        self.prior_excluded = prior_excluded if prior_excluded is not None else []

        self.fit_kwargs = fit_kwargs if fit_kwargs is not None else {}
        self.balance_kwargs = balance_kwargs if balance_kwargs is not None else {}
        self.query_kwargs = query_kwargs if query_kwargs is not None else {}

        self.query_i = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False

        self.query_kwargs["src_query_idx"] = {}

        if logger is None:
            self._logger = Logger()
            self.start_from_logger = False
        else:
            self._logger = logger
            self._prepare_with_logger()
            self.start_from_logger = True

        # Initialize learner, but don't start training yet.
        self.learner = ActiveLearner(estimator=self.model,
                                     query_strategy=self.query_strategy)
Example no. 12
X_full = np.transpose(
    [np.tile(np.asarray(range(data.shape[0])), data.shape[1]),
     np.repeat(np.asarray(range(data.shape[1])), data.shape[0])]
)
# map the intensity values against the grid
y_full = np.asarray([data[P[0], P[1]] for P in X_full])
X_pool = deepcopy(X_full)
y_pool = deepcopy(y_full)

# assembling initial training set
initial_idx = [0, im_height-1, im_height*(im_height-1), -1, im_width//2 + im_height//2*im_height]
X_train, y_train = X_pool[initial_idx], y_pool[initial_idx]

# create an ActiveLearner instance
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train, y_training=y_train
)
initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width)

n_queries = 100
for round_idx in range(n_queries):
    query_idx, query_inst = learner.query(X_pool)
    learner.teach(X_pool[query_idx].reshape(1, -1), y_pool[query_idx].reshape(-1, ))
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)

final_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width)

# learning with randomly selected queries instead of active learning
random_idx = initial_idx + list(np.random.choice(range(len(X_full)), n_queries, replace=False))
X_train, y_train = X_full[initial_idx], y_full[initial_idx]
Example no. 13
# initial training data: 100 random pixels
initial_idx = np.random.choice(range(len(X_pool)), size=100)

# initializing the learners
n_learners = 3
learner_list = []
for _ in range(n_learners):
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_pool[initial_idx], y_training=y_pool[initial_idx],
        bootstrap_init=True
    )
    learner_list.append(learner)

# assembling the Committee
committee = Committee(learner_list)

# ensemble active learner from the Committee
ensemble_learner = ActiveLearner(
    estimator=committee
)

query_idx, query_instance = ensemble_learner.query(X_pool)

# ...
# ... obtain label from the Oracle ...
# ...

ensemble_learner.teach(X_pool[query_idx], y_pool[query_idx], bootstrap=True)
Example no. 14
# loading the iris dataset
iris = load_iris()

# initial training data
train_idx = [0, 50, 100]
X_train = iris['data'][train_idx]
y_train = iris['target'][train_idx]

# generating the pool
X_pool = np.delete(iris['data'], train_idx, axis=0)
y_pool = np.delete(iris['target'], train_idx)

# initializing the active learner
learner = ActiveLearner(
    estimator=KNeighborsClassifier(n_neighbors=3),
    X_training=X_train, y_training=y_train
)

# pool-based sampling
n_queries = 20
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool)
    learner.teach(
        X=X_pool[query_idx].reshape(1, -1),
        y=y_pool[query_idx].reshape(1, )
    )
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
def ProactiveLearning(train_data, test_data, budget, cost_non_unif, cost_ratio,
                      class_num, mode):
    '''
    Create an active learner with a proactive query strategy and run it on the
    given data set.
    train_data: the training data, used for proactive learning
    test_data: the held-out test data, used to compute classification accuracy
    budget: the total price we are allowed to pay
    cost_non_unif: the "price list" of the variable-cost oracle
    cost_ratio: defined as mean(cost_non_unif) / cost_unif
    class_num: the number of classes in the data set (10 for this data)
    mode: 'proactive' for the proactive learning query strategy,
        'uniform' for querying only from the uniform-cost oracle,
        'random' for querying a random instance from a randomly selected oracle
    Return: a list of accuracies, a list of cumulative costs, and a list of
        queried oracles; each element corresponds to one iteration.
    '''

    # cost of uniform cost oracle
    cost_unif = np.mean(cost_non_unif) / cost_ratio

    # use SVM classifier
    clf = svm.SVC(gamma='scale',
                  decision_function_shape='ovo',
                  probability=True)
    # load the initial, free labeled data
    initial_labeled_data = np.load('./initial_labeled_sample.npy')
    L_X = initial_labeled_data[:, :-1]
    L_y = initial_labeled_data[:, -1].reshape(-1, 1)
    # create an active learner with proactive learning strategy
    learner = ActiveLearner(estimator=clf,
                            query_strategy=ProactiveQuery,
                            X_training=initial_labeled_data[:, :-1],
                            y_training=initial_labeled_data[:,
                                                            -1].reshape(-1, 1))

    # Initially, the unlabeled pool is the entire train data
    UL_X = train_data.loc[:, train_data.columns != 'Label'].values
    UL_y = train_data['Label'].values

    test_X = test_data.loc[:, test_data.columns != 'Label'].values
    test_y = test_data['Label'].values

    accuracy = []
    total_cost = [0]  # here a dummy cost of 0 is added for convenience
    oracle = []

    while total_cost[-1] < budget and UL_X.shape[0] != 0:

        # the active learning loop with the proactive query strategy
        x_star, k_star = learner.query(L_X, L_y, UL_X, UL_y, cost_non_unif,
                                       cost_unif, mode)
        learner.teach(UL_X[x_star:x_star + 1],
                      UL_y[x_star:x_star + 1].reshape(-1, 1))
        score = learner.score(test_X, test_y)
        accuracy.append(score)
        oracle.append(k_star)

        if (k_star == 1):
            total_cost.append(total_cost[-1] + cost_unif)
        else:  # query from the non-uniform-cost oracle
            total_cost.append(total_cost[-1] + cost_non_unif[x_star])

        #print(UL_X.shape[0], x_star, total_cost[-1])

        # Add x_star to L_X, L_Y
        L_X = np.append(L_X, UL_X[x_star:x_star + 1], axis=0)
        L_y = np.append(L_y, UL_y[x_star].reshape(-1, 1), axis=0)

        # Delete x_star from UL_X, UL_y and the non-uniform cost list
        # (cost_unif is a scalar, so there is nothing to delete from it)
        UL_X = np.delete(UL_X, x_star, 0)
        UL_y = np.delete(UL_y, x_star, 0)
        cost_non_unif = np.delete(cost_non_unif, x_star, 0)

    return accuracy, total_cost[1:], oracle
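
A hypothetical invocation; train_df, test_df (pandas DataFrames with a 'Label' column) and the per-instance price array prices are assumed to exist and are illustrative only:

accuracy, costs, oracles = ProactiveLearning(
    train_data=train_df,
    test_data=test_df,
    budget=100.0,
    cost_non_unif=prices,  # "price list" of the variable-cost oracle
    cost_ratio=2.0,        # mean(cost_non_unif) / cost_unif
    class_num=10,
    mode='proactive',
)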
Example no. 16
                                                              initial_idx,
                                                              axis=0)

with plt.style.context('seaborn-white'):
    plt.figure(figsize=(10, 10))
    plt.scatter(X[:, 0], X[:, 1], c='k', s=20)
    plt.scatter(X[y[:, 0] == 1, 0],
                X[y[:, 0] == 1, 1],
                facecolors='none',
                edgecolors='b',
                s=50,
                linewidths=2,
                label='class 1')
    plt.scatter(X[y[:, 1] == 1, 0],
                X[y[:, 1] == 1, 1],
                facecolors='none',
                edgecolors='r',
                s=100,
                linewidths=2,
                label='class 2')
    plt.legend()
    plt.show()

learner = ActiveLearner(estimator=OneVsRestClassifier(
    SVC(probability=True, gamma='auto')),
                        query_strategy=avg_score,
                        X_training=X_initial,
                        y_training=y_initial)

query_idx, query_inst = learner.query(X_pool)
learner.teach(X_pool[query_idx], y_pool[query_idx])
Example no. 17
def prepare_learner():

    estimator = RandomForestClassifier()
    preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE)
    learner = ActiveLearner(estimator=estimator, query_strategy=preset_batch)
    return learner
x_train = x[training_indices]
y_train = y[training_indices]

x_new = x[training_indices]
y_new = y[training_indices]

# Isolate the non-training examples we'll be querying.
x_pool = np.delete(x, training_indices, axis=0)
y_pool = np.delete(y, training_indices, axis=0)

#'''

classifier1 = RandomForestClassifier(n_estimators=50, n_jobs=-1, max_depth=50)
classifier2 = KNeighborsClassifier(n_neighbors=3)
learner = ActiveLearner(estimator=classifier1,
                        X_training=x_train,
                        y_training=y_train)

predictions = learner.predict(x)
is_correct = (predictions == y)
unqueried_score = learner.score(x, y)
print('Accuracy after first 1000 random rows: {acc:0.4f}%'.format(
    acc=unqueried_score * 100))
performance_history = [unqueried_score]

count = 1
while (float(performance_history[-1] * 100) < 90):
    queryList = []
    query_index, query_instance = learner.query(x_pool, n_instances=1000)
    training_indices = np.concatenate([training_indices, query_index])
    x_temp, y_temp = x_pool[query_index], y_pool[query_index]
Example no. 19
X_pool = np.transpose(
    [np.tile(np.asarray(range(data.shape[0])), data.shape[1]),
     np.repeat(np.asarray(range(data.shape[1])), data.shape[0])]
)
# map the intensity values against the grid
y_pool = np.asarray([data[P[0], P[1]] for P in X_pool])

# initial training data: 1000 random pixels
initial_idx = np.random.choice(range(len(X_pool)), size=1000)

# initializing the learners
n_learners = 3
learner_list = []
for _ in range(n_learners):
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=10),
        X_training=X_pool[initial_idx], y_training=y_pool[initial_idx],
        bootstrap_init=True
    )
    learner_list.append(learner)

# assembling the Committee
committee = Committee(learner_list)

# visualizing every learner in the Committee
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7*n_learners, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_learners, learner_idx+1)
        plt.imshow(learner.predict(X_pool).reshape(im_height, im_width))
        plt.title('Learner no. %d' % (learner_idx + 1))
    plt.show()
Example no. 20
X_pool = np.transpose([
    np.tile(np.asarray(range(data.shape[0])), data.shape[1]),
    np.repeat(np.asarray(range(data.shape[1])), data.shape[0])
])
# map the intensity values against the grid
y_pool = np.asarray([data[P[0], P[1]] for P in X_pool])

# initial training data: 1000 random pixels
initial_idx = np.random.choice(range(len(X_pool)), size=1000)

# initializing the learners
n_learners = 3
learner_list = []
for _ in range(n_learners):
    learner = ActiveLearner(estimator=KNeighborsClassifier(n_neighbors=10),
                            X_training=X_pool[initial_idx],
                            y_training=y_pool[initial_idx],
                            bootstrap_init=True)
    learner_list.append(learner)

# assembling the Committee
committee = Committee(learner_list)

# visualizing every learner in the Committee
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7 * n_learners, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_learners, learner_idx + 1)
        plt.imshow(learner.predict(X_pool).reshape(im_height, im_width))
        plt.title('Learner no. %d' % (learner_idx + 1))
    plt.show()
Example no. 21
                               size=n_initial,
                               replace=False)
X_initial = X_train[initial_idx]
y_initial = y_train[initial_idx]

# generate the pool
# remove the initial data from the training dataset
X_pool = np.delete(X_train, initial_idx, axis=0)
y_pool = np.delete(y_train, initial_idx, axis=0)
"""
Training the ActiveLearner
"""

# initialize ActiveLearner
learner = ActiveLearner(estimator=classifier,
                        X_training=X_initial,
                        y_training=y_initial,
                        verbose=1)

# the active learning loop
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool,
                                              n_instances=100,
                                              verbose=0)
    print(query_idx)
    learner.teach(X=X_pool[query_idx],
                  y=y_pool[query_idx],
                  only_new=True,
                  verbose=1)
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
Example no. 22
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query,
                   choose_query, choose_answer, unlabel_query, unlabel_answer,
                   rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer, rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    X_train, y_train = get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback, y_training=y_feedback
    )

    predict, sel_query, add_unlabel_feature = [], [], []
    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 100
        sel_idx, sel_label = [], []
        for idx in range(n_queries):
            # query_idx, query_instance = learner.query(X=X_train)
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            idx = int(query_idx/10)
            # print(idx, len(X_train))
            # print('uncertain', query_idx, X_train[query_idx], y_train[query_idx])
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )

            # add queried instance into FR
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[idx*10:idx*10+10])
            choose_feature.extend(unlabel_feature[idx*10:idx*10+10])
            # learner.teach(
            #     X=new_X_train.reshape(1, -1),
            #     y=new_y_train.reshape(1, )
            # )
            # print(unlabel_query[idx], unlabel_query[idx], rec_api_unlabel[idx*10:idx*10+10], rec_api_unlabel[idx*10:idx*10+10])

            # remove queried instance from pool
            for i in range(10):
                X_train = np.delete(X_train, idx*10, axis=0)
                y_train = np.delete(y_train, idx*10)
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[idx*10:idx*10+10]
            del unlabel_feature[idx*10:idx*10+10]
            if len(X_train) == 0:
                break

    add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    new_X_feedback, new_y_feedback = get_active_data(add_label_feedback_info, choose_feature)
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=new_X_feedback, y_training=new_y_feedback
    )
    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)

    X_test = np.array(X)
    # use the model trained on the feedback data to predict the test data
    for query_idx in range(400):
        y_pre = learner.predict_proba(X=X_test[query_idx].reshape(1, -1))
        predict.append(float(y_pre[0, 1]))
        # predict.append(math.log(float(y_pre[0, 1])+1))
        # predict.extend(y_pre.tolist())
        x = X_test[query_idx].reshape(1, -1)
    # print(predict)
    # print('new_choose', len(choose_query), len(choose_answer))
    # fw = open('../data/add_FR.csv', 'a+', newline='')
    # writer = csv.writer(fw)
    # for i, fr_q in enumerate(choose_query):
    #     writer.writerow((fr_q, choose_answer[i]))
    # fw.close()

    return predict, X, new_X_feedback, new_y_feedback #sorted(sel_query)
Example no. 23
    def al_qbc_proba(self, data, target, X_train, y_train, X_full, y_full,
                     train_idx, committee_strategy, proba):
        acc = []
        pre = []
        rec = []
        fs = []

        X_pool = deepcopy(X_full)
        y_pool = deepcopy(y_full)

        # initializing Committee members
        n_members = 2
        learner_list = list()

        # create a reduced copy of the data with the known instances removed
        # (done once, outside the member loop, so the pool shrinks only once)
        X_pool = np.delete(X_pool, train_idx, axis=0)
        y_pool = np.delete(y_pool, train_idx)

        for member_idx in range(n_members):
            # initial training data
            # n_initial = 5
            # train_idx = np.random.choice(range(X_pool.shape[0]), size=n_initial, replace=False)
            # X_train = X_pool[train_idx]
            # y_train = y_pool[train_idx]

            # initializing learner
            learner = ActiveLearner(
                estimator=RandomForestClassifier(),
                # query_strategy=vote_entropy_sampling,
                X_training=X_train,
                y_training=y_train)
            learner_list.append(learner)

        # assembling the committee
        committee = Committee(learner_list=learner_list,
                              query_strategy=committee_strategy)

        # print('Committee initial predictions, accuracy = %1.3f' % committee.score(data, target))
        # print('%1.3f' % committee.score(data, target))

        n_queries = self.query_number
        for idx in range(n_queries):
            query_idx, query_instance = committee.query(X_pool)

            labeled_y = y_pool[query_idx].reshape(1, )

            rand_int = randint(0, 100)
            if (rand_int <= proba):
                if (y_pool[query_idx][0] == 1):
                    y_pool[query_idx][0] = 0
                    labeled_y = np.array((0)).reshape(1, )
                else:
                    y_pool[query_idx][0] = 1
                    labeled_y = np.array((1)).reshape(1, )

            # learner.teach(
            #     X=X_pool[query_idx].reshape(1, -1),
            #     y=labeled_y
            # )

            committee.teach(X=X_pool[query_idx].reshape(1, -1), y=labeled_y)
            # remove queried instance from pool
            X_pool = np.delete(X_pool, query_idx, axis=0)
            y_pool = np.delete(y_pool, query_idx)
            # learner_score = committee.score(data, target)
            # print('Committee %d th query predictions, accuracy = %1.3f' % (idx , learner_score))
            precision, recall, fscore, support, accuracy = self.performance_measure(
                committee, X_full, y_full)
            learner_score = accuracy
            acc.append(learner_score)
            pre.append(precision)
            rec.append(recall)
            fs.append(fscore)
            print('%0.3f' % (learner_score), end=",")
        return acc, pre, rec, fs
Example no. 24
                                     size=int(n_labeled_examples * percent))

X_train = X_raw[training_indices]
y_train = y_raw[training_indices]

# Isolate the non-training examples we'll be querying.
X_pool = np.delete(X_raw, training_indices, axis=0)
y_pool = np.delete(y_raw, training_indices, axis=0)

from sklearn.neighbors import KNeighborsClassifier
from modAL.models import ActiveLearner

# Specify our core estimator along with its active learning model
# (note: the knn below is created but unused; the learner wraps a random forest).
knn = KNeighborsClassifier(n_neighbors=3)
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        query_strategy=uncertainty_sampling,
                        X_training=X_train,
                        y_training=y_train)

# Isolate the data we'll need for plotting.
predictions = learner.predict(X_raw)
is_correct = (predictions == y_raw)

# Record our learner's score on the raw data.
unqueried_score = learner.score(X_raw, y_raw)

# Plot our classification results.
'''
fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130)
ax.scatter(x=x_component[is_correct],  y=y_component[is_correct],  c='g', marker='+', label='Correct',   alpha=8/10)
ax.scatter(x=x_component[~is_correct], y=y_component[~is_correct], c='r', marker='x', label='Incorrect', alpha=8/10)
ax.legend(loc='lower right')
Example no. 25
def mnist_cnn(nr_of_labeled_examples=60000, verbose=0):
    assert (nr_of_labeled_examples >= 100 and nr_of_labeled_examples <= 60000 and nr_of_labeled_examples % 10 == 0), \
        "Number of labeled examples must be between 100 and 60000 and be divisible by 10"

    batch_size = 128
    epochs = 100

    model_path = 'best_model.h5'

    (X_train, y_train), (x_test, y_test) = load_proc_data(nr_of_labeled_examples)

    if verbose == 2: 
        print('X_train shape:', X_train.shape)
        print(X_train.shape[0], 'train samples')
        print(x_test.shape[0], 'test samples')

    model = create_model()

    # added early stopping to avoid training when it's not progressing
    es = EarlyStopping(monitor='val_accuracy', mode='max', min_delta=0.0001, patience=20, verbose=1, restore_best_weights=True)
    mc = ModelCheckpoint(model_path, monitor='val_accuracy', mode='max', save_best_only=True)

    model.fit(X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=(x_test, y_test), 
            callbacks=[mc, es])

    i=0
    while True:
        if not os.path.exists(model_path):
            if i==0:
                print('Waiting for h5 model file...')
            if i == 10:
                msg = 'error, check in logs'
                with open(f'result_{nr_of_labeled_examples}.txt', 'a') as f:
                    print(f'Random sampling - Training on {nr_of_labeled_examples} samples\n\tVal. accuracy: {msg}', file=f)
                break
            i = i + 1
            time.sleep(0.0001)
        else: 
            saved_model = load_model(model_path)
            score = saved_model.evaluate(x_test, y_test, verbose=0)
            os.remove(model_path)
            with open(f'result_{nr_of_labeled_examples}.txt', 'a') as f:
                print(f'Random sampling - Training on {nr_of_labeled_examples} samples\n\tVal. accuracy: ', '%.5f' % score[1], file=f)
            break
    
    if verbose >= 1:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])

    #AL!!
    sampling_methods = [uncertainty_sampling, entropy_sampling, margin_sampling]

    
    for method in sampling_methods:
        with open(f'result_{nr_of_labeled_examples}.txt', 'a') as f:
                print(f'{method.__name__} - # trained samples    - Val. accuracy', file=f)

        (X_train, y_train), (x_test, y_test) = load_proc_data()

        segment = int(nr_of_labeled_examples / 10)
        # create the classifier
        classifier = KerasClassifier(create_model)

        # assemble initial data
        initial_idx = np.random.choice(range(len(X_train)), size=segment, replace=False)
        X_initial = X_train[initial_idx]
        y_initial = y_train[initial_idx]

        # initialize ActiveLearner
        learner = ActiveLearner(
            estimator=classifier,
            query_strategy=method,
            X_training=X_initial, y_training=y_initial,
            verbose=1
        )

        # the active learning loop
        n_queries = 9
        only_new = False  # TODO: maybe learn on all data from the beginning; test!

        for idx in range(n_queries):
            model_path_al = f'best_model_al_{method.__name__}_{(idx + 2)*segment}.h5'

            mc_al = ModelCheckpoint(model_path_al, monitor='val_accuracy', mode='max', save_best_only=True)
            
            print('Query no. %d' % (idx + 1))
            query_idx, _ = learner.query(X_train, n_instances=segment, verbose=0)  # TODO: expose n_instances as a parameter, or repeat the process n times
            learner.teach(
                X=X_train[query_idx], 
                y=y_train[query_idx], 
                only_new=only_new,
                batch_size=batch_size,
                epochs=epochs,
                verbose=1,
                validation_data=(x_test, y_test), 
                callbacks=[mc_al, es]
            )

            i=0
            while True:
                if not os.path.exists(model_path_al):
                    if i==0:
                        print('Waiting for h5 model file...')
                    if i==10:
                        msg = 'error, check in logs'
                        with open(f'result_{nr_of_labeled_examples}.txt', 'a') as f:
                            print(f'                            {(idx + 2)*segment}            \t\t{msg}', file=f)
                        break
                    i = i + 1
                    time.sleep(0.01)
                else: 
                    saved_model = load_model(model_path_al)
                    score_al = saved_model.evaluate(x_test, y_test, verbose=0)
                    os.remove(model_path_al)
                    with open(f'result_{nr_of_labeled_examples}.txt', 'a') as f:
                        print(f'                            {(idx + 2)*segment}            \t\t','%.5f' % score_al[1], file=f)
                    break

            # remove queried instance from pool
            X_train = np.delete(X_train, query_idx, axis=0)
            y_train = np.delete(y_train, query_idx, axis=0)

            # score_al = learner.score_al(x_test, y_test)

    with open(f'result_{nr_of_labeled_examples}.txt', 'a') as f:
        print('\n', file=f)

    return
Example no. 26
])
# map the intensity values against the grid
y_full = np.asarray([data[P[0], P[1]] for P in X_full])
X_pool = deepcopy(X_full)
y_pool = deepcopy(y_full)

# assembling initial training set
initial_idx = [
    0, im_height - 1, im_height * (im_height - 1), -1,
    im_width // 2 + im_height // 2 * im_height
]
X_train, y_train = X_pool[initial_idx], y_pool[initial_idx]

# create an ActiveLearner instance
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=X_train,
                        y_training=y_train)
initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(
    im_height, im_width)

n_queries = 100
for round_idx in range(n_queries):
    query_idx, query_inst = learner.query(X_pool)
    learner.teach(X_pool[query_idx].reshape(1, -1),
                  y_pool[query_idx].reshape(-1, ))
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)

final_prediction = learner.predict_proba(X_full)[:, 1].reshape(
    im_height, im_width)
def active_learn(df1, first_item_index_of_each_category):
    train_idx = first_item_index_of_each_category
    # X_train = iris['data'][train_idx]
    # y_train = iris['target'][train_idx]

    # initial training data
    data = df1.values[:, 1:]
    target = df1['label'].values

    X_full = df1.values[:, 1:]
    y_full = df1['label'].values

    X_train = df1.values[:, 1:][
        train_idx]  # features start at the second column; the first column is the label
    y_train = df1['label'].values[train_idx]

    # with plt.style.context('seaborn-white'):
    #     pca = PCA(n_components=2).fit_transform(data)
    #     plt.figure(figsize=(7, 7))
    #     plt.scatter(x=pca[:, 0], y=pca[:, 1], c=y_train, cmap='viridis', s=50)
    #     plt.title('The iris dataset')
    #     plt.show()

    # generating the pool
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)

    # initializing the active learner
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            X_training=X_train,
                            y_training=y_train)

    print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))
    index = 0
    performance_array = []
    # learning until the accuracy reaches a given threshold
    while learner.score(X_full, y_full) < 0.90:
        stream_idx = np.random.choice(range(len(X_full)))
        if classifier_uncertainty(learner, X_full[stream_idx].reshape(
                1, -1)) >= 0.4:
            learner.teach(X_full[stream_idx].reshape(1, -1),
                          y_full[stream_idx].reshape(-1, ))
            learner_score = learner.score(X_full, y_full)
            print('Item no. %d queried, new accuracy: %f' %
                  (stream_idx, learner_score))
            if index == 505:
                break
            if (index % 100 == 0):
                performance_array.append(learner_score)
            index = index + 1
    percentage_increase(performance_array)

    # visualizing initial prediction
    # with plt.style.context('seaborn-white'):
    #     plt.figure(figsize=(7, 7))
    #     prediction = learner.predict(data)
    #     plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    #     plt.title('Initial accuracy: %f' % learner.score(data, target))
    #     plt.show()

    # pool-based sampling
    # n_queries = 502
    # performance_array = []
    # for idx in range(n_queries):
    #     query_idx, query_instance = learner.query(X_pool)
    #     learner.teach(
    #         X=X_pool[query_idx].reshape(1, -1),
    #         y=y_pool[query_idx].reshape(1, )
    #     )
    #     # remove queried instance from pool
    #     X_pool = np.delete(X_pool, query_idx, axis=0)
    #     y_pool = np.delete(y_pool, query_idx)
    #     learner_score = learner.score(data, target)
    #     print('Accuracy after query no. %d: %f' % (idx + 1, learner_score))
    #     if (idx % 100 == 0):
    #         performance_array.append(learner_score)
    #
    # percentage_increase(performance_array)

    # plotting final prediction
    # with plt.style.context('seaborn-white'):
    #     plt.figure(figsize=(7, 7))
    #     prediction = learner.predict(data)
    #     plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    #     plt.title(
    #         'Classification accuracy after %i queries: %f' % (n_queries, learner.score(data,target)))
    #     plt.show()
    y = 0
Example no. 28
train_k = train_k.drop(['animal_name'], axis=1) #dropping the target variable for clustering
print(train_k)

#plotting the data in an understandable form(kmeans)
f, ax = plt.subplots(figsize=(12, 8))
corr = train_k.corr()
hm = sns.heatmap(round(corr,2), annot=True, ax=ax, cmap="summer",fmt='.2f')
f.subplots_adjust(top=.94)
t= f.suptitle('Zoo animals Heatmap', fontsize=16)

kmeans = KMeans(n_clusters=7, max_iter=10000)

X = np.array(train_k.drop(["class_type"], axis=1).astype(float))
Y = np.array(train_k["class_type"])

learner = ActiveLearner(estimator=kmeans, X_training=X, y_training=Y)

predictions = learner.predict(X_test)

X_pool = np.array(test_k.drop(["class_type"], axis=1).astype(float))
y_pool = np.array(test_k["class_type"]) - 1

for index in range(N_Queries[0]):
  query_index = random.randrange(0,len(X_pool))
  x, y = X_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
  learner.teach(X=x, y=y)
  X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index)
    
# note: KMeans is unsupervised, so `score` here is the KMeans objective
# (negative inertia), not a classification accuracy
model_accuracy = learner.score(X, Y)
print('Score: {acc:0.4f} \n'.format(acc=model_accuracy))
Exemplo n.º 29
0
    def review(self):

        # create the pool and training indices.
        n_samples = self.X.shape[0]
        pool_idx = np.arange(n_samples)

        # add prior knowledge
        init_idx, init_labels = self._prior_knowledge()
        self.y[init_idx] = init_labels

        # remove the initial sample from the pool
        pool_idx = np.delete(pool_idx, init_idx)

        # Initialize learner, but don't start training yet.
        self.learner = ActiveLearner(estimator=self.model,
                                     query_strategy=self.query_strategy)
        query_i = 0
        train_idx = init_idx.copy()
        query_idx = train_idx
        self._logger.add_labels(self.y)

        while not self._stop_iter(query_i - 1, pool_idx):
            self._logger.add_training_log(query_idx, self.y[query_idx])

            # Get the training data.
            X_train, y_train = self.train_data(self.X, self.y, train_idx,
                                               **self.balance_kwargs)
            #             validation_data(self.X[pool_idx], self.y[pool_idx],
            #                             self.fit_kwargs, ratio=1)

            # Train the model on the training data.
            self.learner.teach(X=X_train,
                               y=y_train,
                               only_new=True,
                               **self.fit_kwargs)

            # Make a query from the pool.
            query_idx, _ = self.learner.query(X=self.X,
                                              pool_idx=pool_idx,
                                              n_instances=min(
                                                  self.n_instances,
                                                  len(pool_idx)),
                                              query_kwargs=self.query_kwargs)

            # Log the probabilities of samples in the pool being included.
            pred_proba = self.query_kwargs.get('pred_proba', [])
            if len(pred_proba) == 0:
                pred_proba = self.learner.predict_proba(self.X[pool_idx])
            self._logger.add_proba(pool_idx, pred_proba)

            # Log the probabilities of samples that were trained.
            pred_proba_train = self.learner.predict_proba(self.X[train_idx])
            self._logger.add_proba(train_idx,
                                   pred_proba_train,
                                   logname="train_proba")

            # Classify the queried papers.
            self.y[query_idx] = self._classify(query_idx)
            self._logger.add_labels(self.y)

            # Update training/pool indices
            train_idx = np.append(train_idx, query_idx)
            pool_idx = np.delete(np.arange(n_samples), train_idx, axis=0)

            # update the query counter
            query_i += 1

            # Save the result to a file
            if self.log_file:
                self.save_logs(self.log_file)
                if self.verbose:
                    print(f"Saved results in log file: {self.log_file}")
Exemplo n.º 30
0
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=iris['target'], cmap='viridis', s=50)
    plt.title('The iris dataset')
    plt.show()

# initial training data
train_idx = [0, 50, 100]  # indices of the initial training-set elements
X_train = iris['data'][train_idx]
y_train = iris['target'][train_idx]

# generating the pool
X_pool = np.delete(iris['data'], train_idx, axis=0)
y_pool = np.delete(iris['target'], train_idx)

# initializing the active learner
learner = ActiveLearner(estimator=KNeighborsClassifier(n_neighbors=3),
                        X_training=X_train,
                        y_training=y_train)

# visualizing initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict(iris['data'])
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Initial accuracy: %f' %
              learner.score(iris['data'], iris['target']))
    plt.show()

print('Accuracy before active learning: %f' %
      learner.score(iris['data'], iris['target']))

# pool-based sampling
n_labeled_examples = X_raw.shape[0]
training_indices = np.random.randint(low=0,
                                     high=n_labeled_examples,  # high is exclusive
                                     size=3)

X_train = X_raw[training_indices]
y_train = y_raw[training_indices]

# Isolate the non-training examples we'll be querying.
X_pool = np.delete(X_raw, training_indices, axis=0)
y_pool = np.delete(y_raw, training_indices, axis=0)

# Specify our core estimator along with its active learning model.
knn = KNeighborsClassifier(n_neighbors=3)
learner = ActiveLearner(estimator=knn, X_training=X_train, y_training=y_train)

predictions = learner.predict(X_raw)
is_correct = (predictions == y_raw)

unqueried_score = learner.score(X_raw, y_raw)

# Plot our classification results.
fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130)
ax.scatter(x=x_component[is_correct],
           y=y_component[is_correct],
           c='g',
           marker='+',
           label='Correct',
           alpha=8 / 10)
ax.scatter(x=x_component[~is_correct],
           y=y_component[~is_correct],
           c='r',
           marker='x',
           label='Incorrect',
           alpha=8 / 10)
Exemplo n.º 32
0
X = np.random.choice(np.linspace(0, 20, 10000), size=200, replace=False).reshape(-1, 1)
y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)

# assembling initial training set
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_initial, y_initial = X[initial_idx], y[initial_idx]

# defining the kernel for the Gaussian process
kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \
         + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))

# initializing the active learner
regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_initial.reshape(-1, 1), y_training=y_initial.reshape(-1, 1)
)

# plotting the initial estimation
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
    x = np.linspace(0, 20, 1000)
    pred, std = regressor.predict(x.reshape(-1,1), return_std=True)
    plt.plot(x, pred)
    plt.fill_between(x, pred.reshape(-1, )-std, pred.reshape(-1, )+std, alpha=0.2)
    plt.scatter(X, y, c='k')
    plt.title('Initial estimation based on %d points' % n_initial)
    plt.show()

# active learning
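
The snippet is truncated after this comment. A sketch of the loop that presumably follows, mirroring the complete version of the same example in Exemplo n.º 40 below:

n_queries = 10
for idx in range(n_queries):
    # query where the GP's predictive std is largest, then teach that point
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))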
Exemplo n.º 33
0
# create the data to stream from
X_full = np.transpose(
    [np.tile(np.asarray(range(im.shape[0])), im.shape[1]),
     np.repeat(np.asarray(range(im.shape[1])), im.shape[0])]
)
# map the intensity values against the grid
y_full = np.asarray([im[P[0], P[1]] for P in X_full])

# assembling initial training set
n_initial = 5
initial_idx = np.random.choice(range(len(X_full)), size=n_initial, replace=False)
X_train, y_train = X_full[initial_idx], y_full[initial_idx]

# initialize the learner
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train, y_training=y_train
)

"""
The instances are randomly selected one by one, if an instance's uncertainty
is above a threshold, the label is requested and shown to the learner. The
process is continued until the learner reaches a previously defined accuracy.
"""

# learning until the accuracy reaches a given threshold
while learner.score(X_full, y_full) < 0.7:
    stream_idx = np.random.choice(range(len(X_full)))
    if classifier_uncertainty(learner, X_full[stream_idx].reshape(1, -1)) >= 0.4:
        learner.teach(X_full[stream_idx].reshape(1, -1), y_full[stream_idx].reshape(-1, ))
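
One caveat with the stream loop above: if the model plateaus below the 0.7 target it never terminates. A guarded variant, assuming the same learner, X_full, y_full, and classifier_uncertainty as above (the iteration cap is an illustrative assumption):

max_iterations = 10000  # assumed cap, tune as needed
iteration = 0
while learner.score(X_full, y_full) < 0.7 and iteration < max_iterations:
    stream_idx = np.random.choice(range(len(X_full)))
    if classifier_uncertainty(learner, X_full[stream_idx].reshape(1, -1)) >= 0.4:
        learner.teach(X_full[stream_idx].reshape(1, -1),
                      y_full[stream_idx].reshape(-1, ))
    iteration += 1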
Exemplo n.º 34
0
    def learn(self):
        # seeding
        classes = self.short_df['grades_round'].unique()
        seed_index = []
        for i in classes:
            seed_index.append(self.short_df['grades_round'][
                self.short_df['grades_round'] == i].index[0])

        act_data = self.short_df.copy()
        accuracy_list = []
        f1_total_list = []
        kappa_total_list = []

        # initialising
        train_idx = seed_index
        X_train = self.X[train_idx]
        y_train = self.Y[train_idx]

        # generating the pool
        X_pool = np.delete(self.X, train_idx, axis=0)
        y_pool = np.delete(self.Y, train_idx)

        act_data = act_data.drop(axis=0, index=train_idx)
        act_data.reset_index(drop=True, inplace=True)

        initiated_committee = []
        for learner_idx, model in enumerate(self.learners):
            learner = ActiveLearner(estimator=model,
                                    X_training=X_train,
                                    y_training=y_train)
            initiated_committee.append(learner)
        # Commitee creation
        committee = Committee(
            learner_list=initiated_committee,
            #             query_strategy=vote_entropy_sampling
        )

        committee.teach(X_train, y_train)

        # pool-based sampling
        n_queries = int(len(self.X) * self.percent / 100)  # label self.percent % of the data
        for idx in range(n_queries):
            query_idx = np.random.choice(range(len(X_pool)))
            committee.teach(X=X_pool[query_idx].reshape(1, -1),
                            y=y_pool[query_idx].reshape(1, ))

            # remove queried instance from pool
            X_pool = np.delete(X_pool, query_idx, axis=0)
            y_pool = np.delete(y_pool, query_idx)

            act_data = act_data.drop(axis=0, index=query_idx)
            act_data.reset_index(drop=True, inplace=True)

            accuracy_list.append(
                accuracy_score(committee.predict(X_pool), y_pool))

            model_pred = committee.predict(X_pool)
            f1_total_list.append(
                f1_score(y_pool,
                         model_pred,
                         average="weighted",
                         labels=np.unique(model_pred)))
            kappa_total_list.append(cohen_kappa_score(y_pool, model_pred))
#             print('Accuracy after query no. %d: %f' % (idx+1, accuracy_score(committee.predict(X_pool),y_pool)))
# print("By just labelling ",round(n_queries*100.0/len(X),2),"% of total data accuracy of ", round(accuracy_score(committee.predict(X_pool),y_pool),3), " % is achieved on the unseen data" )
        return accuracy_list, f1_total_list, kappa_total_list
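
The loop in learn() queries at random; the commented-out vote_entropy_sampling hints at the intended disagreement-based strategy. A sketch of that alternative for a single query, assuming the same initiated_committee and pools (vote_entropy_sampling is modAL's committee strategy from modAL.disagreement):

from modAL.disagreement import vote_entropy_sampling

committee = Committee(
    learner_list=initiated_committee,
    query_strategy=vote_entropy_sampling,  # replaces np.random.choice above
)
query_idx, query_instance = committee.query(X_pool)
committee.teach(X=X_pool[query_idx].reshape(1, -1),
                y=y_pool[query_idx].reshape(1, ))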
Exemplo n.º 35
0
def active_learn(df1, first_item_index_of_each_category):
    train_idx = first_item_index_of_each_category
    # X_train = iris['data'][train_idx]
    # y_train = iris['target'][train_idx]

    # initial training data
    data = df1.values[:,1:]
    target = df1['label'].values

    X_full = df1.values[:, 1:]
    y_full = df1['label'].values


    X_train = df1.values[:, 1:][train_idx]  # features start at the second column; the first column is the label
    y_train = df1['label'].values[train_idx]

    # X_pool = np.delete(data, train_idx, axis=0)
    # y_pool = np.delete(target, train_idx)

    X_pool = deepcopy(X_full)
    y_pool = deepcopy(y_full)

    # initializing Committee members
    n_members = 2
    learner_list = list()

    for member_idx in range(n_members):
        # initial training data
        n_initial = 5
        train_idx = np.random.choice(range(X_pool.shape[0]), size=n_initial, replace=False)
        X_train = X_pool[train_idx]
        y_train = y_pool[train_idx]

        # creating a reduced copy of the data with the known instances removed
        X_pool = np.delete(X_pool, train_idx, axis=0)
        y_pool = np.delete(y_pool, train_idx)

        # initializing learner
        learner = ActiveLearner(
            estimator=RandomForestClassifier(),
            X_training=X_train, y_training=y_train
        )
        learner_list.append(learner)
        # assembling the committee
    committee = Committee(learner_list=learner_list)

    print('Committee initial predictions, accuracy = %1.3f' % committee.score(data, target))

    performance_array = []
    n_queries = 505
    for idx in range(n_queries):
        query_idx, query_instance = committee.query(X_pool)
        committee.teach(
            X=X_pool[query_idx].reshape(1, -1),
            y=y_pool[query_idx].reshape(1, )
        )
        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        learner_score = committee.score(data, target)
        print('Committee predictions after query %d, accuracy = %1.3f' % (idx, learner_score))
        if idx % 100 == 0:
            performance_array.append(learner_score)
    percentage_increase(performance_array)
for member_idx in range(n_members):
    # initial training data
    n_initial = 5
    train_idx = np.random.choice(range(X_pool.shape[0]),
                                 size=n_initial,
                                 replace=False)
    X_train = X_pool[train_idx]
    y_train = y_pool[train_idx]

    # creating a reduced copy of the data with the known instances removed
    X_pool = np.delete(X_pool, train_idx, axis=0)
    y_pool = np.delete(y_pool, train_idx)

    # initializing learner
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            X_training=X_train,
                            y_training=y_train)
    learner_list.append(learner)

# assembling the committee
committee = Committee(learner_list=learner_list)

# visualizing the initial predictions
# with plt.style.context('seaborn-white'):
#     plt.figure(figsize=(n_members*7, 7))
#     for learner_idx, learner in enumerate(committee):
#         plt.subplot(1, n_members, learner_idx + 1)
#         plt.scatter(x=pca[:, 0], y=pca[:, 1], c=learner.predict(iris['data']), cmap='viridis', s=5)
#         plt.title('Learner no. %d initial predictions' % (learner_idx + 1))
#     plt.show()
Exemplo n.º 37
0
learner_list = list()

for member_idx in range(n_members):
    # initial training data
    n_initial = 5
    train_idx = np.random.choice(range(X_pool.shape[0]), size=n_initial, replace=False)
    X_train = X_pool[train_idx]
    y_train = y_pool[train_idx]

    # creating a reduced copy of the data with the known instances removed
    X_pool = np.delete(X_pool, train_idx, axis=0)
    y_pool = np.delete(y_pool, train_idx)

    # initializing learner
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_train, y_training=y_train
    )
    learner_list.append(learner)

# assembling the committee
committee = Committee(learner_list=learner_list)

# visualizing the initial predictions
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(n_members*7, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_members, learner_idx + 1)
        plt.scatter(x=pca[:, 0], y=pca[:, 1], c=learner.predict(iris['data']), cmap='viridis', s=50)
        plt.title('Learner no. %d initial predictions' % (learner_idx + 1))
    plt.show()
    def al_rank(self, data, target, X_train, y_train, X_full, y_full, train_idx, N_RAW_SAMPLES=120, proba=5):
        acc = []
        pre = []
        rec = []
        fs = []
        BATCH_SIZE = 3
        preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE)

        learner = ActiveLearner(
            estimator=RandomForestClassifier(),

            X_training=X_train,
            y_training=y_train,

            query_strategy=preset_batch
        )

        # N_RAW_SAMPLES = 80
        N_QUERIES = N_RAW_SAMPLES // BATCH_SIZE
        unqueried_score = learner.score(X_full, y_full)
        performance_history = [unqueried_score]

        # Isolate examples for the labeled dataset. (Note: the learner above
        # was already fit on X_train/y_train; these re-drawn indices only
        # define the unlabeled pool below.)
        n_labeled_examples = X_full.shape[0]
        training_indices = np.random.randint(low=0, high=n_labeled_examples, size=3)

        X_train = X_full[training_indices]
        y_train = y_full[training_indices]

        # Isolate the non-training examples we'll be querying.
        X_pool = np.delete(X_full, training_indices, axis=0)
        y_pool = np.delete(y_full, training_indices, axis=0)


        for index in range(N_QUERIES):
            query_index, query_instance = learner.query(X_pool)

            # Teach our ActiveLearner model the record it has requested.
            X, y = X_pool[query_index], y_pool[query_index]

            labeled_y = y
            rand_int = randint(0, 100)
            # simulate a noisy oracle: with probability proba %, flip this batch's labels
            if rand_int <= proba:
                labeled_y = np.array([])
                for idx in query_index:
                    if (y_pool[idx] == 1):
                        y_pool[idx] = 0
                        labeled_y = np.append(labeled_y, 0)
                    else:
                        y_pool[idx] = 1
                        # labeled_y = np.array((1)).reshape(1, )
                        labeled_y = np.append(labeled_y, 1)

            learner.teach(
                X=X,
                y=labeled_y
            )

            # learner.teach(X=X, y=y)

            # Remove the queried instance from the unlabeled pool.
            X_pool = np.delete(X_pool, query_index, axis=0)
            y_pool = np.delete(y_pool, query_index)

            # Calculate and report our model's accuracy.
            model_accuracy = learner.score(X_full, y_full)
            print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy))
            precision, recall, fscore, support, accuracy = self.performance_measure(learner, X_full, y_full)
            learner_score = accuracy
            acc.append(learner_score)
            pre.append(precision)
            rec.append(recall)
            fs.append(fscore)
            # Save our model's performance for plotting.
            performance_history.append(model_accuracy)

        return acc, pre, rec, fs
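
The inline flipping above assumes binary {0, 1} labels. The same noisy-oracle idea factored into a helper, as a sketch (the noisy_labels function is hypothetical, not part of the original):

from random import randint

def noisy_labels(y_batch, flip_percent):
    """With flip_percent % probability, flip every binary label in the batch."""
    if randint(0, 100) <= flip_percent:
        return 1 - y_batch  # flips a numpy array of 0/1 labels, as above
    return y_batch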
Exemplo n.º 39
0
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=iris['target'], cmap='viridis', s=50)
    plt.title('The iris dataset')
    plt.show()

# initial training data
train_idx = [0, 50, 100]
X_train = iris['data'][train_idx]
y_train = iris['target'][train_idx]

# generating the pool
X_pool = np.delete(iris['data'], train_idx, axis=0)
y_pool = np.delete(iris['target'], train_idx)

# initializing the active learner
learner = ActiveLearner(
    estimator=KNeighborsClassifier(n_neighbors=3),
    X_training=X_train, y_training=y_train
)

# visualizing initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict(iris['data'])
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Initial accuracy: %f' % learner.score(iris['data'], iris['target']))
    plt.show()

print('Accuracy before active learning: %f' % learner.score(iris['data'], iris['target']))

# pool-based sampling
n_queries = 20
for idx in range(n_queries):
    # standard pool-based loop: query the most informative point, teach it,
    # then drop it from the pool (mirrors the committee loop above)
    query_idx, query_instance = learner.query(X_pool)
    learner.teach(X=X_pool[query_idx].reshape(1, -1),
                  y=y_pool[query_idx].reshape(1, ))
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
Exemplo n.º 40
0
def GP_regression_std(regressor, X):
    _, std = regressor.predict(X, return_std=True)
    query_idx = np.argmax(std)
    return query_idx, X[query_idx]

# generating the data
X = np.random.choice(np.linspace(0, 20, 10000), size=200, replace=False).reshape(-1, 1)
y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)

# assembling initial training set
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_initial, y_initial = X[initial_idx], y[initial_idx]

# defining the kernel for the Gaussian process
kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \
         + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))

# initializing the active learner
regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_initial.reshape(-1, 1), y_training=y_initial.reshape(-1, 1)
)

# active learning
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
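
A natural way to finish this example is to re-plot the fit after the queries; a sketch mirroring the initial-estimation plot from Exemplo n.º 32, assuming the same matplotlib/numpy imports as the other snippets:

with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
    x = np.linspace(0, 20, 1000)
    pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
    plt.plot(x, pred)
    plt.fill_between(x, pred.reshape(-1, ) - std, pred.reshape(-1, ) + std, alpha=0.2)
    plt.scatter(X, y, c='k')
    plt.title('Estimation after %d queries' % n_queries)
    plt.show()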