def _active_learning_update_metrics(
        self, active_learner: ActiveLearner, x_dev: np.ndarray, y_dev: Series,
        stats: Stats, data_for_plotting: List[Stats], i: int,
        elapsed_train: float, elapsed_query: float,
        labeled_indices: List[int],
        semi_sup: bool) -> Tuple[Stats, List[Stats], List[int]]:
    predicted = active_learner.predict(x_dev)
    scores = None if semi_sup else active_learner.predict_proba(x_dev)[:, 1]
    metrics = self._get_metrics(actual=y_dev, predicted=predicted, scores=scores)
    data_for_plotting.append(
        self._get_plotting_row(i, metrics, elapsed_train, elapsed_query))
    metrics = util.add_prefix_to_dict_keys(metrics, f'sample_{i+1}_')
    if i + 1 in self.active_learning_log_intervals or i == -1:
        stats = util.merge_dicts(stats, metrics)
    return stats, data_for_plotting, labeled_indices
class LearnerD:
    def __init__(self):
        self.is_ready_to_predict = False
        self.learner = ActiveLearner(
            estimator=RandomForestClassifier(n_estimators=100),
            query_strategy=uncertainty_sampling,
        )

    def predict_prob(self, point):
        if not self.is_ready_to_predict:
            print('predict prob abort, learner is not ready to predict')
            return True, 0
        # probability of the positive class for a single point
        positive_prob = self.learner.predict_proba(point)[0][1]
        return True, positive_prob

    def update(self, positive_points, negative_points):
        if len(positive_points) == 0 or len(negative_points) == 0:
            print('update abort, not enough data to update')
            self.is_ready_to_predict = False
            return False
        # each point is expected to be a single-row 2D array; stack them
        # into one training set with labels 1 (positive) and 0 (negative)
        X = np.concatenate(positive_points + negative_points, axis=0)
        y = np.concatenate((np.ones(len(positive_points)),
                            np.zeros(len(negative_points))))
        self.learner.fit(X, y)
        self.is_ready_to_predict = True
        return True
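# --- Usage sketch for LearnerD (illustrative only; the synthetic data below
# is an assumption, not part of the original code). Each "point" is a
# single-row 2D array, which is what update() concatenates along axis 0.
rng = np.random.default_rng(0)
positives = [rng.normal(loc=1.0, size=(1, 4)) for _ in range(5)]
negatives = [rng.normal(loc=-1.0, size=(1, 4)) for _ in range(5)]

d_learner = LearnerD()
ok, prob = d_learner.predict_prob(rng.normal(size=(1, 4)))  # aborts: not fitted yet
if d_learner.update(positives, negatives):
    ok, prob = d_learner.predict_prob(rng.normal(size=(1, 4)))
    print('positive probability:', prob)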
y_full = np.asarray([data[P[0], P[1]] for P in X_full])
X_pool = deepcopy(X_full)
y_pool = deepcopy(y_full)

# assembling initial training set
initial_idx = [0, im_height - 1, im_height * (im_height - 1), -1,
               im_width // 2 + im_height // 2 * im_height]
X_train, y_train = X_pool[initial_idx], y_pool[initial_idx]

# create an ActiveLearner instance
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=X_train,
                        y_training=y_train)
initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width)

n_queries = 100
for round_idx in range(n_queries):
    query_idx, query_inst = learner.query(X_pool)
    learner.teach(X_pool[query_idx].reshape(1, -1), y_pool[query_idx].reshape(-1, ))
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)

final_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width)

# learning with randomly selected queries instead of active learning
random_idx = initial_idx + list(np.random.choice(range(len(X_full)), n_queries, replace=False))
# assembling initial training set
n_initial = 5
initial_idx = np.random.choice(range(len(X_full)), size=n_initial, replace=False)
X_train, y_train = X_full[initial_idx], y_full[initial_idx]

# initialize the learner
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train,
    y_training=y_train
)
print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))

# visualizing the initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict_proba(X_full)[:, 1]
    plt.imshow(prediction.reshape(im_width, im_height))
    plt.title('Initial prediction accuracy: %f' % learner.score(X_full, y_full))
    plt.show()

"""
The instances are randomly selected one by one. If an instance's uncertainty
is above a threshold, the label is requested and shown to the learner. The
process continues until the learner reaches a previously defined accuracy.
"""

# learning until the accuracy reaches a given threshold
while learner.score(X_full, y_full) < 0.90:
    stream_idx = np.random.choice(range(len(X_full)))
    if classifier_uncertainty(learner, X_full[stream_idx].reshape(1, -1)) >= 0.4:
        learner.teach(X_full[stream_idx].reshape(1, -1), y_full[stream_idx].reshape(-1, ))
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query, choose_query,
                   choose_answer, unlabel_query, unlabel_answer, rec_api_test, rec_api_choose,
                   rec_api_unlabel, w2v, idf):
    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer,
                                                      rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer,
                                                    rec_api_choose, w2v, idf)
    X_train, y_train = braid_AL.get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = braid_AL.get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        # estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback,
        y_training=y_feedback
    )

    length = len(rec_api_test)
    predict, sel_query, add_unlabel_feature = [], [], []
    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 40
        for _ in range(n_queries):
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            idx = int(query_idx / 10)
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )
            # add the queried instance into the feedback repository (FR)
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[idx*10:idx*10+10])
            choose_feature.extend(unlabel_feature[idx*10:idx*10+10])
            # remove the queried instance from the pool
            for i in range(10):
                X_train = np.delete(X_train, idx*10, axis=0)
                y_train = np.delete(y_train, idx*10)
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[idx*10:idx*10+10]
            del unlabel_feature[idx*10:idx*10+10]
            if len(X_train) == 0:
                break

    add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer,
                                                        rec_api_choose, w2v, idf)
    new_X_feedback, new_y_feedback = braid_AL.get_active_data(add_label_feedback_info,
                                                              choose_feature)
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        # estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=new_X_feedback,
        y_training=new_y_feedback
    )

    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer,
                                              rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)
    X_test = np.array(X)

    # use the model retrained on the feedback data to predict the test data
    for query_idx in range(length):
        try:
            y_pre = learner.predict_proba(X=X_test[query_idx].reshape(1, -1))
        except ValueError:
            predict = [0.0 for n in range(length)]
        else:
            predict.append(float(y_pre[0, 1]))
    return predict, X, new_X_feedback, new_y_feedback
def expected_error_reduction(learner: ActiveLearner, X: modALinput,
                             loss: str = 'binary', p_subsample: float = 1.0,
                             n_instances: int = 1,
                             random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    Expected error reduction query strategy.

    References:
        Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf)

    Args:
        learner: The ActiveLearner object for which the expected error is to be estimated.
        X: The samples.
        loss: The loss function to be used. Can be 'binary' or 'log'.
        p_subsample: Probability of keeping a sample from the pool when calculating expected error.
            Significantly improves runtime for large sample pools.
        n_instances: The number of instances to be sampled.
        random_tie_break: If True, shuffles utility scores to randomize the order. This
            can be used to break the tie when the highest utility score is not unique.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
    assert loss in ['binary', 'log'], "loss must be 'binary' or 'log'"

    expected_error = np.zeros(shape=(len(X), ))
    possible_labels = np.unique(learner.y_training)

    try:
        X_proba = learner.predict_proba(X)
    except NotFittedError:
        # TODO: implement a proper cold-start
        return 0, X[0]

    cloned_estimator = clone(learner.estimator)

    for x_idx, x in enumerate(X):
        # subsample the data if needed
        if np.random.rand() <= p_subsample:
            # estimate the expected error
            for y_idx, y in enumerate(possible_labels):
                X_new = data_vstack((learner.X_training, x.reshape(1, -1)))
                y_new = data_vstack((learner.y_training, np.array(y).reshape(1, )))

                cloned_estimator.fit(X_new, y_new)
                refitted_proba = cloned_estimator.predict_proba(X)
                # use == (not `is`) for string comparison, and a separate
                # variable so the `loss` argument is not clobbered between
                # iterations
                if loss == 'binary':
                    nloss = _proba_uncertainty(refitted_proba)
                elif loss == 'log':
                    nloss = _proba_entropy(refitted_proba)

                expected_error[x_idx] += np.sum(nloss) * X_proba[x_idx, y_idx]
        else:
            expected_error[x_idx] = np.inf

    # select the instances with the *lowest* expected error; skipped
    # instances were marked np.inf, so they are never chosen
    if not random_tie_break:
        query_idx = multi_argmax(-expected_error, n_instances)
    else:
        query_idx = shuffled_argmax(-expected_error, n_instances)

    return query_idx, X[query_idx]
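# --- A minimal pool-based sketch using expected_error_reduction as the query
# strategy (the toy dataset and LogisticRegression estimator are assumptions
# for illustration, not part of the original snippet).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_pool, y_pool = make_classification(n_samples=200, random_state=0)
eer_learner = ActiveLearner(estimator=LogisticRegression(),
                            query_strategy=expected_error_reduction,
                            X_training=X_pool[:10], y_training=y_pool[:10])
# p_subsample < 1.0 trades estimate quality for speed on larger pools
query_idx, query_inst = eer_learner.query(X_pool, loss='log', p_subsample=0.5)
eer_learner.teach(X_pool[query_idx], y_pool[query_idx])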
class Review(ABC):
    """Base class for Systematic Review"""

    def __init__(self,
                 X,
                 y=None,
                 model=None,
                 query_strategy=None,
                 train_data_fn=full_sample,
                 n_instances=1,
                 n_queries=None,
                 prior_included=[],
                 prior_excluded=[],
                 log_file=None,
                 settings={},
                 verbose=1):
        super(Review, self).__init__()

        self.X = X
        self.y = y
        self.model = model
        self.query_strategy = query_strategy
        self.train_data = train_data_fn
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.log_file = log_file
        self.verbose = verbose

        self.prior_included = prior_included
        self.prior_excluded = prior_excluded

        self.fit_kwargs = settings['fit_kwargs']
        self.balance_kwargs = settings['balance_kwargs']
        self.query_kwargs = settings['query_kwargs']

        self._logger = Logger()

    @abstractmethod
    def _prior_knowledge(self):
        pass

    @abstractmethod
    def _classify(self, ind):
        """Classify the provided indices."""
        pass

    def _prior_teach(self):
        """Function called before training model."""
        pass

    def _stop_iter(self, query_i, pool):
        """Criteria for stopping iteration.

        Stop iterating if:
            - n_queries is reached
            - the pool is empty
        """
        stop_iter = False

        # if the pool is empty, always stop
        if len(pool) == 0:
            stop_iter = True

        # don't stop if there is no stopping criteria
        if self.n_queries is not None and query_i >= self.n_queries:
            stop_iter = True

        return stop_iter

    def review(self):
        # create the pool and training indices.
        n_samples = self.X.shape[0]
        pool_idx = np.arange(n_samples)

        # add prior knowledge
        init_idx, init_labels = self._prior_knowledge()
        self.y[init_idx] = init_labels

        # remove the initial sample from the pool
        pool_idx = np.delete(pool_idx, init_idx)

        # Initialize learner, but don't start training yet.
        self.learner = ActiveLearner(estimator=self.model,
                                     query_strategy=self.query_strategy)

        query_i = 0
        train_idx = init_idx.copy()
        query_idx = train_idx
        self._logger.add_labels(self.y)

        while not self._stop_iter(query_i - 1, pool_idx):
            self._logger.add_training_log(query_idx, self.y[query_idx])

            # Get the training data.
            X_train, y_train = self.train_data(self.X, self.y, train_idx,
                                               **self.balance_kwargs)

            # Train the model on the training data.
            self.learner.teach(X=X_train, y=y_train, only_new=True,
                               **self.fit_kwargs)

            # Make a query from the pool.
            query_idx, _ = self.learner.query(
                X=self.X,
                pool_idx=pool_idx,
                n_instances=min(self.n_instances, len(pool_idx)),
                query_kwargs=self.query_kwargs)

            # Log the probabilities of samples in the pool being included.
            pred_proba = self.query_kwargs.get('pred_proba', [])
            if len(pred_proba) == 0:
                pred_proba = self.learner.predict_proba(self.X[pool_idx])
            self._logger.add_proba(pool_idx, pred_proba)

            # Log the probabilities of samples that were trained.
            pred_proba_train = self.learner.predict_proba(self.X[train_idx])
            self._logger.add_proba(train_idx, pred_proba_train,
                                   logname="train_proba")

            # Classify the queried papers.
            self.y[query_idx] = self._classify(query_idx)
            self._logger.add_labels(self.y)

            # Update training/pool indices
            train_idx = np.append(train_idx, query_idx)
            pool_idx = np.delete(np.arange(n_samples), train_idx, axis=0)

            # update the query counter
            query_i += 1

        # Save the result to a file
        if self.log_file:
            self.save_logs(self.log_file)
            if self.verbose:
                print(f"Saved results in log file: {self.log_file}")

    def save_logs(self, *args, **kwargs):
        """Save the logs to a file."""
        self._logger.save(*args, **kwargs)
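# --- Hypothetical concrete subclass of Review (a sketch, not from the
# original codebase): a simulated reviewer that answers each query from a
# known ground-truth label vector instead of asking a human. Assumes the
# keyword arguments include the `settings` dict (fit/balance/query kwargs)
# that Review.__init__ expects, plus a model and query strategy.
class SimulatedReview(Review):
    def __init__(self, X, y_true, **kwargs):
        super(SimulatedReview, self).__init__(X, y=np.copy(y_true), **kwargs)
        self.y_true = y_true

    def _prior_knowledge(self):
        # seed the learner with the explicitly included/excluded papers
        init_idx = np.array(self.prior_included + self.prior_excluded)
        return init_idx, self.y_true[init_idx]

    def _classify(self, ind):
        # oracle labelling: look the answers up instead of asking a reviewer
        return self.y_true[ind]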
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query, choose_query,
                   choose_answer, unlabel_query, unlabel_answer, rec_api_test, rec_api_choose,
                   rec_api_unlabel, w2v, idf):
    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer,
                                                      rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer,
                                                    rec_api_choose, w2v, idf)
    X_train, y_train = get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback,
        y_training=y_feedback
    )

    predict, sel_query, add_unlabel_feature = [], [], []
    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 100
        sel_idx, sel_label = [], []
        for _ in range(n_queries):
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            idx = int(query_idx / 10)
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )
            # add the queried instance into the feedback repository (FR)
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[idx*10:idx*10+10])
            choose_feature.extend(unlabel_feature[idx*10:idx*10+10])
            # remove the queried instance from the pool
            for i in range(10):
                X_train = np.delete(X_train, idx*10, axis=0)
                y_train = np.delete(y_train, idx*10)
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[idx*10:idx*10+10]
            del unlabel_feature[idx*10:idx*10+10]
            if len(X_train) == 0:
                break

    add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer,
                                                        rec_api_choose, w2v, idf)
    new_X_feedback, new_y_feedback = get_active_data(add_label_feedback_info, choose_feature)
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=new_X_feedback,
        y_training=new_y_feedback
    )

    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer,
                                              rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)
    X_test = np.array(X)

    # use the model retrained on the feedback data to predict the test data
    for query_idx in range(400):
        y_pre = learner.predict_proba(X=X_test[query_idx].reshape(1, -1))
        predict.append(float(y_pre[0, 1]))

    return predict, X, new_X_feedback, new_y_feedback
class BaseReview(ABC):
    """Base class for Systematic Review"""

    def __init__(self,
                 X,
                 y=None,
                 model=None,
                 query_strategy=max_sampling,
                 train_data_fn=full_sample,
                 n_instances=1,
                 n_queries=1,
                 prior_included=[],
                 prior_excluded=[],
                 log_file=None,
                 fit_kwargs={},
                 balance_kwargs={},
                 query_kwargs={},
                 logger=None,
                 verbose=1):
        super(BaseReview, self).__init__()

        self.X = X
        self.y = y
        if y is None:
            self.y = np.full(X.shape[0], NOT_AVAILABLE)

        # Default to Naive Bayes model
        if model is None:
            print("Warning: using naive Bayes model as default."
                  " If you experience bad performance, read the documentation"
                  " in order to implement a RNN based solution.")
            from asreview.models import create_nb_model
            model = create_nb_model()

        self.model = model
        self.query_strategy = query_strategy
        self.train_data = train_data_fn
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.log_file = log_file
        self.verbose = verbose

        self.prior_included = prior_included
        self.prior_excluded = prior_excluded

        self.fit_kwargs = fit_kwargs
        self.balance_kwargs = balance_kwargs
        self.query_kwargs = query_kwargs

        self.query_i = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False
        self.query_kwargs["src_query_idx"] = {}

        if logger is None:
            self._logger = Logger()
            self.start_from_logger = False
        else:
            self._logger = logger
            self._prepare_with_logger()
            self.start_from_logger = True

        # Initialize learner, but don't start training yet.
        self.learner = ActiveLearner(estimator=self.model,
                                     query_strategy=self.query_strategy)

    @classmethod
    def from_logger(cls, *args, **kwargs):
        reviewer = cls(*args, **kwargs)
        reviewer._prepare_with_logger()
        return reviewer

    @abstractmethod
    def _prior_knowledge(self):
        pass

    @abstractmethod
    def _get_labels(self, ind):
        """Classify the provided indices."""
        pass

    def _prior_teach(self):
        """Function called before training model."""
        pass

    def _stop_iter(self, query_i, n_pool):
        """Criteria for stopping iteration.

        Stop iterating if:
            - n_queries is reached
            - the pool is empty
        """
        stop_iter = False

        # if the pool is empty, always stop
        if n_pool == 0:
            stop_iter = True

        # don't stop if there is no stopping criteria
        if self.n_queries is not None and query_i >= self.n_queries:
            stop_iter = True

        return stop_iter

    def _prepare_with_logger(self):
        """
        If we start the reviewer from a log file, we need to do some
        preparation work. The final result should be a log dictionary in a
        state where the labeled papers are one step ahead of the
        probabilities. Any excess probabilities (pool_proba and train_proba)
        are thrown away and recomputed.

        Returns
        -------
        tuple:
            The query index, training indices and pool indices.
        """
        query_i = 0
        train_idx = []
        if "labels" in self._logger._log_dict:
            self.y = np.array(self._logger._log_dict["labels"])

        qk = query_key(query_i)

        # Capture the labelled indices from the log file.
        while qk in self._logger._log_dict:
            new_labels = self._logger._log_dict[qk]["labelled"]
            label_idx = [x[0] for x in new_labels]
            inclusions = [x[1] for x in new_labels]
            self.y[label_idx] = inclusions
            train_idx.extend(label_idx)
            query_i += 1
            qk = query_key(query_i)
        query_i -= 1

        # Throw away the last probabilities if they have the same key
        # as the query. These values should be overwritten, since we're
        # starting out by training the model again.
        if query_i >= 0:
            qk = query_key(query_i)
            self._logger._log_dict[qk].pop("pool_proba", None)
            self._logger._log_dict[qk].pop("train_proba", None)

        self.train_idx = np.array(train_idx, dtype=int)
        self.query_i = query_i
        self.query_kwargs["src_query_idx"] = self._logger.get_src_query_idx()

    def review(self, stop_after_class=True):
        """
        Do the systematic review, writing the results to the log file.
        """
        if not self.start_from_logger:
            # add prior knowledge
            init_idx, init_labels = self._prior_knowledge()
            self.y[init_idx] = init_labels

            self.query_i = 0
            self.train_idx = init_idx.copy()

            self._logger.add_labels(self.y)
            self.query_kwargs['last_bounds'] = [("random", 0, len(init_idx))]
            self.log_query(init_idx)

        # train the algorithm with prior knowledge
        self.train()
        if self.model_trained:
            self.log_probabilities()

        n_pool = self.X.shape[0] - len(self.train_idx)

        while not self._stop_iter(self.query_i - 1, n_pool):
            # STEP 1: Make a new query
            query_idx = self.query(n_instances=min(self.n_instances, n_pool))

            # STEP 2: Classify the queried papers.
            self.y[query_idx] = self._get_labels(query_idx)
            self._logger.add_labels(self.y)

            # STEP 3: Run inference (if necessary) and log the
            # probabilities of the model.
            self.train_idx = np.append(self.train_idx, query_idx)
            self.log_query(query_idx)

            # Option to stop after the classification step instead of training.
            if stop_after_class and self._stop_iter(self.query_i, n_pool):
                return

            # STEP 4: Train the algorithm with new data
            # Update the training data and pool afterwards
            self.train()
            if self.model_trained:
                self.log_probabilities()

            # STEP 5: Write all results to the logger
            # Update the query counter
            self.query_i += 1
            n_pool = self.X.shape[0] - len(self.train_idx)

        # Save the result to a file
        if self.log_file:
            self.save_logs(self.log_file)
            if self.verbose:
                print(f"Saved results in log file: {self.log_file}")

    def log_probabilities(self):
        pool_idx = get_pool_idx(self.X, self.train_idx)

        # Log the probabilities of samples in the pool being included.
        pred_proba = self.query_kwargs.get('pred_proba', np.array([]))
        if len(pred_proba) == 0:
            pred_proba = self.learner.predict_proba(self.X)
        self._logger.add_proba(pool_idx, pred_proba[pool_idx])

        # Log the probabilities of samples that were trained.
        self._logger.add_proba(self.train_idx, pred_proba[self.train_idx],
                               logname="train_proba")

    def log_query(self, query_idx):
        self._logger.add_training_log(query_idx, self.y[query_idx])
        self._logger.add_query_info(self.query_kwargs)

    def query(self, n_instances):
        """Query new results."""
        pool_idx = get_pool_idx(self.X, self.train_idx)
        n_instances = min(n_instances, len(pool_idx))

        if not self.model_trained:
            query_idx = pool_idx[np.random.choice(len(pool_idx), n_instances)]
        else:
            # Make a query from the pool.
            query_idx, _ = self.learner.query(X=self.X,
                                              pool_idx=pool_idx,
                                              n_instances=n_instances,
                                              query_kwargs=self.query_kwargs)
        return query_idx

    def classify(self, query_idx, inclusions):
        self.y[query_idx] = inclusions
        self.train_idx = np.unique(np.append(self.train_idx, query_idx))
        self._logger.add_training_log(query_idx, inclusions)

    def train(self):
        """Teach the algorithm with new data."""
        num_zero = np.count_nonzero(self.y == 0)
        num_one = np.count_nonzero(self.y == 1)
        if num_zero == 0 or num_one == 0:
            return

        # Get the training data.
        X_train, y_train = self.train_data(self.X, self.y, self.train_idx,
                                           **self.balance_kwargs)

        # Train the model on the training data.
        self.learner.teach(X=X_train, y=y_train, only_new=True,
                           **self.fit_kwargs)
        self.query_kwargs["pred_proba"] = self.learner.predict_proba(self.X)
        self.model_trained = True

    def save_logs(self, *args, **kwargs):
        """Save the logs to a file."""
        self._logger.save(*args, **kwargs)

    def to_pickle(self, pickle_fp):
        try:
            with open(pickle_fp, "wb") as fp:
                dill.dump(self, fp)
        except TypeError:
            # Keras models cannot be pickled directly: save the inner model
            # to a separate .h5 file and pickle the rest of the object.
            model_fp = os.path.splitext(pickle_fp)[0] + ".h5"
            self.model.model.save(model_fp)
            current_model = self.model.__dict__.pop("model", None)
            with open(pickle_fp, "wb") as fp:
                dill.dump(self, fp)
            setattr(self.model, "model", current_model)

    @classmethod
    def from_pickle(cls, pickle_fp):
        with open(pickle_fp, "rb") as fp:
            my_instance = dill.load(fp)
        try:
            model_fp = os.path.splitext(pickle_fp)[0] + ".h5"
            current_model = load_model(model_fp)
            setattr(my_instance.model, "model", current_model)
        except BaseException:
            pass
        return my_instance
# assembling initial training set
n_initial = 5
initial_idx = np.random.choice(range(len(X_full)), size=n_initial, replace=False)
X_train, y_train = X_full[initial_idx], y_full[initial_idx]

# initialize the learner
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train,
    y_training=y_train
)
print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))

# visualizing the initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict_proba(X_full)[:, 1]
    plt.imshow(prediction.reshape(im_width, im_height))
    plt.title('Initial prediction accuracy: %f' % learner.score(X_full, y_full))
    plt.show()

"""
The instances are randomly selected one by one. If an instance's uncertainty
is above a threshold, the label is requested and shown to the learner. The
process continues until the learner reaches a previously defined accuracy.
"""

# learning until the accuracy reaches a given threshold
while learner.score(X_full, y_full) < 0.90:
    stream_idx = np.random.choice(range(len(X_full)))
    if classifier_uncertainty(learner, X_full[stream_idx].reshape(1, -1)) >= 0.4:
        learner.teach(X_full[stream_idx].reshape(1, -1), y_full[stream_idx].reshape(-1, ))
# map the intensity values against the grid
y_full = np.asarray([data[P[0], P[1]] for P in X_full])
X_pool = deepcopy(X_full)
y_pool = deepcopy(y_full)

# assembling initial training set
initial_idx = [0, im_height-1, im_height*(im_height-1), -1,
               im_width//2 + im_height//2*im_height]
X_train, y_train = X_pool[initial_idx], y_pool[initial_idx]

# create an ActiveLearner instance
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train,
    y_training=y_train
)
initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width)

n_queries = 100
for round_idx in range(n_queries):
    query_idx, query_inst = learner.query(X_pool)
    learner.teach(X_pool[query_idx].reshape(1, -1), y_pool[query_idx].reshape(-1, ))
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)

final_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width)

# learning with randomly selected queries instead of active learning
random_idx = initial_idx + list(np.random.choice(range(len(X_full)), n_queries, replace=False))
X_train, y_train = X_full[initial_idx], y_full[initial_idx]

random_learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train,
    y_training=y_train
)
for index in range(N_QUERIES):
    query_index, query_instance = learner.query(X_pool)

    X, y = X_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
    learner.teach(X=X, y=y)

    X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index)

    model_accuracy = learner.score(X_raw, y_raw)
    print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy))

    performance_history.append(model_accuracy)
    q_index.append(int(query_index))

result = []
for i in range(len(X_pool)):
    data_for_prediction_array = X_pool[i].reshape(1, -1)
    result.append(learner.predict_proba(data_for_prediction_array))

# split each prediction into the larger (con) and smaller (con_1) class probability
con = []
con_1 = []
for i in range(len(result)):
    if result[i][0][0] > result[i][0][1]:
        con.append(result[i][0][0])
        con_1.append(result[i][0][1])
    else:
        con.append(result[i][0][1])
        con_1.append(result[i][0][0])

all = [i for i in list(all) if i not in list(training_indices)]
for i in q_index:
    del all[i]

data = {
    'Attack-Stage': df.iloc[all, 0],
    'Port-Service': df.iloc[all, 1],
class BaseReview(ABC):
    """Base class for Systematic Review"""

    def __init__(self,
                 X,
                 y=None,
                 model=None,
                 query_strategy=max_sampling,
                 train_data_fn=full_sample,
                 n_papers=None,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_queries=None,
                 prior_included=[],
                 prior_excluded=[],
                 log_file=None,
                 fit_kwargs={},
                 balance_kwargs={},
                 query_kwargs={},
                 logger=None,
                 verbose=1):
        super(BaseReview, self).__init__()

        self.X = X
        self.y = y
        if y is None:
            self.y = np.full(X.shape[0], NOT_AVAILABLE)
        self.y = np.array(self.y, dtype=int)

        # Default to Naive Bayes model
        if model is None:
            print("Warning: using naive Bayes model as default."
                  " If you experience bad performance, read the documentation"
                  " in order to implement a RNN based solution.")
            from asreview.models import create_nb_model
            model = create_nb_model()

        self.model = model
        self.query_strategy = query_strategy
        self.train_data = train_data_fn
        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.log_file = log_file
        self.verbose = verbose

        self.prior_included = prior_included
        self.prior_excluded = prior_excluded

        self.fit_kwargs = fit_kwargs
        self.balance_kwargs = balance_kwargs
        self.query_kwargs = query_kwargs

        self.query_i = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False

        self.query_kwargs["query_src"] = {}
        self.query_kwargs["current_queries"] = {}

        if logger is None:
            self._logger = Logger()
            self.start_from_logger = False
        else:
            self._logger = logger
            self._prepare_with_logger()
            self.start_from_logger = True

        # Initialize learner, but don't start training yet.
        self.learner = ActiveLearner(estimator=self.model,
                                     query_strategy=self.query_strategy)

        if not self.start_from_logger:
            # add prior knowledge
            init_idx, init_labels = self._prior_knowledge()
            self.query_i = 0
            self.train_idx = np.array([], dtype=int)
            self.classify(init_idx, init_labels, method="initial")

    @classmethod
    def from_logger(cls, *args, **kwargs):
        reviewer = cls(*args, **kwargs)
        reviewer._prepare_with_logger()
        return reviewer

    @abstractmethod
    def _prior_knowledge(self):
        pass

    @abstractmethod
    def _get_labels(self, ind):
        """Classify the provided indices."""
        pass

    def _prior_teach(self):
        """Function called before training model."""
        pass

    def _stop_iter(self, query_i, n_pool):
        """Criteria for stopping iteration.

        Stop iterating if:
            - n_queries is reached
            - the pool is empty
        """
        stop_iter = False
        n_train = self.X.shape[0] - n_pool

        # if the pool is empty, always stop
        if n_pool == 0:
            stop_iter = True

        # If we are exceeding the number of papers, stop.
        if self.n_papers is not None and n_train >= self.n_papers:
            stop_iter = True

        # don't stop if there is no stopping criteria
        if self.n_queries is not None and query_i >= self.n_queries:
            stop_iter = True

        return stop_iter

    def n_pool(self):
        return self.X.shape[0] - len(self.train_idx)

    def _next_n_instances(self):
        # Could be merged with _stop_iter someday.
        """ Get the batch size for the next query. """
        n_instances = self.n_instances
        n_pool = self.n_pool()

        n_instances = min(n_instances, n_pool)
        if self.n_papers is not None:
            papers_left = self.n_papers - len(self.train_idx)
            n_instances = min(n_instances, papers_left)
        return n_instances

    def _prepare_with_logger(self):
        """
        If we start the reviewer from a log file, we need to do some
        preparation work. The final result should be a log dictionary in a
        state where the labeled papers are one step ahead of the
        probabilities. Any excess probabilities (pool_proba and train_proba)
        are thrown away and recomputed.

        Returns
        -------
        tuple:
            The query index, training indices and pool indices.
        """
        query_i = 0
        train_idx = []
        if "labels" in self._logger._log_dict:
            self.y = np.array(self._logger._log_dict["labels"], dtype=int)
        qk = query_key(query_i)

        # Capture the labelled indices from the log file.
        while qk in self._logger._log_dict:
            if "labelled" not in self._logger._log_dict[qk]:
                query_i += 1
                qk = query_key(query_i)
                continue
            new_labels = self._logger._log_dict[qk]["labelled"]
            label_methods = self._logger._log_dict[qk]["label_methods"]
            label_idx = [x[0] for x in new_labels]
            inclusions = [x[1] for x in new_labels]

            self.y[label_idx] = inclusions
            train_idx.extend(label_idx)

            # Update the internal query sources.
            start_idx = 0
            for method in label_methods:
                if method[0] not in self.query_kwargs["query_src"]:
                    self.query_kwargs["query_src"][method[0]] = []
                self.query_kwargs["query_src"][method[0]].extend(
                    label_idx[start_idx:start_idx + method[1]])
                start_idx += method[1]

            query_i += 1
            qk = query_key(query_i)
        query_i -= 1

        if query_i > 0:
            qk = query_key(query_i)
            if "labelled" not in self._logger._log_dict[qk]:
                query_i -= 1

        self.train_idx = np.array(train_idx, dtype=int)
        self.query_i = query_i

    def review(self, stop_after_class=True, instant_save=False):
        """
        Do the systematic review, writing the results to the log file.
        """
        if self._stop_iter(self.query_i, self.n_pool()):
            return

        # train the algorithm with prior knowledge
        self.train()
        if self.model_trained:
            self.log_probabilities()
        if self.log_file:
            self.save_logs(self.log_file)

        n_pool = self.X.shape[0] - len(self.train_idx)

        while not self._stop_iter(self.query_i - 1, n_pool):
            # STEP 1: Make a new query
            query_idx = self.query(n_instances=self._next_n_instances())

            # STEP 2: Classify the queried papers.
            if instant_save:
                for idx in query_idx:
                    idx_array = np.array([idx], dtype=int)
                    self.classify(idx_array, self._get_labels(idx_array))
            else:
                self.classify(query_idx, self._get_labels(query_idx))

            # Option to stop after the classification step instead of training.
            if stop_after_class and self._stop_iter(self.query_i, self.n_pool()):
                if self.log_file:
                    self.save_logs(self.log_file)
                    if self.verbose:
                        print(f"Saved results in log file: {self.log_file}")
                return

            # STEP 3: Train the algorithm with new data
            # Update the training data and pool afterwards
            self.train()
            if self.model_trained:
                self.log_probabilities()

            # STEP 4: Save the logs.
            if self.log_file:
                self.save_logs(self.log_file)
                if self.verbose:
                    print(f"Saved results in log file: {self.log_file}")

    def log_probabilities(self):
        """ Store the modeling probabilities of the training indices
        and pool indices. """
        pool_idx = get_pool_idx(self.X, self.train_idx)

        # Log the probabilities of samples in the pool being included.
        pred_proba = self.query_kwargs.get('pred_proba', np.array([]))
        if len(pred_proba) == 0:
            pred_proba = self.learner.predict_proba(self.X)
        self._logger.add_proba(pool_idx, pred_proba[pool_idx],
                               logname="pool_proba", i=self.query_i)

        # Log the probabilities of samples that were trained.
        self._logger.add_proba(self.train_idx, pred_proba[self.train_idx],
                               logname="train_proba", i=self.query_i)

    def query(self, n_instances):
        """Query new results."""
        pool_idx = get_pool_idx(self.X, self.train_idx)
        n_instances = min(n_instances, len(pool_idx))

        # If the model is not trained, choose random papers.
        if not self.model_trained:
            query_idx, _ = random_sampling(None,
                                           X=self.X,
                                           pool_idx=pool_idx,
                                           n_instances=n_instances,
                                           query_kwargs=self.query_kwargs)
        else:
            # Make a query from the pool.
            query_idx, _ = self.learner.query(X=self.X,
                                              pool_idx=pool_idx,
                                              n_instances=n_instances,
                                              query_kwargs=self.query_kwargs)
        return query_idx

    def classify(self, query_idx, inclusions, method=None):
        """ Classify new papers and update the training indices. """
        query_idx = np.array(query_idx, dtype=int)
        self.y[query_idx] = inclusions
        query_idx = query_idx[np.isin(query_idx, self.train_idx, invert=True)]
        self.train_idx = np.append(self.train_idx, query_idx)
        if method is None:
            methods = []
            for idx in query_idx:
                method = self.query_kwargs["current_queries"].pop(idx, None)
                if method is None:
                    method = "unknown"
                methods.append([idx, method])
                if method in self.query_kwargs["query_src"]:
                    self.query_kwargs["query_src"][method].append(idx)
                else:
                    self.query_kwargs["query_src"][method] = [idx]
        else:
            methods = [[idx, method] for idx in query_idx]
            if method in self.query_kwargs["query_src"]:
                self.query_kwargs["query_src"][method].extend(
                    query_idx.tolist())
            else:
                self.query_kwargs["query_src"][method] = query_idx.tolist()

        self._logger.add_classification(query_idx, inclusions,
                                        methods=methods, i=self.query_i)
        self._logger.add_labels(self.y)

    def train(self):
        """ Train the model. """
        num_zero = np.count_nonzero(self.y[self.train_idx] == 0)
        num_one = np.count_nonzero(self.y[self.train_idx] == 1)
        if num_zero == 0 or num_one == 0:
            return

        # Get the training data.
        X_train, y_train = self.train_data(self.X, self.y, self.train_idx,
                                           **self.balance_kwargs)

        # Train the model on the training data.
        self.learner.teach(X=X_train, y=y_train, only_new=True,
                           **self.fit_kwargs)
        self.query_kwargs["pred_proba"] = self.learner.predict_proba(self.X)
        self.model_trained = True
        self.query_i += 1

    def statistics(self):
        n_initial = 0
        try:
            initial_meth = self._logger._log_dict["0"]["label_methods"][0]
            if initial_meth[0] == "initial":
                n_initial = initial_meth[1]
        except (IndexError, KeyError):
            pass

        try:
            if np.count_nonzero(self.y[self.train_idx[n_initial:]] == 1) == 0:
                last_inclusion = len(self.train_idx[n_initial:])
            else:
                last_inclusion = np.nonzero(
                    self.y[self.train_idx[n_initial:]][::-1] == 1)[0][0]
        except ValueError:
            last_inclusion = 0

        stats = {
            "n_included": np.count_nonzero(self.y[self.train_idx] == 1),
            "n_excluded": np.count_nonzero(self.y[self.train_idx] == 0),
            "n_papers": len(self.y),
            "n_reviewed": len(self.train_idx),
            "n_pool": self.n_pool(),
            "last_inclusion": last_inclusion,
            "n_initial": n_initial,
        }
        return stats

    def save_logs(self, *args, **kwargs):
        """Save the logs to a file."""
        self._logger.save(*args, **kwargs)

    def save(self, pickle_fp):
        """
        Dump the self object to a pickle file (using dill). Keras models
        cannot be dumped, so they are written to a separate h5 file. The
        model is briefly popped out of the object to allow the rest to be
        written to a file. Do not rely on this method for long term storage
        of the class, since library changes could easily break it. In those
        cases, use the log + h5 file instead.
        """
        if isinstance(self.model, KerasClassifier) and self.model_trained:
            model_fp = os.path.splitext(pickle_fp)[0] + ".h5"
            self.model.model.save(model_fp)
            current_model = self.model.__dict__.pop("model", None)
            with open(pickle_fp, "wb") as fp:
                dill.dump(self, fp)
            setattr(self.model, "model", current_model)
        else:
            with open(pickle_fp, "wb") as fp:
                dill.dump(self, fp)

    @classmethod
    def load(cls, pickle_fp):
        """
        Create a BaseReview object from a pickle file, and optional h5 file.
        """
        with open(pickle_fp, "rb") as fp:
            my_instance = dill.load(fp)
        try:
            model_fp = os.path.splitext(pickle_fp)[0] + ".h5"
            current_model = load_model(model_fp)
            setattr(my_instance.model, "model", current_model)
        except Exception:
            pass
        return my_instance
x_seed_vec = vec.transform(x_seed)
x_pool_vec = vec.transform(x_pool)

# Build learner
learner = ActiveLearner(estimator=LogisticRegression(C=10, solver='lbfgs'),
                        X_training=x_seed_vec,
                        y_training=np.array(y_seed).reshape(len(y_seed), -1))

for n in range(5):
    query_idx, query_inst = learner.query(x_pool_vec)
    recipe = x_pool[query_idx].index.values[0]
    print(recipes[recipe]['recipe'])
    print('0/1?')
    response = int(input())
    print('\n')
    learner.teach(x_pool_vec[query_idx].reshape(1, -1), np.array(response).reshape(1, -1))

# Return recommended recipe
pred = learner.predict_proba(x_pool_vec)[:, 1]
max_pred = np.argmax(pred)
rec = x_pool.index[max_pred]
print('Your recommended recipe is:')
print(recipes[rec]['recipe'])