def __init__(self, verbose=False): self.verbose = verbose self.uncertainty_sampling = UncertaintySampling(self.verbose) self.diversity_sampling = DiversitySampling(self.verbose)
print( "Retraining model for Model-based Outliers or Deep Active Transfer Learning \n" ) # Need to split our training data to make a leave-out validation set: new_training_data = training_data[:int(len(training_data) * 0.9)] new_validation_data = training_data[len(new_training_data):] vocab_size = create_features() model_path = train_model(new_training_data, evaluation_data=evaluation_data, vocab_size=vocab_size) validation_model = SimpleTextClassifier(2, vocab_size) validation_model.load_state_dict(torch.load(model_path)) uncert_sampling = UncertaintySampling(verbose) diversity_samp = DiversitySampling(verbose) adv_samping = AdvancedActiveLearning(verbose) if number_cluster_based + number_representative + number_adaptive_representative + number_model_outliers > 0: print("Sampling for Diversity") # GET MODEL-BASED OUTLIER SAMPLES if number_model_outliers > 0: print("Sampling " + str(number_model_outliers) + " Model Outliers\n") sampled_data += diversity_samp.get_model_outliers( validation_model, data, new_validation_data,
class AdvancedActiveLearning(): def __init__(self, verbose=False): self.verbose = verbose self.uncertainty_sampling = UncertaintySampling(self.verbose) self.diversity_sampling = DiversitySampling(self.verbose) def get_clustered_uncertainty_samples(self, model, unlabeled_data, method, feature_method, perc_uncertain=0.1, num_clusters=20, max_epochs=10, limit=10000): """Gets the most uncertain items and then clusters the, sampling from each cluster Keyword arguments: model -- machine learning model to get predictions from to determine uncertainty unlabeled_data -- data that does not yet have a label method -- method for uncertainty sampling (eg: least_confidence()) feature_method -- the method for extracting features from your data perc_uncertain -- percentage of items through uncertainty sampling to cluster num_clusters -- the number of clusters to create max_epochs -- maximum number of epochs to create clusters limit -- sample from only this many predictions for faster sampling (-1 = no limit) """ if limit > 0: shuffle(unlabeled_data) unlabeled_data = unlabeled_data[:limit] uncertain_count = math.ceil(len(unlabeled_data) * perc_uncertain) uncertain_samples = self.uncertainty_sampling.get_samples( model, unlabeled_data, method, feature_method, uncertain_count, limit=limit) samples = self.diversity_sampling.get_cluster_samples( uncertain_samples, num_clusters=num_clusters) for item in samples: item[3] = method.__name__ + "_" + item[3] return samples def get_uncertain_model_outlier_samples(self, model, outlier_model, unlabeled_data, validation_data, method, feature_method, perc_uncertain=0.1, number=10, limit=10000): """Gets the most uncertain items and samples the biggest model outliers among them Keyword arguments: model -- machine learning model to get predictions from to determine uncertainty outlier_model -- machine learning model for outlier prediction validation_data -- data not used for the outlier_model but from the same distribution unlabeled_data -- data that does not yet have a label method -- method for uncertainty sampling (eg: least_confidence()) feature_method -- the method for extracting features from your data perc_uncertain -- percentage of items through uncertainty sampling to cluster number -- the final number of items to sample limit -- sample from only this many predictions for faster sampling (-1 = no limit) """ if limit > 0: shuffle(unlabeled_data) unlabeled_data = unlabeled_data[:limit] uncertain_count = math.ceil(len(unlabeled_data) * perc_uncertain) uncertain_samples = self.uncertainty_sampling.get_samples( model, unlabeled_data, method, feature_method, uncertain_count, limit=limit) samples = self.diversity_sampling.get_model_outliers(outlier_model, uncertain_samples, validation_data, feature_method, number=number, limit=limit) for item in samples: item[3] = method.__name__ + "_" + item[3] return samples def get_representative_cluster_samples(self, training_data, unlabeled_data, number=10, num_clusters=20, max_epochs=10, limit=10000): """Gets the most representative unlabeled items, compared to training data, across multiple clusters Keyword arguments: training_data -- data with a label, that the current model is trained on unlabeled_data -- data that does not yet have a label number -- number of items to sample limit -- sample from only this many items for faster sampling (-1 = no limit) num_clusters -- the number of clusters to create max_epochs -- maximum number of epochs to create clusters """ if limit > 0: shuffle(training_data) training_data = training_data[:limit] shuffle(unlabeled_data) unlabeled_data = unlabeled_data[:limit] # Create clusters for training data training_clusters = CosineClusters(num_clusters) training_clusters.add_random_training_items(training_data) for i in range(0, max_epochs): print("Epoch " + str(i)) added = training_clusters.add_items_to_best_cluster(training_data) if added == 0: break # Create clusters for unlabeled data unlabeled_clusters = CosineClusters(num_clusters) unlabeled_clusters.add_random_training_items(unlabeled_data) for i in range(0, max_epochs): print("Epoch " + str(i)) added = unlabeled_clusters.add_items_to_best_cluster( unlabeled_data) if added == 0: break # get scores most_representative_items = [] # for each cluster of unlabeled data for cluster in unlabeled_clusters.clusters: most_representative = None representativeness = float("-inf") # find the item in that cluster most like the unlabeled data item_keys = list(cluster.members.keys()) for key in item_keys: item = cluster.members[key] _r, unlabeled_score = unlabeled_clusters.get_best_cluster(item) _, training_score = training_clusters.get_best_cluster(item) cluster_representativeness = unlabeled_score - training_score if cluster_representativeness > representativeness: representativeness = cluster_representativeness most_representative = item most_representative[3] = "representative_clusters" most_representative[4] = representativeness most_representative_items.append(most_representative) most_representative_items.sort(reverse=True, key=lambda x: x[4]) return most_representative_items[:number:] def get_high_uncertainty_cluster(self, model, unlabeled_data, method, feature_method, number=10, num_clusters=20, max_epochs=10, limit=10000): """Gets items from the cluster with the highest average uncertainty Keyword arguments: model -- machine learning model to get predictions from to determine uncertainty unlabeled_data -- data that does not yet have a label method -- method for uncertainty sampling (eg: least_confidence()) feature_method -- the method for extracting features from your data number -- number of items to sample num_clusters -- the number of clusters to create max_epochs -- maximum number of epochs to create clusters limit -- sample from only this many items for faster sampling (-1 = no limit) """ if limit > 0: shuffle(unlabeled_data) unlabeled_data = unlabeled_data[:limit] unlabeled_clusters = CosineClusters(num_clusters) unlabeled_clusters.add_random_training_items(unlabeled_data) for i in range(0, max_epochs): print("Epoch " + str(i)) added = unlabeled_clusters.add_items_to_best_cluster( unlabeled_data) if added == 0: break # get scores most_uncertain_cluster = None highest_average_uncertainty = 0.0 # for each cluster of unlabeled data for cluster in unlabeled_clusters.clusters: total_uncertainty = 0.0 count = 0 item_keys = list(cluster.members.keys()) for key in item_keys: item = cluster.members[key] text = item[1] feature_vector = feature_method(text) hidden, logits, log_probs = model(feature_vector, return_all_layers=True) prob_dist = torch.exp( log_probs ) # the probability distribution of our prediction score = method( prob_dist.data[0] ) # get the specific type of uncertainty sampling total_uncertainty += 1.0 count += 1 average_uncertainty = total_uncertainty / count if average_uncertainty > highest_average_uncertainty: highest_average_uncertainty = average_uncertainty most_uncertain_cluster = cluster samples = most_uncertain_cluster.get_random_members(number) return samples def get_deep_active_transfer_learning_uncertainty_samples( self, model, unlabeled_data, validation_data, feature_method, number=100, limit=10000, epochs=10, select_per_epoch=100): """Uses transfer learning to predict uncertainty within the model Keyword arguments: model -- machine learning model to get predictions from to determine uncertainty unlabeled_data -- data that does not yet have a label validation_data -- data with a label that is not in the training set, to be used for transfer learning feature_method -- the method for extracting features from your data number -- number of items to sample epochs -- number of epochs to train transfer-learning model select_per_epoch -- number of items to train on per epoch of training limit -- sample from only this many items for faster sampling (-1 = no limit) """ correct_predictions = [] # validation items predicted correctly incorrect_predictions = [] # validation items predicted incorrectly item_hidden_layers = {} # hidden layer of each item, by id # 1 GET PREDICTIONS ON VALIDATION DATA FROM MODEL for item in validation_data: id = item[0] text = item[1] label = item[2] feature_vector = feature_method(text) hidden, logits, log_probs = model(feature_vector, return_all_layers=True) item_hidden_layers[id] = hidden prob_dist = torch.exp( log_probs) # the probability distribution of our prediction # get confidence that item is disaster-related prob_related = math.exp(log_probs.data.tolist()[0][1]) if item[3] == "seen": correct_predictions.append(item) elif (label == "1" and prob_related > 0.5) or (label == "0" and prob_related <= 0.5): correct_predictions.append(item) else: incorrect_predictions.append(item) # item.append(hidden) # the hidden layer will be the input to our new model # 2 BUILD A NEW MODEL TO PREDICT WHETHER VALIDATION ITEMS WERE CORRECT OR INCORRECT correct_model = SimpleUncertaintyPredictor(128) loss_function = nn.NLLLoss() optimizer = optim.SGD(correct_model.parameters(), lr=0.01) # print(correct_predictions) for epoch in range(epochs): if self.verbose: print("Epoch: " + str(epoch)) current = 0 # make a subset of data to use in this epoch # with an equal number of items from each label shuffle(correct_predictions ) # randomize the order of the validation data shuffle(incorrect_predictions ) # randomize the order of the validation data correct_ids = {} for item in correct_predictions: correct_ids[item[0]] = True epoch_data = correct_predictions[:select_per_epoch] epoch_data += incorrect_predictions[:select_per_epoch] shuffle(epoch_data) # train the final layers model for item in epoch_data: id = item[0] text = item[1] label = 0 if id in correct_ids: label = 1 correct_model.zero_grad() # print(item) feature_vec = item_hidden_layers[id] target = torch.LongTensor([label]) log_probs = correct_model(feature_vec) # compute loss function, do backward pass, and update the gradient loss = loss_function(log_probs, target) loss.backward(retain_graph=True) optimizer.step() # 3 PREDICT WHETHER UNLABELED ITEMS ARE CORRECT if limit > 0: shuffle(unlabeled_data) unlabeled_data = unlabeled_data[:limit] deep_active_transfer_preds = [] with torch.no_grad(): v = 0 for item in unlabeled_data: text = item[1] # get prediction from main model feature_vector = feature_method(text) hidden, logits, log_probs = model(feature_vector, return_all_layers=True) # use hidden layer from main model as input to model predicting correct/errors logits, log_probs = correct_model(hidden, return_all_layers=True) prob_dist = torch.exp( log_probs ) # the probability distribution of our prediction # get confidence that item is correctly labeled prob_correct = 1 - math.exp(log_probs.data.tolist()[0][1]) if (label == "0"): prob_correct = 1 - prob_correct item[3] = "predicted_error" item[4] = 1 - prob_correct deep_active_transfer_preds.append(item) deep_active_transfer_preds.sort(reverse=True, key=lambda x: x[4]) return deep_active_transfer_preds[:number:] def get_atlas_samples(self, model, unlabeled_data, validation_data, feature_method, number=100, limit=10000, number_per_iteration=10, epochs=10, select_per_epoch=100): """Uses transfer learning to predict uncertainty within the model Keyword arguments: model -- machine learning model to get predictions from to determine uncertainty unlabeled_data -- data that does not yet have a label validation_data -- data with a label that is not in the training set, to be used for transfer learning feature_method -- the method for extracting features from your data number -- number of items to sample number_per_iteration -- number of items to sample per iteration limit -- sample from only this many items for faster sampling (-1 = no limit) """ if (len(unlabeled_data) < number): raise Exception( 'More samples requested than the number of unlabeled items') atlas_samples = [] # all items sampled by atlas print(number) while (len(atlas_samples) < number): samples = self.get_deep_active_transfer_learning_uncertainty_samples( model, unlabeled_data, validation_data, feature_method, number_per_iteration, limit, epochs, select_per_epoch) for item in samples: atlas_samples.append(item) unlabeled_data.remove(item) item = copy.deepcopy(item) item[3] = "seen" # mark this item as already seen validation_data.append( item) # append so that it is in the next iteration print("DONE!") return atlas_samples