def get_cluster_samples(self, data, num_clusters=5, max_epochs=5, limit=5000):
    """Cluster *data* with cosine-similarity K-Means and sample from the clusters.

    Keyword arguments:
        data -- items to be clustered
        num_clusters -- the number of clusters to create
        max_epochs -- maximum number of passes over the data while clustering
        limit -- sample only this many items for faster clustering (-1 = no limit)

    Uses the K-Means clustering algorithm with cosine similarity in place of
    the more common Euclidean distance, iterating until assignments converge
    or max_epochs passes complete.  Returns the concatenation of the cluster
    centroids, the outliers, and 3 random members per cluster.
    """
    # Optionally down-sample (after shuffling) to keep clustering fast.
    if limit > 0:
        shuffle(data)
        data = data[:limit]

    cosine_clusters = CosineClusters(num_clusters)
    cosine_clusters.add_random_training_items(data)

    # Reassign items each epoch; zero moves means convergence.
    for epoch in range(max_epochs):
        print("Epoch " + str(epoch))
        if cosine_clusters.add_items_to_best_cluster(data) == 0:
            break

    centroids = cosine_clusters.get_centroids()
    outliers = cosine_clusters.get_outliers()
    randoms = cosine_clusters.get_randoms(3, self.verbose)

    return centroids + outliers + randoms
def get_high_uncertainty_cluster(self, model, unlabeled_data, method, feature_method, number=10, num_clusters=20, max_epochs=10, limit=10000):
    """Get items from the cluster with the highest average uncertainty.

    Keyword arguments:
        model -- machine learning model to get predictions from to determine uncertainty
        unlabeled_data -- data that does not yet have a label
        method -- method for uncertainty sampling (eg: least_confidence())
        feature_method -- the method for extracting features from your data
        number -- number of items to sample
        num_clusters -- the number of clusters to create
        max_epochs -- maximum number of epochs to create clusters
        limit -- sample from only this many items for faster sampling (-1 = no limit)

    Returns `number` random members of the cluster whose mean uncertainty
    score (per `method`) is highest.
    """
    # Optionally down-sample (after shuffling) to keep clustering fast.
    if limit > 0:
        shuffle(unlabeled_data)
        unlabeled_data = unlabeled_data[:limit]

    unlabeled_clusters = CosineClusters(num_clusters)
    unlabeled_clusters.add_random_training_items(unlabeled_data)

    # Reassign items each epoch; zero moves means convergence.
    for i in range(max_epochs):
        print("Epoch " + str(i))
        added = unlabeled_clusters.add_items_to_best_cluster(unlabeled_data)
        if added == 0:
            break

    most_uncertain_cluster = None
    highest_average_uncertainty = 0.0

    # Score every cluster of unlabeled data by its mean uncertainty.
    for cluster in unlabeled_clusters.clusters:
        total_uncertainty = 0.0
        count = 0

        for key in list(cluster.members.keys()):
            item = cluster.members[key]
            text = item[1]  # each item is a row; index 1 holds the text

            feature_vector = feature_method(text)
            hidden, logits, log_probs = model(feature_vector, return_all_layers=True)

            # The probability distribution of our prediction.
            prob_dist = torch.exp(log_probs)
            # Apply the chosen type of uncertainty sampling.
            score = method(prob_dist.data[0])

            # BUG FIX: accumulate the actual uncertainty score. The original
            # code did `total_uncertainty += 1.0`, which made every non-empty
            # cluster's average exactly 1.0, so the "most uncertain" cluster
            # was simply the first non-empty one.
            total_uncertainty += score
            count += 1

        # Skip empty clusters; dividing by count would raise ZeroDivisionError.
        if count == 0:
            continue

        average_uncertainty = total_uncertainty / count
        if average_uncertainty > highest_average_uncertainty:
            highest_average_uncertainty = average_uncertainty
            most_uncertain_cluster = cluster

    samples = most_uncertain_cluster.get_random_members(number)
    return samples
def get_representative_cluster_samples(self, training_data, unlabeled_data, number=10, num_clusters=20, max_epochs=10, limit=10000):
    """Get the most representative unlabeled items, compared to training data, across multiple clusters.

    Keyword arguments:
        training_data -- data with a label, that the current model is trained on
        unlabeled_data -- data that does not yet have a label
        number -- number of items to sample
        num_clusters -- the number of clusters to create
        max_epochs -- maximum number of epochs to create clusters
        limit -- sample from only this many items for faster sampling (-1 = no limit)

    For each cluster of unlabeled data, finds the member whose
    (unlabeled-fit minus training-fit) score is largest, then returns the
    top `number` such items sorted by that representativeness score.
    """
    # Optionally down-sample both pools (after shuffling) for speed.
    if limit > 0:
        shuffle(training_data)
        training_data = training_data[:limit]
        shuffle(unlabeled_data)
        unlabeled_data = unlabeled_data[:limit]

    # Create clusters for training data.
    training_clusters = CosineClusters(num_clusters)
    training_clusters.add_random_training_items(training_data)

    for i in range(max_epochs):
        print("Epoch " + str(i))
        added = training_clusters.add_items_to_best_cluster(training_data)
        if added == 0:
            break

    # Create clusters for unlabeled data.
    unlabeled_clusters = CosineClusters(num_clusters)
    unlabeled_clusters.add_random_training_items(unlabeled_data)

    for i in range(max_epochs):
        print("Epoch " + str(i))
        added = unlabeled_clusters.add_items_to_best_cluster(unlabeled_data)
        if added == 0:
            break

    most_representative_items = []

    # For each cluster of unlabeled data, pick its most representative member.
    for cluster in unlabeled_clusters.clusters:
        most_representative = None
        representativeness = float("-inf")

        for key in list(cluster.members.keys()):
            item = cluster.members[key]

            # Representativeness = fit to the unlabeled pool minus fit to
            # the training pool: high means "common in unlabeled data but
            # unlike anything we already trained on".
            _, unlabeled_score = unlabeled_clusters.get_best_cluster(item)
            _, training_score = training_clusters.get_best_cluster(item)
            cluster_representativeness = unlabeled_score - training_score

            if cluster_representativeness > representativeness:
                representativeness = cluster_representativeness
                most_representative = item

        # BUG FIX: an empty cluster leaves most_representative as None;
        # the original then crashed on `most_representative[3] = ...`.
        if most_representative is None:
            continue

        # Annotate the item in place with the sampling strategy and score.
        most_representative[3] = "representative_clusters"
        most_representative[4] = representativeness
        most_representative_items.append(most_representative)

    most_representative_items.sort(reverse=True, key=lambda x: x[4])
    return most_representative_items[:number]