def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Score whitened sentence-pair embeddings against ``self.scores``.

    Both sentence lists are encoded as tensors, stacked, passed through
    ``whitening_torch_final`` and split back into the two sides. When
    ``self.summary_path`` is truthy, the first 200 raw and whitened
    embeddings of each side are sent to ``self.writer.add_embedding``.
    The embeddings are then cut to ``self.measure_data_num`` rows and
    ``self.embed_dim`` columns.

    If ``self.intra_diversity`` is truthy, only
    ``self.compute_intra_diversity`` is evaluated and returned. Otherwise
    Pearson and Spearman correlations between the labels and the cosine,
    negated Manhattan, negated Euclidean and dot-product scores are
    computed, optionally appended to a CSV file, and the Spearman value
    selected by ``self.main_similarity`` is returned.

    Args:
        model: Object exposing ``encode(sentences, batch_size=...,
            show_progress_bar=..., convert_to_tensor=True)``.
        output_path: Directory for ``self.csv_file``; ``None`` skips the CSV.
        epoch: Epoch number used in log text and CSV row (-1 = not given).
        steps: Step number used in log text and CSV row (-1 = not given).

    Returns:
        The intra-diversity value, or the selected Spearman correlation
        (the maximum of the four when ``self.main_similarity`` is ``None``).

    Raises:
        ValueError: If ``self.main_similarity`` is not a handled value.
    """
    if epoch == -1:
        out_txt = ":"
    elif steps == -1:
        out_txt = " after epoch {}:".format(epoch)
    else:
        out_txt = " in epoch {} after {} steps:".format(epoch, steps)

    embeddings1 = model.encode(self.sentences1,
                               batch_size=self.batch_size,
                               show_progress_bar=self.show_progress_bar,
                               convert_to_tensor=True)
    embeddings2 = model.encode(self.sentences2,
                               batch_size=self.batch_size,
                               show_progress_bar=self.show_progress_bar,
                               convert_to_tensor=True)
    num_pairs = embeddings1.shape[0]

    # Number of leading pairs exported to the embedding summary.
    num = 200
    embed = []
    meta_list = []

    def _snapshot(template, sentences, vectors):
        # Record metadata labels and the matching embedding rows together.
        meta_list.extend(
            template.format(i, sentence)
            for i, sentence in zip(range(min(num_pairs, num)), sentences[:num]))
        embed.append(vectors[:num, :])

    if self.summary_path:
        _snapshot("idx-{}<S1>{}", self.sentences1, embeddings1)
        _snapshot("idx-{}<S2>{}", self.sentences2, embeddings2)

    # Whiten both sides jointly, then split them apart again.
    embeddings = whitening_torch_final(
        torch.cat([embeddings1, embeddings2], dim=0))
    embeddings1 = embeddings[:num_pairs, :]
    embeddings2 = embeddings[num_pairs:, :]

    if self.summary_path:
        _snapshot("white-idx-{}<WS1>{}", self.sentences1, embeddings1)
        _snapshot("white-idx-{}<WS2>{}", self.sentences2, embeddings2)
        self.writer.add_embedding(torch.cat(embed, dim=0),
                                  metadata=meta_list,
                                  tag="all{}".format(num * 4))

    embeddings1 = embeddings1[:self.measure_data_num, :self.embed_dim]
    embeddings2 = embeddings2[:self.measure_data_num, :self.embed_dim]
    labels = self.scores[:self.measure_data_num]

    if self.intra_diversity:
        intra_div = self.compute_intra_diversity(embeddings1, embeddings2)
        logging.info("IntraDiversity on " + self.name + out_txt +
                     ": {:.4f}".format(intra_div))
        return intra_div

    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
    dot_products = [
        np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)
    ]

    eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
    eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
    eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
    eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
    eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
    eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
    eval_pearson_dot, _ = pearsonr(labels, dot_products)
    eval_spearman_dot, _ = spearmanr(labels, dot_products)

    logging.info("Eval on " + self.name + out_txt +
                 "Cosine :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                     eval_pearson_cosine, eval_spearman_cosine))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        output_file_exists = os.path.isfile(csv_path)
        # newline="" is required by the csv module; without it some
        # platforms write an extra blank line after every row.
        with open(csv_path,
                  newline="",
                  mode="a" if output_file_exists else "w",
                  encoding="utf-8") as f:
            writer = csv.writer(f)
            if not output_file_exists:
                writer.writerow(self.csv_headers)
            writer.writerow([
                epoch, steps, eval_pearson_cosine, eval_spearman_cosine,
                eval_pearson_euclidean, eval_spearman_euclidean,
                eval_pearson_manhattan, eval_spearman_manhattan,
                eval_pearson_dot, eval_spearman_dot
            ])

    if self.main_similarity == SimilarityFunction.COSINE:
        return eval_spearman_cosine
    elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
        return eval_spearman_euclidean
    elif self.main_similarity == SimilarityFunction.MANHATTAN:
        return eval_spearman_manhattan
    elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
        return eval_spearman_dot
    elif self.main_similarity is None:
        return max(eval_spearman_cosine, eval_spearman_manhattan,
                   eval_spearman_euclidean, eval_spearman_dot)
    else:
        raise ValueError("Unknown main_similarity value")
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Measure how often an anchor is closer to its positive than its negative.

    Anchors, positives and negatives are encoded with ``model.encode`` and
    compared pairwise under cosine, Manhattan and Euclidean distance. A
    triplet counts as correct for a metric when the anchor-positive
    distance is strictly smaller than the anchor-negative distance.

    Args:
        model: Object exposing ``encode(sentences, batch_size=...,
            show_progress_bar=..., convert_to_numpy=True)``.
        output_path: Directory for ``self.csv_file``; ``None`` skips the CSV.
        epoch: Epoch number used in log text and CSV row (-1 = not given).
        steps: Step number used in log text and CSV row (-1 = not given).

    Returns:
        The accuracy for ``self.main_distance_function`` (cosine, Manhattan
        or Euclidean); for any other value, the maximum of the three.

    Raises:
        ZeroDivisionError: If there are no triplets to evaluate.
    """
    if epoch == -1:
        out_txt = ":"
    elif steps == -1:
        out_txt = " after epoch {}:".format(epoch)
    else:
        out_txt = " in epoch {} after {} steps:".format(epoch, steps)

    logging.info("TripletEvaluator: Evaluating the model on " + self.name +
                 " dataset" + out_txt)

    def _encode(sentences):
        return model.encode(sentences,
                            batch_size=self.batch_size,
                            show_progress_bar=self.show_progress_bar,
                            convert_to_numpy=True)

    embeddings_anchors = _encode(self.anchors)
    embeddings_positives = _encode(self.positives)
    embeddings_negatives = _encode(self.negatives)

    def _count_correct(paired_distance):
        # Number of triplets whose positive is strictly closer than the negative.
        pos = paired_distance(embeddings_anchors, embeddings_positives)
        neg = paired_distance(embeddings_anchors, embeddings_negatives)
        return sum(1 for p, n in zip(pos, neg) if p < n)

    num_triplets = len(embeddings_anchors)
    accuracy_cos = _count_correct(paired_cosine_distances) / num_triplets
    accuracy_manhatten = _count_correct(paired_manhattan_distances) / num_triplets
    accuracy_euclidean = _count_correct(paired_euclidean_distances) / num_triplets

    logging.info("Accuracy Cosine Distance: \t{:.2f}".format(
        accuracy_cos * 100))
    logging.info("Accuracy Manhatten Distance:\t{:.2f}".format(
        accuracy_manhatten * 100))
    logging.info("Accuracy Euclidean Distance:\t{:.2f}\n".format(
        accuracy_euclidean * 100))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        output_file_exists = os.path.isfile(csv_path)
        # newline="" is required by the csv module; without it some
        # platforms write an extra blank line after every row.
        with open(csv_path,
                  newline="",
                  mode="a" if output_file_exists else "w",
                  encoding="utf-8") as f:
            writer = csv.writer(f)
            if not output_file_exists:
                writer.writerow(self.csv_headers)
            writer.writerow([
                epoch, steps, accuracy_cos, accuracy_manhatten,
                accuracy_euclidean
            ])

    if self.main_distance_function == SimilarityFunction.COSINE:
        return accuracy_cos
    if self.main_distance_function == SimilarityFunction.MANHATTAN:
        return accuracy_manhatten
    if self.main_distance_function == SimilarityFunction.EUCLIDEAN:
        return accuracy_euclidean
    return max(accuracy_cos, accuracy_manhatten, accuracy_euclidean)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Find the best binary-classification accuracy per similarity measure.

    Both sentence lists are encoded with ``model.encode``; cosine
    similarity, Manhattan distance, Euclidean distance and dot product are
    computed per pair, and ``self.find_best_acc_and_threshold`` is asked
    for the best accuracy and threshold of each measure.

    Args:
        model: Object exposing ``encode(sentences, batch_size=...,
            show_progress_bar=..., convert_to_numpy=True)``.
        output_path: Directory for ``self.csv_file``; ``None`` skips the CSV.
        epoch: Epoch number used in log text and CSV row (-1 = not given).
        steps: Step number used in log text and CSV row (-1 = not given).

    Returns:
        The accuracy for ``self.main_similarity`` (cosine, Euclidean or
        Manhattan).

    Raises:
        ValueError: If ``self.main_similarity`` is any other value.
    """
    if epoch == -1:
        out_txt = ":"
    elif steps == -1:
        out_txt = f" after epoch {epoch}:"
    else:
        out_txt = f" in epoch {epoch} after {steps} steps:"

    logging.info("Binary Accuracy Evaluation of the model on " + self.name +
                 " dataset" + out_txt)

    embeddings1 = model.encode(self.sentences1,
                               batch_size=self.batch_size,
                               show_progress_bar=self.show_progress_bar,
                               convert_to_numpy=True)
    embeddings2 = model.encode(self.sentences2,
                               batch_size=self.batch_size,
                               show_progress_bar=self.show_progress_bar,
                               convert_to_numpy=True)

    cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
    manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)
    dot_products = [
        np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)
    ]

    labels = np.asarray(self.labels)

    # NOTE(review): the third argument is presumably "higher score means
    # more similar" (True for cosine, False for the two distances) —
    # confirm against find_best_acc_and_threshold.
    cosine_acc, cosine_threshold = self.find_best_acc_and_threshold(
        cosine_scores, labels, True)
    manhattan_acc, manhattan_threshold = self.find_best_acc_and_threshold(
        manhattan_distances, labels, False)
    euclidean_acc, euclidean_threshold = self.find_best_acc_and_threshold(
        euclidean_distances, labels, False)
    # Dot product grows with similarity, like cosine, so it must use the
    # same flag as cosine rather than the distance flag.
    dot_acc, dot_threshold = self.find_best_acc_and_threshold(
        dot_products, labels, True)

    logging.info(
        "Accuracy with Cosine-Similarity:\t{:.2f}\t(Threshold: {:.4f})".
        format(cosine_acc * 100, cosine_threshold))
    logging.info(
        "Accuracy with Manhattan-Distance:\t{:.2f}\t(Threshold: {:.4f})".
        format(manhattan_acc * 100, manhattan_threshold))
    logging.info(
        "Accuracy with Euclidean-Distance:\t{:.2f}\t(Threshold: {:.4f})".
        format(euclidean_acc * 100, euclidean_threshold))
    logging.info(
        "Accuracy with Dot-Product:\t{:.2f}\t(Threshold: {:.4f})\n".format(
            dot_acc * 100, dot_threshold))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        output_file_exists = os.path.isfile(csv_path)
        # newline="" is required by the csv module; without it some
        # platforms write an extra blank line after every row.
        with open(csv_path,
                  newline="",
                  mode="a" if output_file_exists else "w",
                  encoding="utf-8") as f:
            writer = csv.writer(f)
            if not output_file_exists:
                writer.writerow(self.csv_headers)
            writer.writerow([
                epoch, steps, cosine_acc, cosine_threshold, manhattan_acc,
                manhattan_threshold, euclidean_acc, euclidean_threshold,
                dot_acc, dot_threshold
            ])

    if self.main_similarity == SimilarityFunction.COSINE:
        return cosine_acc
    elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
        return euclidean_acc
    elif self.main_similarity == SimilarityFunction.MANHATTAN:
        return manhattan_acc
    else:
        raise ValueError("Unknown main_similarity value")
def get_tfidf_count_hash_features(ql, qr, vectorModels, signature=""):
    """Build distance features between two queries from their vectorised text.

    The ``term`` of every token in ``ql.basic_words`` / ``qr.basic_words``
    is joined with spaces, both texts are vectorised by
    ``get_count_tfidf_hash`` and paired Euclidean (PED), cosine (PCD) and
    Manhattan (PMD) distances are stored in a dict.

    Args:
        ql: Left query; must expose ``basic_words`` whose items have ``term``.
        qr: Right query, same shape as ``ql``.
        vectorModels: Passed through to ``get_count_tfidf_hash``.
        signature: Optional key prefix; when truthy it is joined to each
            feature name with an underscore.

    Returns:
        Dict mapping feature name to a float distance. PED is produced for
        tfidf1, count1, hash1_18 and hash1_20 only; PCD and PMD are produced
        for all six vectorisations.
    """
    texts_ql = [" ".join("%s" % token.term for token in ql.basic_words)]
    texts_qr = [" ".join("%s" % token.term for token in qr.basic_words)]

    (ql_tfidf1, ql_count1, ql_hash1_18, ql_hash1_20, ql_hash2_18, ql_hash2_20,
     qr_tfidf1, qr_count1, qr_hash1_18, qr_hash1_20, qr_hash2_18,
     qr_hash2_20) = get_count_tfidf_hash(texts_ql, texts_qr, vectorModels)

    prefix = signature + "_" if signature else signature

    # Insertion order matters: it fixes the order of keys in the result.
    vector_pairs = {
        "tfidf1": (ql_tfidf1, qr_tfidf1),
        "count1": (ql_count1, qr_count1),
        "hash1_18": (ql_hash1_18, qr_hash1_18),
        "hash1_20": (ql_hash1_20, qr_hash1_20),
        "hash2_18": (ql_hash2_18, qr_hash2_18),
        "hash2_20": (ql_hash2_20, qr_hash2_20),
    }

    feature_dict = {}

    # Euclidean distance is only emitted for the first four vectorisations.
    for name in ("tfidf1", "count1", "hash1_18", "hash1_20"):
        left, right = vector_pairs[name]
        feature_dict[prefix + name + "_PED"] = float(
            paired_euclidean_distances(left, right)[0])

    for name, (left, right) in vector_pairs.items():
        feature_dict[prefix + name + "_PCD"] = float(
            paired_cosine_distances(left, right)[0])

    # The Manhattan result is converted without indexing — presumably
    # because it is not 1-D for these (sparse?) inputs; verify before
    # "normalising" this to [0].
    for name, (left, right) in vector_pairs.items():
        feature_dict[prefix + name + "_PMD"] = float(
            paired_manhattan_distances(left, right))

    return feature_dict
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Correlate embedding similarities of sentence pairs with gold scores.

    Both sentence lists are encoded with ``model.encode``; cosine
    similarity, negated Manhattan distance, negated Euclidean distance and
    dot product are computed per pair and compared to ``self.scores`` with
    Pearson and Spearman correlation.

    Args:
        model: Object exposing ``encode(sentences, batch_size=...,
            show_progress_bar=..., convert_to_numpy=True)``.
        output_path: Directory for ``self.csv_file``; ``None`` skips the CSV.
        epoch: Epoch number used in log text and CSV row (-1 = not given).
        steps: Step number used in log text and CSV row (-1 = not given).

    Returns:
        The Spearman correlation selected by ``self.main_similarity``, or
        the maximum of the four when it is ``None``.

    Raises:
        ValueError: If ``self.main_similarity`` is not a handled value.
    """
    if epoch == -1:
        out_txt = ":"
    elif steps == -1:
        out_txt = " after epoch {}:".format(epoch)
    else:
        out_txt = " in epoch {} after {} steps:".format(epoch, steps)

    logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " +
                self.name + " dataset" + out_txt)

    embeddings1 = model.encode(self.sentences1,
                               batch_size=self.batch_size,
                               show_progress_bar=self.show_progress_bar,
                               convert_to_numpy=True)
    embeddings2 = model.encode(self.sentences2,
                               batch_size=self.batch_size,
                               show_progress_bar=self.show_progress_bar,
                               convert_to_numpy=True)
    labels = self.scores

    def _correlations(scores):
        # (Pearson r, Spearman rho) of the scores against the labels.
        pearson, _ = pearsonr(labels, scores)
        spearman, _ = spearmanr(labels, scores)
        return pearson, spearman

    eval_pearson_cosine, eval_spearman_cosine = _correlations(
        1 - (paired_cosine_distances(embeddings1, embeddings2)))
    # Distances are negated before correlating.
    eval_pearson_manhattan, eval_spearman_manhattan = _correlations(
        -paired_manhattan_distances(embeddings1, embeddings2))
    eval_pearson_euclidean, eval_spearman_euclidean = _correlations(
        -paired_euclidean_distances(embeddings1, embeddings2))
    eval_pearson_dot, eval_spearman_dot = _correlations(
        [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)])

    logger.info("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_cosine, eval_spearman_cosine))
    logger.info("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_manhattan, eval_spearman_manhattan))
    logger.info("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_euclidean, eval_spearman_euclidean))
    logger.info("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_dot, eval_spearman_dot))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        output_file_exists = os.path.isfile(csv_path)
        # newline="" is required by the csv module; without it some
        # platforms write an extra blank line after every row.
        with open(csv_path,
                  newline="",
                  mode="a" if output_file_exists else "w",
                  encoding="utf-8") as f:
            writer = csv.writer(f)
            if not output_file_exists:
                writer.writerow(self.csv_headers)
            writer.writerow([epoch, steps,
                             eval_pearson_cosine, eval_spearman_cosine,
                             eval_pearson_euclidean, eval_spearman_euclidean,
                             eval_pearson_manhattan, eval_spearman_manhattan,
                             eval_pearson_dot, eval_spearman_dot])

    if self.main_similarity == SimilarityFunction.COSINE:
        return eval_spearman_cosine
    elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
        return eval_spearman_euclidean
    elif self.main_similarity == SimilarityFunction.MANHATTAN:
        return eval_spearman_manhattan
    elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
        return eval_spearman_dot
    elif self.main_similarity is None:
        return max(eval_spearman_cosine, eval_spearman_manhattan,
                   eval_spearman_euclidean, eval_spearman_dot)
    else:
        raise ValueError("Unknown main_similarity value")
# Register three sentence-embedding models built on the FastText vectors.
ft = FastTextKeyedVectors.load("D:/fasttext_300_3_polish.bin")
models["CBOW-FT"] = Average(ft, lang_freq="pl")
models["SIF-FT"] = SIF(ft, components=10)
models["uSIF-FT"] = uSIF(ft, length=11)

s = models["uSIF-W2V"]
s.sv[0]

# Sentence i is paired with sentence i + task_length; collect one score per
# pair for each measure (distances are negated).
cs, md, ed = [], [], []
for i in range(task_length):
    j = task_length + i
    temp1 = s.sv[i].reshape(1, -1)
    temp2 = s.sv[j].reshape(1, -1)
    cs.append((1 - (paired_cosine_distances(temp1, temp2)))[0])
    md.append(-paired_manhattan_distances(temp1, temp2)[0])
    ed.append(-paired_euclidean_distances(temp1, temp2)[0])

eval_pearson_cosine, _ = pearsonr(similarities, cs)
eval_spearman_cosine, _ = spearmanr(similarities, cs)
eval_pearson_manhattan, _ = pearsonr(similarities, md)
eval_spearman_manhattan, _ = spearmanr(similarities, md)
eval_pearson_euclidean, _ = pearsonr(similarities, ed)
eval_spearman_euclidean, _ = spearmanr(similarities, ed)


def compute_similarities(task_length, model):
    """Print ``model.sv.similarity(i, i + task_length)`` for every i.

    Args:
        task_length: Number of pairs; indices ``i`` and ``i + task_length``
            are compared for ``i`` in ``range(task_length)``.
        model: Object whose ``sv`` attribute exposes ``similarity(i, j)``.
    """
    sims = [
        model.sv.similarity(i, task_length + i) for i in range(task_length)
    ]
    print(sims)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Compute triplet accuracy over ``self.dataloader``.

    Each batch yields three feature sets that are embedded with ``model``.
    A triplet counts as correct for a metric when the distance between the
    first and second embedding is strictly smaller than the distance
    between the first and third. Accuracy is reported for cosine,
    Manhattan and Euclidean distance.

    Args:
        model: Callable model exposing ``eval()``, ``device`` and
            ``smart_batching_collate``; calling it on a feature set must
            return a mapping with key ``'sentence_embedding'``.
        output_path: Directory for ``self.csv_file``; ``None`` skips the CSV.
        epoch: Epoch number used in log text and CSV row (-1 = not given).
        steps: Step number used in log text and CSV row (-1 = not given).

    Returns:
        The accuracy for ``self.main_distance_function`` (cosine, Manhattan
        or Euclidean); for any other value, the maximum of the three.

    Raises:
        ZeroDivisionError: If the dataloader yields no triplets.
    """
    model.eval()

    if epoch == -1:
        out_txt = ":"
    elif steps == -1:
        out_txt = f" after epoch {epoch}:"
    else:
        out_txt = f" in epoch {epoch} after {steps} steps:"

    logging.info("Evaluation the model on " + self.name + " dataset" + out_txt)

    def _count_closer(positive, negative):
        # Number of rows whose positive distance is strictly smaller.
        return sum(1 for p, n in zip(positive, negative) if p < n)

    num_triplets = 0
    num_correct_cos_triplets = 0
    num_correct_manhatten_triplets = 0
    num_correct_euclidean_triplets = 0

    self.dataloader.collate_fn = model.smart_batching_collate
    for batch in tqdm(self.dataloader, desc="Evaluating"):
        features, _ = batch_to_device(batch, model.device)
        with torch.no_grad():
            emb1, emb2, emb3 = [
                model(sent_features)['sentence_embedding'].to("cpu").numpy()
                for sent_features in features
            ]

        pos_cos_distance = paired_cosine_distances(emb1, emb2)
        num_triplets += len(pos_cos_distance)
        num_correct_cos_triplets += _count_closer(
            pos_cos_distance, paired_cosine_distances(emb1, emb3))
        num_correct_manhatten_triplets += _count_closer(
            paired_manhattan_distances(emb1, emb2),
            paired_manhattan_distances(emb1, emb3))
        num_correct_euclidean_triplets += _count_closer(
            paired_euclidean_distances(emb1, emb2),
            paired_euclidean_distances(emb1, emb3))

    accuracy_cos = num_correct_cos_triplets / num_triplets
    accuracy_manhatten = num_correct_manhatten_triplets / num_triplets
    accuracy_euclidean = num_correct_euclidean_triplets / num_triplets

    logging.info("Accuracy Cosine Distance:\t{:.4f}".format(accuracy_cos))
    logging.info(
        "Accuracy Manhatten Distance:\t{:.4f}".format(accuracy_manhatten))
    logging.info("Accuracy Euclidean Distance:\t{:.4f}\n".format(
        accuracy_euclidean))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        output_file_exists = os.path.isfile(csv_path)
        # newline="" is required by the csv module; without it some
        # platforms write an extra blank line after every row.
        with open(csv_path,
                  newline="",
                  mode="a" if output_file_exists else "w",
                  encoding="utf-8") as f:
            writer = csv.writer(f)
            if not output_file_exists:
                writer.writerow(self.csv_headers)
            writer.writerow([
                epoch, steps, accuracy_cos, accuracy_manhatten,
                accuracy_euclidean
            ])

    if self.main_distance_function == SimilarityFunction.COSINE:
        return accuracy_cos
    if self.main_distance_function == SimilarityFunction.MANHATTAN:
        return accuracy_manhatten
    if self.main_distance_function == SimilarityFunction.EUCLIDEAN:
        return accuracy_euclidean
    return max(accuracy_cos, accuracy_manhatten, accuracy_euclidean)
def test_paired_manhattan_distances():
    """Paired Manhattan distances of two single-feature rows are 1 and 2."""
    # Check the paired manhattan distances computation
    origins = [[0], [0]]
    targets = [[1], [2]]
    expected = [1., 2.]
    assert_array_almost_equal(
        paired_manhattan_distances(origins, targets), expected)
def __call__(self, model: 'SequentialSentenceEmbedder', output_path: str = None, epoch: int = -1, steps: int = -1, additional_evaluator: Callable[[], float] = None) -> float:
    """Correlate pair similarities with labels, plus an optional extra metric.

    Embeddings and labels come from ``paired_embeddings_for_dataloader``.
    Cosine similarity, negated Manhattan distance, negated Euclidean
    distance and dot product are correlated with the labels (Pearson and
    Spearman). ``self.csv_headers`` is reset on every call and extended
    with ``self.trec_metric`` when ``additional_evaluator`` is given.

    Args:
        model: Model exposing ``eval()`` and ``smart_batching_collate``.
        output_path: Directory for ``self.csv_file``; ``None`` skips the CSV.
        epoch: Epoch number used in log text and CSV row (-1 = not given).
        steps: Step number used in log text and CSV row (-1 = not given).
        additional_evaluator: Optional zero-argument callable; its value is
            logged, appended to the CSV row and returned instead of a
            correlation.

    Returns:
        The value of ``additional_evaluator()`` when one is given; otherwise
        the Spearman correlation selected by ``self.main_similarity`` (the
        maximum of the four when it is ``None``).

    Raises:
        ValueError: If ``self.main_similarity`` is not a handled value.
    """
    model.eval()

    if epoch == -1:
        out_txt = ":"
    elif steps == -1:
        out_txt = f" after epoch {epoch}:"
    else:
        out_txt = f" in epoch {epoch} after {steps} steps:"

    logging.info("Evaluation the model on " + self.name + " dataset" + out_txt)

    self.dataloader.collate_fn = model.smart_batching_collate

    iterator = self.dataloader
    if self.show_progress_bar:
        iterator = tqdm(iterator, desc="Convert Evaluating")

    embeddings1, embeddings2, labels = paired_embeddings_for_dataloader(
        self.device, model, iterator)

    try:
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    except Exception:
        # Dump the offending inputs, then re-raise with the original traceback.
        print(embeddings1)
        print(embeddings2)
        raise

    manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
    dot_products = [
        np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)
    ]

    eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
    eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
    eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
    eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
    eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
    eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
    eval_pearson_dot, _ = pearsonr(labels, dot_products)
    eval_spearman_dot, _ = spearmanr(labels, dot_products)

    logging.info(
        "Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_cosine, eval_spearman_cosine))
    logging.info(
        "Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_manhattan, eval_spearman_manhattan))
    logging.info(
        "Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_euclidean, eval_spearman_euclidean))
    logging.info(
        "Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_dot, eval_spearman_dot))

    self.csv_headers = [
        "epoch", "steps", "cosine_pearson", "cosine_spearman",
        "euclidean_pearson", "euclidean_spearman", "manhattan_pearson",
        "manhattan_spearman", "dot_pearson", "dot_spearman"
    ]
    out_row = [
        epoch, steps, eval_pearson_cosine, eval_spearman_cosine,
        eval_pearson_euclidean, eval_spearman_euclidean,
        eval_pearson_manhattan, eval_spearman_manhattan, eval_pearson_dot,
        eval_spearman_dot
    ]

    has_additional = additional_evaluator is not None
    if has_additional:
        additional_val = additional_evaluator()
        self.csv_headers += [self.trec_metric]
        out_row += [additional_val]
        logging.info("Additional metric: " + self.trec_metric +
                     " value: {:.4f}".format(additional_val))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        output_file_exists = os.path.isfile(csv_path)
        # newline="" is required by the csv module; without it some
        # platforms write an extra blank line after every row.
        with open(csv_path,
                  newline="",
                  mode="a" if output_file_exists else "w",
                  encoding="utf-8") as f:
            writer = csv.writer(f)
            if not output_file_exists:
                writer.writerow(self.csv_headers)
            writer.writerow(out_row)

    if has_additional:
        return additional_val

    if self.main_similarity == SimilarityFunction.COSINE:
        return eval_spearman_cosine
    elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
        return eval_spearman_euclidean
    elif self.main_similarity == SimilarityFunction.MANHATTAN:
        return eval_spearman_manhattan
    elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
        return eval_spearman_dot
    elif self.main_similarity is None:
        return max(eval_spearman_cosine, eval_spearman_manhattan,
                   eval_spearman_euclidean, eval_spearman_dot)
    else:
        raise ValueError("Unknown main_similarity value")
#%% man = np.abs(a - b).sum() #%% help(cityblock) #%% np.isclose(man, cityblock(a, b)) #%% help(paired_manhattan_distances) #%% np.isclose(man, paired_manhattan_distances(a.reshape((1, -1)), b.reshape((1, -1)))) #%% [markdown] # ## コサイン類似度 # --- # 2 つのベクトルの間の角度のコサインで、両ベクトルの類似度を表す。 # $ # \displaystyle \begin{aligned} # cos\theta & =\frac # {a\cdot b} # {\| a\| \ \| b\| } # \\ # & =\frac # {\displaystyle \sum ^{n}_{i=1} a_{i} b_{i}} # {\sqrt{ # \displaystyle \sum ^{n}_{i=1} a^{2}_{i}
def get_sentvec_features(word2vec, word_weights, model, sent_l, sent_r, signature=""):
    """Build sentence-vector distance features for a pair of sentences.

    Token terms of both sentences are decoded from UTF-8, turned into SIF
    and naive sentence vectors by ``calc_naive_sif_embedding`` and compared
    with paired Euclidean (PED), cosine (PCD) and Manhattan (PMD)
    distances. The word mover's distance from ``word2vec.wmdistance`` is
    added as ``WMD``.

    Args:
        word2vec: Embedding object exposing ``vectors`` and ``wmdistance``.
        word_weights: Passed through to ``calc_naive_sif_embedding``.
        model: Passed through to ``calc_naive_sif_embedding``.
        sent_l: Left sentence; ``basic_words`` items expose a bytes ``term``.
        sent_r: Right sentence, same shape as ``sent_l``.
        signature: Optional key prefix; when truthy it is joined to each
            feature name with an underscore.

    Returns:
        Dict mapping feature name to a float. When the first component of
        either vector is NaN, fixed fallbacks are used instead of distances
        (1.0 for PED and PCD, 1000.0 for PMD).
    """
    ql = ["%s" % (word.term.decode('utf8')) for word in sent_l.basic_words]
    qr = ["%s" % (word.term.decode('utf8')) for word in sent_r.basic_words]

    dim = word2vec.vectors.shape[1]
    sif_vec_ql_weight, naive_vec_ql_weight = calc_naive_sif_embedding(
        word2vec, dim, word_weights, model, ql)
    sif_vec_qr_weight, naive_vec_qr_weight = calc_naive_sif_embedding(
        word2vec, dim, word_weights, model, qr)

    if signature:
        signature = signature + "_"

    feature_dict = {}

    def _add_distances(stem, left_vec, right_vec):
        # Store PED/PCD/PMD for one vector pair, or the NaN fallbacks.
        base = signature + stem
        if np.isnan(left_vec[0][0]) or np.isnan(right_vec[0][0]):
            feature_dict[base + '_PED'] = 1.0
            feature_dict[base + '_PCD'] = 1.0
            feature_dict[base + '_PMD'] = 1000.0
            return
        feature_dict[base + '_PED'] = float(
            paired_euclidean_distances(left_vec, right_vec)[0])
        feature_dict[base + '_PCD'] = float(
            paired_cosine_distances(left_vec, right_vec)[0])
        feature_dict[base + '_PMD'] = float(
            paired_manhattan_distances(left_vec, right_vec)[0])

    # SIF-weighted vectors first, then the naive ones (stored under the
    # "avg" key stem).
    _add_distances('sif_sentvec_weight', sif_vec_ql_weight, sif_vec_qr_weight)
    _add_distances('avg_sentvec_weight', naive_vec_ql_weight,
                   naive_vec_qr_weight)

    # Word mover's distance between the two token lists.
    feature_dict[signature + 'WMD'] = word2vec.wmdistance(ql, qr)
    return feature_dict
def __call__(self, model: TransformerModel, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Correlate sentence-pair similarities with labels over ``self.dataloader``.

    Every batch is moved to ``self.device``; both sentences of each pair
    are embedded with ``model.get_sentence_representation``. Cosine
    similarity, negated Manhattan distance and negated Euclidean distance
    are correlated with the labels (Pearson and Spearman).

    Args:
        model: Model exposing ``eval()`` and ``get_sentence_representation``.
        output_path: Directory for ``self.csv_file``; ``None`` skips the CSV.
        epoch: Epoch number used in log text and CSV row (-1 = not given).
        steps: Step number used in log text and CSV row (-1 = not given).

    Returns:
        The Spearman correlation selected by ``self.main_similarity``, or
        the maximum of the three when it is ``None``.

    Raises:
        ValueError: If ``self.main_similarity`` is not a handled value.
    """
    model.eval()

    if epoch == -1:
        out_txt = ":"
    elif steps == -1:
        out_txt = f" after epoch {epoch}:"
    else:
        out_txt = f" in epoch {epoch} after {steps} steps:"

    logging.info("Evaluation the model on " + self.name + " dataset" + out_txt)

    embeddings1 = []
    embeddings2 = []
    labels = []
    for batch in tqdm(self.dataloader, desc="Evaluating"):
        batch = batch_to_device(batch, self.device)
        input_ids, segment_ids, input_masks, label_ids = batch
        with torch.no_grad():
            emb1 = model.get_sentence_representation(
                input_ids[0], segment_ids[0], input_masks[0]).to("cpu").numpy()
            emb2 = model.get_sentence_representation(
                input_ids[1], segment_ids[1], input_masks[1]).to("cpu").numpy()

        labels.extend(label_ids.to("cpu").numpy())
        embeddings1.extend(emb1)
        embeddings2.extend(emb2)

    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)

    eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
    eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
    eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
    eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
    eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
    eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

    # The Spearman field previously used "{:4f}" (minimum width 4, default
    # precision) instead of four decimals like the Pearson field.
    logging.info(
        "Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_cosine, eval_spearman_cosine))
    logging.info(
        "Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_manhattan, eval_spearman_manhattan))
    logging.info(
        "Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_euclidean, eval_spearman_euclidean))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        output_file_exists = os.path.isfile(csv_path)
        # newline="" is required by the csv module; without it some
        # platforms write an extra blank line after every row.
        with open(csv_path,
                  newline="",
                  mode="a" if output_file_exists else "w",
                  encoding="utf-8") as f:
            writer = csv.writer(f)
            if not output_file_exists:
                writer.writerow(self.csv_headers)
            writer.writerow([
                epoch, steps, eval_pearson_cosine, eval_spearman_cosine,
                eval_pearson_euclidean, eval_spearman_euclidean,
                eval_pearson_manhattan, eval_spearman_manhattan
            ])

    if self.main_similarity == EmbeddingSimilarity.COSINE:
        return eval_spearman_cosine
    elif self.main_similarity == EmbeddingSimilarity.EUCLIDEAN:
        return eval_spearman_euclidean
    elif self.main_similarity == EmbeddingSimilarity.MANHATTAN:
        return eval_spearman_manhattan
    elif self.main_similarity is None:
        return max(eval_spearman_cosine, eval_spearman_manhattan,
                   eval_spearman_euclidean)
    else:
        raise ValueError("Unknown main_similarity value")
def manhattan_distances(embeddings1, embeddings2):
    """Return the negated paired Manhattan distances of the two inputs.

    Args:
        embeddings1: First collection of vectors, passed unchanged to
            ``paired_manhattan_distances``.
        embeddings2: Second collection of vectors, paired with the first.

    Returns:
        The result of ``paired_manhattan_distances`` with its sign flipped.
    """
    distances = paired_manhattan_distances(embeddings1, embeddings2)
    return -distances
def __call__(self, model: 'SequentialSentenceEmbedder', output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Correlate sentence-pair similarities with labels over ``self.dataloader``.

    Each batch yields two feature sets that are embedded with ``model``.
    Cosine similarity, negated Manhattan distance, negated Euclidean
    distance and dot product are correlated with the labels (Pearson and
    Spearman).

    Args:
        model: Callable model exposing ``eval()`` and
            ``smart_batching_collate``; calling it on a feature set must
            return a mapping with key ``'sentence_embedding'``.
        output_path: Directory for ``self.csv_file``; ``None`` skips the CSV.
        epoch: Epoch number used in log text and CSV row (-1 = not given).
        steps: Step number used in log text and CSV row (-1 = not given).

    Returns:
        The Spearman correlation selected by ``self.main_similarity``, or
        the maximum of the four when it is ``None``.

    Raises:
        ValueError: If ``self.main_similarity`` is not a handled value.
    """
    model.eval()

    if epoch == -1:
        out_txt = ":"
    elif steps == -1:
        out_txt = f" after epoch {epoch}:"
    else:
        out_txt = f" in epoch {epoch} after {steps} steps:"

    logging.info("Evaluation the model on " + self.name + " dataset" + out_txt)

    self.dataloader.collate_fn = model.smart_batching_collate

    iterator = self.dataloader
    if self.show_progress_bar:
        iterator = tqdm(iterator, desc="Convert Evaluating")

    embeddings1 = []
    embeddings2 = []
    labels = []
    for batch in iterator:
        features, label_ids = batch_to_device(batch, self.device)
        with torch.no_grad():
            emb1, emb2 = [
                model(sent_features)['sentence_embedding'].to("cpu").numpy()
                for sent_features in features
            ]

        labels.extend(label_ids.to("cpu").numpy())
        embeddings1.extend(emb1)
        embeddings2.extend(emb2)

    try:
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    except Exception:
        # Dump the offending inputs, then re-raise with the original traceback.
        print(embeddings1)
        print(embeddings2)
        raise

    manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
    dot_products = [
        np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)
    ]

    eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
    eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
    eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
    eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
    eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
    eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
    eval_pearson_dot, _ = pearsonr(labels, dot_products)
    eval_spearman_dot, _ = spearmanr(labels, dot_products)

    logging.info(
        "Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_cosine, eval_spearman_cosine))
    logging.info(
        "Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_manhattan, eval_spearman_manhattan))
    logging.info(
        "Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_euclidean, eval_spearman_euclidean))
    logging.info(
        "Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_dot, eval_spearman_dot))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        output_file_exists = os.path.isfile(csv_path)
        # newline="" is required by the csv module; without it some
        # platforms write an extra blank line after every row.
        with open(csv_path,
                  newline="",
                  mode="a" if output_file_exists else "w",
                  encoding="utf-8") as f:
            writer = csv.writer(f)
            if not output_file_exists:
                writer.writerow(self.csv_headers)
            writer.writerow([
                epoch, steps, eval_pearson_cosine, eval_spearman_cosine,
                eval_pearson_euclidean, eval_spearman_euclidean,
                eval_pearson_manhattan, eval_spearman_manhattan,
                eval_pearson_dot, eval_spearman_dot
            ])

    if self.main_similarity == SimilarityFunction.COSINE:
        return eval_spearman_cosine
    elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
        return eval_spearman_euclidean
    elif self.main_similarity == SimilarityFunction.MANHATTAN:
        return eval_spearman_manhattan
    elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
        return eval_spearman_dot
    elif self.main_similarity is None:
        return max(eval_spearman_cosine, eval_spearman_manhattan,
                   eval_spearman_euclidean, eval_spearman_dot)
    else:
        raise ValueError("Unknown main_similarity value")
Original file is located at https://colab.research.google.com/drive/1UTrfj1JfPSMo52T6tTXPs3V9Q67zJ7OD """ from sklearn.metrics.pairwise import paired_euclidean_distances X = [[0, 1, 2, 3]] Y = [[1, 2, 3, 4]] paired_euclidean_distances(X, Y) from sklearn.metrics.pairwise import paired_manhattan_distances X = [[0, 1, 2, 3]] Y = [[1, 2, 3, 4]] paired_manhattan_distances(X, Y) movie_a = [0, 2, 1, 3] # user_id’s who bought the movie a movie_b = [0, 1, 2, 3] # user_id’s who bought the movie b def jaccard_similarity(list1, list2): intersection = len(list(set(list1).intersection(list2))) union = (len(list1) + len(list2)) - intersection return float(intersection / union) movie_a = [0, 2, 1, 3] # user_id’s who bought the movie a movie_b = [0, 1, 2, 3] # user_id’s who bought the movie b print(jaccard_similarity(movie_a, movie_b))
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Score the model on binary-labelled sentence pairs using a median split.

    Every pair delivered by ``self.dataloader`` is embedded. A pair is
    predicted positive when its score lies strictly above the median score
    of all pairs. Accuracy is computed for cosine similarity and for the
    negated Manhattan and Euclidean distances (negated so that a larger
    value always means "more similar"), logged, and optionally appended to
    a CSV file.

    Args:
        model: Called on each feature dict of a batch; the result must
            provide ``'sentence_embedding'``. Also supplies
            ``smart_batching_collate`` and is switched to eval mode.
        output_path: Directory that receives ``self.csv_file``; ``None``
            skips the CSV output.
        epoch: Epoch number used in the log line and CSV row
            (-1 means "not given").
        steps: Step count within the epoch (-1 means "not given").

    Returns:
        The accuracy that belongs to ``self.main_similarity``.

    Raises:
        ValueError: If a label is neither 0 nor 1, or if
            ``self.main_similarity`` is not COSINE, EUCLIDEAN or MANHATTAN.
    """
    model.eval()
    embeddings1 = []
    embeddings2 = []
    labels = []

    if epoch == -1:
        out_txt = ":"
    elif steps == -1:
        out_txt = " after epoch {}:".format(epoch)
    else:
        out_txt = " in epoch {} after {} steps:".format(epoch, steps)

    logging.info("Evaluation the model on " + self.name + " dataset" + out_txt)

    self.dataloader.collate_fn = model.smart_batching_collate
    for batch in tqdm(self.dataloader, desc="Evaluating"):
        features, label_ids = batch_to_device(batch, self.device)
        with torch.no_grad():
            emb1, emb2 = [
                model(sent_features)['sentence_embedding'].to("cpu").numpy()
                for sent_features in features
            ]

        labels.extend(label_ids.to("cpu").numpy())
        embeddings1.extend(emb1)
        embeddings2.extend(emb2)

    cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
    manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)

    # Explicit check instead of ``assert`` so it also runs under ``python -O``.
    labels = np.asarray(labels)
    if not np.isin(labels, (0, 1)).all():
        raise ValueError("Labels must be 0 or 1")
    is_positive = labels == 1

    def median_split_accuracy(scores):
        """Return the share of pairs on the correct side of the median score."""
        predicted_positive = scores > np.median(scores)
        return float(np.mean(predicted_positive == is_positive))

    cosine_acc = median_split_accuracy(cosine_scores)
    manhattan_acc = median_split_accuracy(manhattan_distances)
    euclidean_acc = median_split_accuracy(euclidean_distances)

    # ``{:.4f}`` (four decimals) rather than ``{:4f}`` (minimum width four).
    logging.info("Cosine-Classification:\t{:.4f}".format(cosine_acc))
    logging.info("Manhattan-Classification:\t{:.4f}".format(manhattan_acc))
    logging.info("Euclidean-Classification:\t{:.4f}\n".format(euclidean_acc))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        # A new file gets the header row first; an existing one is appended to.
        is_new_file = not os.path.isfile(csv_path)
        with open(csv_path, mode="w" if is_new_file else "a", encoding="utf-8") as f:
            writer = csv.writer(f)
            if is_new_file:
                writer.writerow(self.csv_headers)
            writer.writerow([epoch, steps, cosine_acc, euclidean_acc, manhattan_acc])

    if self.main_similarity == SimilarityFunction.COSINE:
        return cosine_acc
    elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
        return euclidean_acc
    elif self.main_similarity == SimilarityFunction.MANHATTAN:
        return manhattan_acc
    else:
        raise ValueError("Unknown main_similarity value")
# --- Paired Euclidean distances -------------------------------------------
# x, df1, x1 and df2 are not assigned in this part of the script before they
# are used here, so they must already be bound when it runs.
cs99 = paired_euclidean_distances(x.values.reshape(1, -1), [df1])
cs100 = paired_euclidean_distances(x1.values.reshape(1, -1), [df2])
for distances in (cs99, cs100):
    print(distances)  # recorded output: [34.21532517], then [28.53037718]
for distances in (cs99, cs100):
    print(np.argmax(distances))  # recorded output: 0, then 0

r = [0, 0]
cs_49 = [[34.21532517], [28.53037718]]
print(cs_49)  # recorded output: [[34.21532517],[28.53037718]]
np.argmax(cs_49)  # recorded result: 0

# --- Paired Manhattan distances -------------------------------------------
from sklearn.metrics.pairwise import paired_manhattan_distances

x, df1 = ratings.iloc[100:250, 1], ratings.iloc[1, :150]
x1, df2 = ratings.iloc[100:175, 2], ratings.iloc[2, :75]
cs101 = paired_manhattan_distances(x.values.reshape(1, -1), [df1])
cs102 = paired_manhattan_distances(x1.values.reshape(1, -1), [df2])
for distances in (cs101, cs102):
    print(distances)  # recorded output: [126.03125], then [167.625]
for distances in (cs101, cs102):
    print(np.argmax(distances))  # recorded output: 0, then 0

r = [0, 0]
cs_50 = [[126.03125], [167.625]]
print(cs_50)  # recorded output: [[126.03125],[167.625]]
np.argmax(cs_50)  # recorded result: 1

# --- Paired cosine distances ----------------------------------------------
from sklearn.metrics.pairwise import paired_cosine_distances

x, df1 = ratings.iloc[100:250, 1], ratings.iloc[1, :150]
x1, df2 = ratings.iloc[100:175, 2], ratings.iloc[2, :75]
cs103 = paired_cosine_distances(x.values.reshape(1, -1), [df1])
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Evaluate binary pair classification for three similarity measures.

    Encodes ``self.sentences1`` and ``self.sentences2`` with ``model.encode``,
    then scores the pairs with cosine similarity, Manhattan distance and
    Euclidean distance. For each measure it logs the best accuracy and F1
    (with their thresholds), precision, recall and average precision.

    Args:
        model: Object providing ``encode``.
        output_path: Directory that receives ``self.csv_file``; ``None``
            skips the CSV output.
        epoch: Epoch number used in the log line and CSV row
            (-1 means "not given").
        steps: Step count within the epoch (-1 means "not given").

    Returns:
        The average precision obtained with cosine similarity.
    """
    if epoch == -1:
        out_txt = ":"
    elif steps == -1:
        out_txt = f" after epoch {epoch}:"
    else:
        out_txt = f" in epoch {epoch} after {steps} steps:"

    logging.info("Binary Accuracy Evaluation of the model on " + self.name + " dataset" + out_txt)

    encode_options = dict(batch_size=self.batch_size,
                          show_progress_bar=self.show_progress_bar,
                          convert_to_numpy=True)
    embeddings1 = model.encode(self.sentences1, **encode_options)
    embeddings2 = model.encode(self.sentences2, **encode_options)

    # (display name, per-pair scores, whether a higher score means "more similar")
    measures = (
        ('Cosine-Similarity', 1 - paired_cosine_distances(embeddings1, embeddings2), True),
        ('Manhatten-Distance', paired_manhattan_distances(embeddings1, embeddings2), False),
        ('Euclidean-Distance', paired_euclidean_distances(embeddings1, embeddings2), False),
    )

    labels = np.asarray(self.labels)
    csv_row = [epoch, steps]
    main_score = None

    for name, scores, higher_is_better in measures:
        acc, acc_threshold = self.find_best_acc_and_threshold(scores, labels, higher_is_better)
        f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(
            scores, labels, higher_is_better)
        # Distances are negated so that a larger value ranks a pair higher.
        sign = 1 if higher_is_better else -1
        ap = average_precision_score(labels, scores * sign)

        logging.info("Accuracy with {}: {:.2f}\t(Threshold: {:.4f})".format(
            name, acc * 100, acc_threshold))
        logging.info("F1 with {}: {:.2f}\t(Threshold: {:.4f})".format(
            name, f1 * 100, f1_threshold))
        logging.info("Precision with {}: {:.2f}".format(name, precision * 100))
        logging.info("Recall with {}: {:.2f}".format(name, recall * 100))
        logging.info("Average Precision with {}: {:.2f}\n".format(name, ap * 100))

        csv_row.extend([acc, acc_threshold, f1, precision, recall, f1_threshold, ap])

        # The first measure is cosine similarity; its average precision is
        # the value returned to the caller.
        if main_score is None:
            main_score = ap

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        needs_header = not os.path.isfile(csv_path)
        with open(csv_path, mode="w" if needs_header else "a", encoding="utf-8") as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(self.csv_headers)
            writer.writerow(csv_row)

    return main_score
def _get_vectors(self, df):
    """Append averaged word2vec vectors and their pairwise distances to *df*.

    For each row the lemmas in ``lemmas_x`` and ``lemmas_y`` are averaged
    into one vector each. The two vector tables are merged on the index,
    extended with cosine, Euclidean and Manhattan distances between the x
    and y vectors, and concatenated column-wise to *df*.

    Note that ``df.lemmas_x`` / ``df.lemmas_y`` are reassigned on the
    passed-in frame when stop-word removal or tagging is applied. When
    ``self.word2vec_tag_required`` is set they are rebuilt from
    ``snippet_x_locs`` / ``snippet_y_locs``, replacing any stop-word
    filtered lemmas.

    Args:
        df: DataFrame with ``lemmas_x`` and ``lemmas_y`` columns (and
            ``snippet_x_locs`` / ``snippet_y_locs`` when tagging is required).

    Returns:
        A new DataFrame: *df* with a reset index plus the embedding and
        distance columns.
    """

    def mean_vector(lemmatized_text):
        # The leading zero vector keeps the result well-defined when no word
        # is found in the model; it is excluded from the divisor below.
        collected = [np.zeros(self.word2vec_vector_length)]
        for word in lemmatized_text:
            try:
                vector = self.word2vec_model[word]
            except KeyError:
                # Words missing from the model's vocabulary are left out.
                continue
            collected.append(vector)

        found = len(collected) - 1
        # The tiny epsilon avoids a division by zero when nothing was found.
        return sum(np.array(collected)) / (found + 1e-25)

    if not self.word2vec_stopwords:
        df.lemmas_x = df.lemmas_x.map(self._remove_stop_words)
        df.lemmas_y = df.lemmas_y.map(self._remove_stop_words)

    # Add the required UPoS postags (as in the rusvectores word2vec model's
    # vocabulary).
    if self.word2vec_tag_required:
        df.lemmas_x = df.snippet_x_locs.map(self._tag_postags)
        df.lemmas_y = df.snippet_y_locs.map(self._tag_postags)

    # One averaged vector per row for each side.
    vectors_x = df.lemmas_x.apply(mean_vector).values.tolist()
    vectors_y = df.lemmas_y.apply(mean_vector).values.tolist()

    frame_x = pd.DataFrame(vectors_x)
    frame_y = pd.DataFrame(vectors_y)
    embeddings = frame_x.merge(frame_y, left_index=True, right_index=True)

    embeddings['cos_embed_dist'] = paired_cosine_distances(vectors_x, vectors_y)
    embeddings['eucl_embed_dist'] = paired_euclidean_distances(vectors_x, vectors_y)
    embeddings['manh_embed_dist'] = paired_manhattan_distances(vectors_x, vectors_y)

    return pd.concat(
        [df.reset_index(drop=True), embeddings.reset_index(drop=True)],
        axis=1)
def test_paired_manhattan_distances():
    """Paired Manhattan distances are computed row by row."""
    left_rows = [[0], [0]]
    right_rows = [[1], [2]]
    # |0 - 1| = 1 for the first pair, |0 - 2| = 2 for the second.
    expected = [1., 2.]
    assert_array_almost_equal(
        paired_manhattan_distances(left_rows, right_rows), expected)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Evaluate binary pair classification with the best accuracy threshold.

    Embeds every sentence pair delivered by ``self.dataloader``, scores the
    pairs with cosine similarity, Manhattan distance and Euclidean distance,
    and uses ``self.find_best_acc_and_threshold`` to obtain the accuracy and
    threshold for each measure. Results are logged and optionally appended
    to a CSV file.

    Args:
        model: Called on each feature dict of a batch; the result must
            provide ``'sentence_embedding'``. Also supplies
            ``smart_batching_collate`` and is switched to eval mode.
        output_path: Directory that receives ``self.csv_file``; ``None``
            skips the CSV output.
        epoch: Epoch number used in the log line and CSV row
            (-1 means "not given").
        steps: Step count within the epoch (-1 means "not given").

    Returns:
        The accuracy that belongs to ``self.main_similarity``.

    Raises:
        ValueError: If ``self.main_similarity`` is not COSINE, EUCLIDEAN or
            MANHATTAN.
    """
    model.eval()

    if epoch == -1:
        out_txt = ":"
    elif steps == -1:
        out_txt = f" after epoch {epoch}:"
    else:
        out_txt = f" in epoch {epoch} after {steps} steps:"

    logging.info("Evaluation the model on " + self.name + " dataset" + out_txt)

    first_vectors, second_vectors, labels = [], [], []
    self.dataloader.collate_fn = model.smart_batching_collate
    for batch in tqdm(self.dataloader, desc="Evaluating"):
        features, label_ids = batch_to_device(batch, self.device)
        with torch.no_grad():
            emb1, emb2 = [
                model(sent_features)['sentence_embedding'].to("cpu").numpy()
                for sent_features in features
            ]

        labels.extend(label_ids.to("cpu").numpy())
        first_vectors.extend(emb1)
        second_vectors.extend(emb2)

    cosine_scores = 1 - paired_cosine_distances(first_vectors, second_vectors)
    manhattan_distances = paired_manhattan_distances(first_vectors, second_vectors)
    euclidean_distances = paired_euclidean_distances(first_vectors, second_vectors)

    # Ensure labels are just 0 or 1
    assert all(label == 0 or label == 1 for label in labels)

    labels = np.asarray(labels)
    # The boolean tells the helper whether a higher score means "similar".
    cosine_acc, cosine_threshold = self.find_best_acc_and_threshold(
        cosine_scores, labels, True)
    manhattan_acc, manhattan_threshold = self.find_best_acc_and_threshold(
        manhattan_distances, labels, False)
    euclidean_acc, euclidean_threshold = self.find_best_acc_and_threshold(
        euclidean_distances, labels, False)

    logging.info("Accuracy with Cosine-Similarity:\t{:.2f}\t(Threshold: {:.4f})".format(
        cosine_acc * 100, cosine_threshold))
    logging.info("Accuracy with Manhattan-Distance:\t{:.2f}\t(Threshold: {:.4f})".format(
        manhattan_acc * 100, manhattan_threshold))
    logging.info("Accuracy with Euclidean-Distance:\t{:.2f}\t(Threshold: {:.4f})\n".format(
        euclidean_acc * 100, euclidean_threshold))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        needs_header = not os.path.isfile(csv_path)
        with open(csv_path, mode="w" if needs_header else "a", encoding="utf-8") as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(self.csv_headers)
            writer.writerow([epoch, steps, cosine_acc, euclidean_acc, manhattan_acc])

    if self.main_similarity == SimilarityFunction.COSINE:
        return cosine_acc
    if self.main_similarity == SimilarityFunction.EUCLIDEAN:
        return euclidean_acc
    if self.main_similarity == SimilarityFunction.MANHATTAN:
        return manhattan_acc
    raise ValueError("Unknown main_similarity value")