def acc_cosine(anchor_embedding, positive_embedding, negative_embedding):
    """Triplet accuracy under cosine distance.

    Returns the fraction of triplets for which the anchor is closer (in
    cosine distance) to the positive example than to the negative one.

    The original counted matches in a Python-level loop; this is the
    equivalent vectorized computation. As before, an empty input raises
    ZeroDivisionError.
    """
    pos_cos_distance = paired_cosine_distances(anchor_embedding, positive_embedding)
    neg_cos_distances = paired_cosine_distances(anchor_embedding, negative_embedding)
    num_correct = float(np.count_nonzero(pos_cos_distance < neg_cos_distances))
    return num_correct / len(pos_cos_distance)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Binary-accuracy evaluation of `model` on paired sentences.

    Encodes both sentence lists, scores each pair with cosine similarity and
    Manhattan/Euclidean distance, finds best accuracy/F1 thresholds per
    metric, logs the results and optionally appends them to a CSV file.

    Returns the average precision obtained with cosine similarity.
    """
    if epoch != -1:
        if steps == -1:
            out_txt = f" after epoch {epoch}:"
        else:
            out_txt = f" in epoch {epoch} after {steps} steps:"
    else:
        out_txt = ":"
    logging.info("Binary Accuracy Evaluation of the model on " + self.name + " dataset" + out_txt)

    embeddings1 = model.encode(self.sentences1, batch_size=self.batch_size,
                               show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
    embeddings2 = model.encode(self.sentences2, batch_size=self.batch_size,
                               show_progress_bar=self.show_progress_bar, convert_to_numpy=True)

    cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
    manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)

    if self.log_missclassified:
        misclassed = self.get_missclassified(self.sentences1, self.sentences2, self.labels, cosine_scores)
        self.save_misclassed(output_path, misclassed)

    labels = np.asarray(self.labels)
    file_output_data = [epoch, steps]

    main_score = None
    # NOTE: "Manhatten" spelling kept for backward compatibility with existing logs/CSVs.
    for name, scores, reverse in [['Cosine-Similarity', cosine_scores, True],
                                  ['Manhatten-Distance', manhattan_distances, False],
                                  ['Euclidean-Distance', euclidean_distances, False]]:
        acc, acc_threshold = self.find_best_acc_and_threshold(scores, labels, reverse)
        f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(scores, labels, reverse)
        # For distance metrics, higher score = less similar, so flip the sign for AP.
        ap = average_precision_score(labels, scores * (1 if reverse else -1))

        logging.info("Accuracy with {}: {:.2f}\t(Threshold: {:.4f})".format(name, acc * 100, acc_threshold))
        logging.info("F1 with {}: {:.2f}\t(Threshold: {:.4f})".format(name, f1 * 100, f1_threshold))
        logging.info("Precision with {}: {:.2f}".format(name, precision * 100))
        logging.info("Recall with {}: {:.2f}".format(name, recall * 100))
        logging.info("Average Precision with {}: {:.2f}\n".format(name, ap * 100))

        file_output_data.extend([acc, acc_threshold, f1, precision, recall, f1_threshold, ap])

        if main_score is None:  # Use AveragePrecision with Cosine-Similarity as main score
            main_score = ap

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        # newline="" prevents csv.writer from emitting blank lines on Windows.
        if not os.path.isfile(csv_path):
            with open(csv_path, mode="w", encoding="utf-8", newline="") as f:
                writer = csv.writer(f)
                writer.writerow(self.csv_headers)
                writer.writerow(file_output_data)
        else:
            with open(csv_path, mode="a", encoding="utf-8", newline="") as f:
                writer = csv.writer(f)
                writer.writerow(file_output_data)

    return main_score
def paired_di(params):
    """Build a sparse (1, n_models * n_features) row of per-model paired
    cosine distances against the sample matrix X."""
    X, models_ = params
    width = X.shape[1]
    out = sp.lil_matrix((1, len(models_) * width))
    for idx, domain_centers in enumerate(models_):
        out[0, idx * width:(idx + 1) * width] = paired_cosine_distances(domain_centers, X)
    return out.tocsr()
def main(args):
    """Zero-shot similarity evaluation of a BERT sentence encoder on a CSV of text pairs.

    Prints the evaluator's Spearman score, caches per-pair cosine predictions
    to disk and returns (predictions, gold similarity values).
    """
    # Read the dataset
    df = pd.read_csv(args.file)

    # Assemble the encoder: BERT word embeddings + mean pooling over tokens.
    embedder = BertWrapper(args.model_path, max_seq_length=256)
    pooler = PoolingLayer(
        embedder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
        layer_to_use=args.layer,
    )
    model = SentenceEncoder(modules=[embedder, pooler])
    model.eval()

    evaluator = EmbeddingSimilarityEvaluator(main_similarity=SimilarityFunction.COSINE)

    if args.t2s:
        # Traditional -> simplified Chinese normalisation.
        df["text_1"] = df["text_1"].apply(convert_t2s)
        df["text_2"] = df["text_2"].apply(convert_t2s)

    # Encode both columns in one pass, then split the result in half.
    combined = model.encode(df["text_1"].tolist() + df["text_2"].tolist(),
                            batch_size=16, show_progress_bar=True)
    pair_count = df.shape[0]
    embeddings1, embeddings2 = combined[:pair_count], combined[pair_count:]

    spearman_score = evaluator(embeddings1, embeddings2, labels=df["similarity"].values)
    print(spearman_score)

    preds = 1 - paired_cosine_distances(embeddings1, embeddings2)
    df["pred"] = preds
    df.to_csv("cache/annotated_zero_shot_pred.csv", index=False)
    print(f"Pred {pd.Series(preds).describe()}")
    return preds, df["similarity"].values
def evaluate(model, loader, distance_fn=lambda a, b: 1 - paired_cosine_distances(a, b), device='cpu'):
    """Evaluate a twin-encoder model on a paired dataloader.

    Embeds both sides of every batch, scores the pairs with `distance_fn`
    and returns the Spearman correlation against the gold labels.
    Restores the model to train mode before returning.
    """
    with torch.no_grad():
        model = model.to(device)
        model.eval()
        a_embeds = []
        b_embeds = []
        labels = []
        for batch in loader:
            a_ids, a_mask, b_ids, b_mask, label = [t.to(device) for t in batch]
            left = model(a_ids, a_mask)
            right = model(b_ids, b_mask)
            a_embeds.extend(left.tolist())
            b_embeds.extend(right.tolist())
            labels.extend(label.tolist())
        model.train()
    # Score all pairs at once, then correlate with the gold labels.
    scores = distance_fn(np.asarray(a_embeds), np.asarray(b_embeds))
    return spearmanr(labels, scores)[0]
def update(self, embeddings, labels, n, **kwargs):
    """Sweep decision thresholds over cosine scores and track the best F1.

    Ranks pairs by cosine similarity (descending); each prefix of the
    ranking corresponds to classifying that prefix as positive. Updates
    the running best precision/recall/threshold and the averaged F1
    statistics (`val`, `sum`, `count`, `avg`).
    """
    scores = 1 - paired_cosine_distances(embeddings[0], embeddings[1])
    ranked = sorted(zip(scores, labels), key=lambda pair: pair[0], reverse=True)
    total_positives = sum(labels)
    extracted = 0
    correct = 0
    # Last element is skipped because the threshold is placed midway
    # between consecutive scores.
    for i, (score, label) in enumerate(ranked[:-1]):
        extracted += 1
        if label == 1:
            correct += 1
        if correct > 0:
            precision = correct / extracted
            recall = correct / total_positives
            f1 = 2 * precision * recall / (precision + recall)
            if f1 > self.best_f1:
                self.best_f1 = f1
                self.best_precision = precision
                self.best_recall = recall
                # Threshold midway between this score and the next one down.
                self.best_threshold = (score + ranked[i + 1][0]) / 2
    labels = [pair[1] for pair in ranked]
    preds = [1 if pair[0] >= self.best_threshold else 0 for pair in ranked]
    assert (len(labels) == len(preds))
    self.val = self.best_f1
    self.sum += self.val * n
    self.count += n
    self.avg = self.sum / self.count
def classify(self, data_v):
    """Nearest-sample similarity scores per class for one feature vector.

    Compares `data_v` against every stored sample (normal + malicious)
    with the configured metric ('cos', 'l1', else euclidean) and returns
    `max_distance - min_distance` per class (higher = closer match).
    Returns a zero vector when fewer than two samples are stored.
    """
    if (len(self.normal_labels) + len(self.mal_labels)) < 2:
        print(
            "Calling classification without enough image samples - returning zero vector"
        )
        return np.zeros(self.number_of_classes)
    # `is None` instead of `== None`: equality against an array-like
    # storage is element-wise and does not yield a usable truth value.
    if self.normal_data_storage is None:
        all_data = self.mal_data_storage
    elif self.mal_data_storage is None:
        all_data = self.normal_data_storage
    else:
        all_data = self.normal_data_storage + self.mal_data_storage
    all_labels = self.normal_labels + self.mal_labels
    data_length = len(all_labels)
    min_distance = np.ones(self.number_of_classes) * self.max_distance
    # Repeat the query vector so it pairs with every stored sample.
    tiled_query = np.tile(data_v, (data_length, 1))
    if self.similarity == 'cos':
        distances = paired_cosine_distances(all_data, tiled_query)
    elif self.similarity == 'l1':
        distances = paired_manhattan_distances(all_data, tiled_query)
    else:
        distances = paired_euclidean_distances(all_data, tiled_query)
    for index_i in range(data_length):
        label = all_labels[index_i]
        if abs(distances[index_i]) < min_distance[label]:
            min_distance[label] = abs(distances[index_i])
    return self.max_distance - min_distance
def raw(args, encoder):
    """Encode CSV text pairs, report Spearman correlation and cache cosine predictions."""
    df = pd.read_csv(args.file)
    if args.t2s:
        # Traditional -> simplified Chinese normalisation.
        df["text_1"] = df["text_1"].apply(convert_t2s)
        df["text_2"] = df["text_2"].apply(convert_t2s)
    pair_count = df.shape[0]
    # Encode both columns in one pass, then split the result in half.
    combined = encoder.encode(
        df["text_1"].tolist() + df["text_2"].tolist(),
        batch_size=32, show_progress_bar=True
    )
    embeddings1 = combined[:pair_count]
    embeddings2 = combined[pair_count:]
    evaluator = EmbeddingSimilarityEvaluator(
        main_similarity=SimilarityFunction.COSINE
    )
    spearman_score = evaluator(
        embeddings1, embeddings2, labels=df["similarity"].values
    )
    print(f"Spearman: {spearman_score:.4f}")
    preds = 1 - paired_cosine_distances(embeddings1, embeddings2)
    df["pred"] = preds
    df.to_csv("cache/annotated_pred.csv", index=False)
    return preds, df["similarity"].values
def __init__(self, data_path, word2vecpath, pkl):
    """Load question pairs, fit TF-IDF, and precompute paired distance features.

    Reads the CSV a single time (the original re-read the same file three
    times) and derives both the dropna'd pair corpus and the fillna'd
    per-column question arrays from the same frame. Behavior is unchanged.
    """
    self.pkl = pkl
    raw_df = pd.read_csv(data_path)
    self.data = raw_df[['question1', 'question2']].dropna()
    # Flatten both question columns into one corpus for the vectorizer.
    self.vectorizer_corpus = np.reshape(
        self.data.values, newshape=[len(self.data.values) * 2])
    self.word2vecModel = Word2Vec.load(word2vecpath)
    self.word2vecModel.init_sims(replace=True)
    # TF-IDF vectors
    self.vectorizer = TfidfVectorizer(max_df=0.5,
                                      max_features=3000,
                                      min_df=3,
                                      lowercase=False,
                                      decode_error='ignore').fit(self.vectorizer_corpus)
    self.x = np.squeeze(raw_df[['question1']].fillna("").values, axis=1)
    self.y = np.squeeze(raw_df[['question2']].fillna("").values, axis=1)
    self.X = self.vectorizer.transform(self.x)
    self.Y = self.vectorizer.transform(self.y)
    self.cosine = pairwise.paired_cosine_distances(self.X, self.Y)
    self.euclidean = pairwise.paired_euclidean_distances(self.X, self.Y)
    self.manhattan = pairwise.paired_manhattan_distances(self.X, self.Y)
    print(self.cosine.shape)
    print(self.euclidean.shape)
    print(self.manhattan.shape)
def HOAD(SimV1, SimV2, k, keval, m):
    """Horizontal anomaly detection via spectral embedding of two similarity views.

    Builds the coupled graph [[SimV1, mI], [mI, SimV2]], takes the k
    eigenvectors of its Laplacian with smallest eigenvalues, and scores
    each node by the cosine similarity between its two view embeddings,
    once for every embedding size in `keval`.
    """
    n = SimV1.shape[0]
    coupling = m * np.identity(n)
    Z = np.block([
        [SimV1, coupling],
        [coupling, SimV2]
    ])
    # Unnormalized graph Laplacian L = D - Z.
    laplacian = np.diag(Z.sum(axis=1)) - Z
    w, vr = eigh(laplacian, eigvals=(0, k - 1))
    adscore = np.zeros((n, len(keval)))
    for col, dim in enumerate(keval):
        view1 = vr[:n, :dim]   # first-view rows of the spectral embedding
        view2 = vr[n:, :dim]   # second-view rows
        adscore[:, col] = 1 - paired_cosine_distances(view1, view2)
    return adscore
def ranked_answers_for_question(self, model, question: str, answers_text: List[str]):
    """Rank candidate answers for `question` by the configured similarity metric.

    Returns the argsort (ascending) over the selected metric's scores;
    each metric is arranged so that a larger value means more similar.
    """
    qas_examples = self.examples_for_q_answers(question, answers_text)
    qas_loader = self._dataloader_from_examples(list(qas_examples), model)
    qas_loader.collate_fn = model.smart_batching_collate
    embeddings1, embeddings2 = paired_embeddings_for_dataloader(
        self.device, model, qas_loader, get_labels=False)
    try:
        # Distances are negated so every metric is "larger = more similar".
        cosine_distances = 1 - paired_cosine_distances(embeddings1, embeddings2)
        manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
        euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
        dot_products = [
            np.dot(e1, e2) for e1, e2 in zip(embeddings1, embeddings2)
        ]
        all_dists = [
            cosine_distances, manhattan_distances, euclidean_distances,
            dot_products
        ]
    except Exception as e:
        # Dump the embeddings to aid debugging, then propagate.
        print(embeddings1)
        print(embeddings2)
        raise e
    return np.argsort(all_dists[int(self.main_similarity.value)])
def raw(args, model):
    """Evaluate the wrapped encoder on an LCQMC split and cache cosine predictions."""
    df = pd.read_csv(f"data/LCQMC/{args.filename}", delimiter="\t",
                     header=None, names=["text_1", "text_2", "label"])
    pair_count = df.shape[0]
    # Encode both columns in one pass, then split the result in half.
    combined = model.encoder.encode(df["text_1"].tolist() + df["text_2"].tolist(),
                                    batch_size=32, show_progress_bar=True)
    embeddings1, embeddings2 = combined[:pair_count], combined[pair_count:]
    evaluator = EmbeddingSimilarityEvaluator(main_similarity=SimilarityFunction.COSINE)
    spearman_score = evaluator(embeddings1, embeddings2, labels=df["label"].values)
    print(f"Spearman: {spearman_score:.4f}")
    preds = 1 - paired_cosine_distances(embeddings1, embeddings2)
    df["pred"] = preds
    df.to_csv(f"cache/{Path(args.filename).stem}_pred.csv", index=False)
    return preds, df["label"].values
def cosDistance(fitted, b, h):
    """Paired cosine distances between TF-IDF vectors of bodies and headlines.

    `fitted` is an already-fitted vectorizer; `b` and `h` are parallel
    sequences of article bodies and headlines. (Removed the original's
    commented-out debug prints.)
    """
    bodyTfidf = fitted.transform(b)
    headlineTfidf = fitted.transform(h)
    return paired_cosine_distances(bodyTfidf, headlineTfidf)
def evaluate_sbert(model, batch_size=16):
    """Evaluate a sentence encoder on the STS-benchmark test split.

    Prints Pearson and Spearman correlations between gold scores and
    cosine, Manhattan, Euclidean and dot-product similarities.
    """
    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
    test_samples = []
    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if row['split'] == 'test':
                # Normalize gold score from 0..5 to 0..1.
                normalized = float(row['score']) / 5.0
                test_samples.append(
                    InputExample(texts=[row['sentence1'], row['sentence2']],
                                 label=normalized))
    sentences1, sentences2, scores = [], [], []
    for example in test_samples:
        sentences1.append((example.texts[0], 'none'))
        sentences2.append((example.texts[1], 'none'))
        scores.append(example.label)
    _, embeddings1 = model.forward(sentences1, checkpoint=False)
    _, embeddings2 = model.forward(sentences2, checkpoint=False)
    labels = scores
    # Distances negated so every metric is "larger = more similar".
    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
    dot_products = [
        np.dot(e1, e2) for e1, e2 in zip(embeddings1, embeddings2)
    ]
    eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
    eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
    eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
    eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
    eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
    eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
    eval_pearson_dot, _ = pearsonr(labels, dot_products)
    eval_spearman_dot, _ = spearmanr(labels, dot_products)
    print("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_cosine, eval_spearman_cosine))
    print("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_manhattan, eval_spearman_manhattan))
    print("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_euclidean, eval_spearman_euclidean))
    print("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_dot, eval_spearman_dot))
def stables(mod_a, mod_b, top_n=20):
    """Return the <top_n> shared vocabulary items MOST similar between the models.

    Words are sorted by cosine similarity in descending order, i.e. the most
    stable words across `mod_a` and `mod_b` come first. (The original
    docstring incorrectly said "lower cosine similarity"; the code sorts
    with reverse=True and thus returns the highest-similarity words.)
    """
    vocab = shared_voc(mod_a, mod_b)
    sims = 1 - paired_cosine_distances(mod_a.wv[vocab], mod_b.wv[vocab])
    sim_by_word = dict(zip(vocab, sims))
    return sorted(sim_by_word, key=sim_by_word.get, reverse=True)[:top_n]
def correlation_score(features, labels, size):
    """Pearson correlation between label agreement and feature cosine similarity.

    Samples 2*size stimuli without replacement, pairs the first half with
    the second half, and correlates same-label indicators with embedding
    cosine similarities.
    """
    from sklearn.metrics.pairwise import paired_cosine_distances
    from scipy.stats import pearsonr
    logging.info("Sampling 2x{} stimuli from a total of {}".format(size, len(labels)))
    indices = np.array(random.sample(range(len(labels)), size * 2))
    sampled_labels = labels[indices]
    sampled_feats = features[indices]
    # True where both halves of a pair carry the same label.
    same_label = sampled_labels[:size] == sampled_labels[size:]
    similarity = 1 - paired_cosine_distances(sampled_feats[:size], sampled_feats[size:])
    return pearsonr(similarity, same_label)[0]
def make_comparison_fig(
        group2embedding_matrices: Dict[str, List[np.ndarray]],
        vmin: float = 0.0,
) -> plt.Figure:
    """
    Returns fig showing similarity matrix of probe similarity matrices of multiple models.

    Fixes two defects in the original: the loop iterated over the undefined
    name `g2embedding_matrices` (NameError — the parameter is
    `group2embedding_matrices`), and `np.bool` (deprecated in numpy 1.20,
    removed in 1.24) is replaced by the builtin `bool`.
    """
    # get a flat list of embedding matrices, preserving group order
    embedding_matrices_flat = []
    group_names_flat = []
    for k, v in sorted(group2embedding_matrices.items()):
        embedding_matrices_flat.extend(v)
        group_names_flat.extend([k] * len(v))

    # Mean pairwise cosine similarity between every pair of spaces.
    avg_sims = np.zeros(
        (len(embedding_matrices_flat), len(embedding_matrices_flat)))
    for i, embeddings_i in enumerate(embedding_matrices_flat):
        for j, embeddings_j in enumerate(embedding_matrices_flat):
            avg_sims[i, j] = 1 - paired_cosine_distances(
                embeddings_i, embeddings_j).mean()

    # fig
    fig, ax = plt.subplots(figsize=config.Fig.fig_size, dpi=config.Fig.dpi)
    mask = np.zeros_like(avg_sims, dtype=bool)
    mask[np.triu_indices_from(mask, 1)] = True
    sns.heatmap(avg_sims,
                ax=ax,
                square=True,
                annot=False,
                annot_kws={"size": 5},
                cbar_kws={"shrink": .5},
                vmin=vmin,
                vmax=1.0,
                cmap='jet')  # , mask=mask

    # colorbar
    cbar = ax.collections[0].colorbar
    cbar.set_ticks([vmin, 1.0])
    cbar.set_ticklabels([str(vmin), '1.0'])
    cbar.set_label('Similarity between Semantic Spaces')

    # ax (needs to be below plot for axes to be labeled)
    ax.set_yticks(np.arange(len(group_names_flat)) + 0.5)
    ax.set_xticks(np.arange(len(group_names_flat)) + 0.5)
    ax.set_yticklabels(group_names_flat, rotation=0)
    ax.set_xticklabels(group_names_flat, rotation=90)
    plt.tight_layout()
    return fig
def raw(args):
    """Score LCQMC pairs with the multilingual Universal Sentence Encoder (TF1/hub)."""
    df = pd.read_csv(f"data/LCQMC/{args.filename}", delimiter="\t",
                     header=None, names=["text_1", "text_2", "label"])
    g = tf.Graph()
    with g.as_default():
        text_input = tf.placeholder(dtype=tf.string, shape=[None])
        embed = hub.Module(
            "https://tfhub.dev/google/universal-sentence-encoder-multilingual/1"
        )
        embedded_text = embed(text_input)
        init_op = tf.group(
            [tf.global_variables_initializer(), tf.tables_initializer()])
    g.finalize()
    # Initialize session.
    session = tf.Session(graph=g)
    session.run(init_op)
    # Compute embeddings batch by batch for both sides of every pair.
    embs_1, embs_2 = [], []
    for start in range(0, len(df), args.batch_size):
        stop = start + args.batch_size
        embs_1.append(
            session.run(embedded_text,
                        feed_dict={text_input: df.text_1.values[start:stop]}))
        embs_2.append(
            session.run(embedded_text,
                        feed_dict={text_input: df.text_2.values[start:stop]}))
    embeddings1 = np.concatenate(embs_1)
    embeddings2 = np.concatenate(embs_2)
    evaluator = EmbeddingSimilarityEvaluator(
        main_similarity=SimilarityFunction.COSINE)
    spearman_score = evaluator(embeddings1, embeddings2, labels=df["label"].values)
    print(f"Spearman: {spearman_score:.4f}")
    preds = 1 - paired_cosine_distances(embeddings1, embeddings2)
    df["pred"] = preds
    df.to_csv(f"cache/{Path(args.filename).stem}_pred.csv", index=False)
    return preds, df["label"].values
def getRecommendation(data):
    """Recommend the 10 restaurants whose weight vectors are closest (cosine) to the user's.

    `data` is a JSON string containing a 'pic' list of picture ids; the user
    profile is the element-wise mean of the selected pictures' weight
    vectors. Returns {'restaurants': [...]} ordered most- to least-similar.
    """
    dataDict = json.loads(data)
    length = len(dataDict['pic'])
    weights = np.array([
        sum(col) / length
        for col in zip(*[pictureDict[int(picdata)]['weights']
                         for picdata in dataDict['pic']])
    ]).reshape(1, -1)
    # Min-heap of (-distance, name): popping when size > 10 keeps the 10
    # smallest distances (most similar restaurants).
    minHeap = []
    for restaurant, thatWeights in restaurantWeights.items():
        # float(...[0]) extracts a plain scalar: the original pushed a
        # 1-element numpy array into the heap tuple, making heap-entry
        # comparisons depend on fragile array truthiness.
        dist = float(paired_cosine_distances(
            weights, np.array(thatWeights).reshape(1, -1))[0])
        heapq.heappush(minHeap, (-dist, restaurant))
        if len(minHeap) > 10:
            heapq.heappop(minHeap)
    res = []
    while minHeap:
        res.append(heapq.heappop(minHeap)[1])
    # Heap pops worst-first, so reverse to get best-first.
    return {'restaurants': [getElement(i) for i in res[::-1]]}
def get_dists(gen, fname):
    """Save cosine distances between query/document term vectors to an .npz file.

    Each output row is the first three index fields followed by the
    distance. Returns early (writing nothing) when the generator is empty.
    """
    queries, documents, index_rows = [], [], []
    for qdi in gen:
        queries.append(' '.join(qdi.q_terms))
        documents.append(' '.join(qdi.d_terms))
        index_rows.append(qdi.ixs[:3])
    if not queries:
        return
    query_vectors = model.transform(queries)
    doc_vectors = model.transform(documents)
    dists = paired_cosine_distances(query_vectors, doc_vectors)
    np.savez(fname, vals=np.hstack([index_rows, np.array([dists]).T]))
def __call__(self, model: 'SequentialSentenceEmbedder', output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Compute MSE and MAE between gold labels and paired cosine similarities.

    Embeds both sides of every batch with gradients disabled, scores the
    pairs with cosine similarity and returns (mse_cosine, mae_cosine).
    NOTE(review): the declared return type `float` does not match the tuple
    actually returned; kept as-is for caller compatibility.
    """
    model.eval()
    embeddings1 = []
    embeddings2 = []
    labels = []
    if epoch != -1:
        if steps == -1:
            out_txt = f" after epoch {epoch}:"
        else:
            out_txt = f" in epoch {epoch} after {steps} steps:"
    else:
        out_txt = ":"
    logging.info("Evaluation the model on " + self.name + " dataset" + out_txt)
    self.dataloader.collate_fn = model.smart_batching_collate
    iterator = self.dataloader
    if self.show_progress_bar:
        iterator = tqdm(iterator, desc="Convert Evaluating")
    for step, batch in enumerate(iterator):
        features, label_ids = batch_to_device(batch, self.device)
        with torch.no_grad():
            emb1, emb2 = [
                model(sent_features)['sentence_embedding'].to("cpu").numpy()
                for sent_features in features
            ]
        labels.extend(label_ids.to("cpu").numpy())
        embeddings1.extend(emb1)
        embeddings2.extend(emb2)
    try:
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    except Exception:
        # Dump the embeddings for debugging, then re-raise with the original
        # traceback (bare `raise` instead of the original `raise(e)`).
        print(embeddings1)
        print(embeddings2)
        raise
    mse_cosine = mean_squared_error(labels, cosine_scores)
    mae_cosine = mean_absolute_error(labels, cosine_scores)
    return (mse_cosine, mae_cosine)
def update(self, embeddings, labels, n, **kwargs):
    """Accumulate average precision over cosine-similarity scores.

    Updates the running statistics `val`, `sum`, `count`, `avg`, and
    optionally stores raw predictions/labels. The original built a
    score-sorted copy of the pairs that was never used — average precision
    is order-independent — so that dead code has been removed; the scores
    and labels are consumed in their original order, exactly as before.
    """
    assert embeddings.shape[0] == 2
    scores = 1 - paired_cosine_distances(embeddings[0], embeddings[1])
    if self.return_predictions:
        self.all_predictions.extend(scores)
        self.all_labels.extend(labels)
    self.val = metrics.average_precision_score(labels, scores)
    self.sum += self.val * n
    self.count += n
    self.avg = self.sum / self.count
def kernel_function(self, x1, x2):
    """Build a feature vector of kernel/distance statistics for a vector pair.

    Combines linear distances, rank/linear correlation coefficients
    (NaN-safe: fall back to 0) and non-linear kernel values between
    `x1` and `x2` (each a 1xN matrix-like input).
    """
    features = []
    # --- linear kernel ---
    # Cosine distance
    features.append(np.squeeze(1 - pairwise.paired_cosine_distances(x1, x2)[0]))
    # Manhattan distance
    features.append(pairwise.paired_manhattan_distances(x1, x2)[0])
    # Euclidean distance
    features.append(pairwise.paired_euclidean_distances(x1, x2)[0])
    # Chebyshev distance
    features.append(pairwise.pairwise_distances(x1, x2, metric="chebyshev")[0][0])
    # --- stat kernel ---
    # Pearson coefficient
    pearson = stats.pearsonr(np.squeeze(np.asarray(x1)),
                             np.squeeze(np.asarray(x2)))[0]
    features.append(0 if np.isnan(pearson) else pearson)
    # Spearman coefficient
    spearman = stats.spearmanr(x1, x2, axis=1).correlation
    features.append(0 if np.isnan(spearman) else spearman)
    # Kendall tau coefficient
    kendall = stats.kendalltau(x1, x2).correlation
    features.append(0 if np.isnan(kendall) else kendall)
    # --- non-linear kernel ---
    features.append(pairwise.polynomial_kernel(x1, x2, degree=2)[0][0])
    features.append(pairwise.rbf_kernel(x1, x2)[0][0])
    features.append(pairwise.laplacian_kernel(x1, x2)[0][0])
    features.append(pairwise.sigmoid_kernel(x1, x2)[0][0])
    return features
def evaluate(wpairs, inv, M):
    """Cosine similarities for word pairs; pairs with missing vectors get 0.5.

    Looks up each word's vector; when both vectors of a pair exist, the
    pair's slot is overwritten with 1 - cosine distance, otherwise the
    default 0.5 is kept.
    """
    A = np.full(shape=len(wpairs), fill_value=0.5)
    vec_pairs = []  # resolved (u, v) vector pairs
    found = []      # per-pair mask: True when both vectors exist
    for w1, w2 in wpairs:
        u = get_vec(w1, inv, M)
        v = get_vec(w2, inv, M)
        if u is None or v is None:
            found.append(False)
        else:
            found.append(True)
            vec_pairs.append((u, v))
    print(found.count(False), 'missing results (use 0.5)', file=sys.stderr)
    I, J = map(vstack, zip(*vec_pairs))
    dst = paired_cosine_distances(I, J)
    A[np.array(found)] = 1 - dst
    return A
def _get_vectors(self, df):
    """Attach mean word2vec embeddings and paired distance features to `df`.

    Builds an averaged word vector for the x/y lemma columns of every row,
    concatenates both embedding frames onto `df`, and adds cosine and
    euclidean paired-distance columns between them.
    """
    def mean_vector(lemmatized_text):
        # Seed with a zero vector so `sum` has a well-defined start; the
        # denominator subtracts 1 to discount it (1e-25 avoids div-by-zero
        # when no word was found).
        res = list([
            np.zeros(self.word2vec_vector_length),
        ])
        for word in lemmatized_text:
            try:
                res.append(self.word2vec_model[word])
            except KeyError:
                # Fall back to a synonym key when the word itself is OOV.
                second_candidate = self._synonyms_vocabulary.get(word)
                if second_candidate:
                    res.append(self.word2vec_model[second_candidate])
                elif self.word2vec_stopwords and ('NOUN' in word or 'VERB' in word):
                    # print(f'There is no "{word}" in vocabulary of the given model; ommited', file=sys.stderr)
                    pass
        mean = sum(np.array(res)) / (len(res) - 1 + 1e-25)
        return mean

    if not self.word2vec_stopwords:
        df.lemmas_x = df.lemmas_x.map(self._remove_stop_words)
        df.lemmas_y = df.lemmas_y.map(self._remove_stop_words)

    # Add the required UPoS postags (as in the rusvectores word2vec model's vocabulary)
    if self.word2vec_tag_required:
        df.lemmas_x = df.snippet_x_locs.map(self._tag_postags_morph)
        df.lemmas_y = df.snippet_y_locs.map(self._tag_postags_morph)

    # Make two dataframes with average vectors for x and y,
    # merge them with the original dataframe
    df_embed_x = df.lemmas_x.apply(mean_vector).values.tolist()
    df_embed_y = df.lemmas_y.apply(mean_vector).values.tolist()
    embeddings = pd.DataFrame(df_embed_x).merge(pd.DataFrame(df_embed_y),
                                                left_index=True,
                                                right_index=True)
    embeddings['cos_embed_dist'] = paired_cosine_distances(
        df_embed_x, df_embed_y)
    embeddings['eucl_embed_dist'] = paired_euclidean_distances(
        df_embed_x, df_embed_y)
    df = pd.concat(
        [df.reset_index(drop=True), embeddings.reset_index(drop=True)],
        axis=1)
    return df
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Binary-accuracy evaluation of `model` on paired sentences.

    Encodes both sentence lists, scores pairs with cosine / Manhattan /
    Euclidean metrics, logs best-threshold accuracy, F1, precision, recall
    and average precision for each, and returns the cosine-similarity AP.

    Removed from the original: a leftover debug loop that printed every
    sentence pair and re-ran the model on each pair individually, discarding
    the result (pure overhead plus console spam).
    """
    if epoch != -1:
        if steps == -1:
            out_txt = f" after epoch {epoch}:"
        else:
            out_txt = f" in epoch {epoch} after {steps} steps:"
    else:
        out_txt = ":"
    logger.info("Binary Accuracy Evaluation of the model on " + self.name + " dataset" + out_txt)

    embeddings1 = model.encode(self.sentences1, batch_size=self.batch_size,
                               show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
    embeddings2 = model.encode(self.sentences2, batch_size=self.batch_size,
                               show_progress_bar=self.show_progress_bar, convert_to_numpy=True)

    cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
    manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)

    labels = np.asarray(self.labels)
    file_output_data = [epoch, steps]

    main_score = None
    # NOTE: "Manhatten" spelling kept for backward compatibility with existing logs.
    for name, scores, reverse in [['Cosine-Similarity', cosine_scores, True],
                                  ['Manhatten-Distance', manhattan_distances, False],
                                  ['Euclidean-Distance', euclidean_distances, False]]:
        acc, acc_threshold = self.find_best_acc_and_threshold(scores, labels, reverse)
        f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(scores, labels, reverse)
        # For distance metrics, higher score = less similar, so flip the sign for AP.
        ap = average_precision_score(labels, scores * (1 if reverse else -1))

        logger.info("Accuracy with {}: {:.2f}\t(Threshold: {:.4f})".format(name, acc * 100, acc_threshold))
        logger.info("F1 with {}: {:.2f}\t(Threshold: {:.4f})".format(name, f1 * 100, f1_threshold))
        logger.info("Precision with {}: {:.2f}".format(name, precision * 100))
        logger.info("Recall with {}: {:.2f}".format(name, recall * 100))
        logger.info("Average Precision with {}: {:.2f}\n".format(name, ap * 100))

        file_output_data.extend([acc, acc_threshold, f1, precision, recall, f1_threshold, ap])

        if main_score is None:  # Use AveragePrecision with Cosine-Similarity as main score
            main_score = ap

    return main_score
def main(args):
    """Score tab-separated src/tgt sentence pairs with cosine similarity.

    Reads `args.input_file` (id, src, tgt[, program] per line, optionally
    truncated at `args.subsample`), encodes both sides with a
    SentenceTransformer and writes one scored line per pair to
    `args.output_file`.
    """
    model = SentenceTransformer(args.model_name)
    if args.device == 'cuda' and torch.cuda.is_available():
        model.cuda()

    ids, src_sentences, tgt_sentences, programs = [], [], [], []
    with open(args.input_file, 'r') as fin:
        for i, line in enumerate(fin):
            row = [part.strip() for part in line.split('\t')]
            ids.append(row[0])
            src_sentences.append(row[1])
            tgt_sentences.append(row[2])
            if len(row) > 3:
                programs.append(row[3])
            if args.subsample != -1 and i >= args.subsample:
                break

    embeddings1 = model.encode(src_sentences, batch_size=args.batch_size,
                               show_progress_bar=True, convert_to_numpy=True)
    embeddings2 = model.encode(tgt_sentences, batch_size=args.batch_size,
                               show_progress_bar=True, convert_to_numpy=True)
    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))

    with open(args.output_file, 'w') as fout:
        for i in range(len(ids)):
            prog = programs[i] if programs else None
            fields = [ids[i], src_sentences[i], tgt_sentences[i],
                      '{:0.4f}'.format(cosine_scores[i]),
                      prog if prog else '']
            fout.write('\t'.join(fields) + '\n')
def test_neuron_cosine_sim(X_neuron, adv_sig, neuron_mask=None):
    """Cosine similarity between each flattened activation row and the adversarial signature.

    `neuron_mask` is accepted for interface compatibility but unused — the
    masking code was already commented out in the original and has been
    removed here, along with the redundant `X_neuron_mask` alias.
    """
    nb_sample = X_neuron.shape[0]
    # Broadcast the signature so every sample row is paired with it.
    adv_sig_repeat = np.repeat(np.expand_dims(adv_sig, axis=0), nb_sample, axis=0)
    adv_sig_flatten = np.reshape(adv_sig_repeat, (nb_sample, -1))
    X_flatten = np.reshape(X_neuron, (nb_sample, -1))
    cosine_sim = 1 - paired_cosine_distances(X_flatten, adv_sig_flatten)
    return cosine_sim
def predict(self, data_df):
    """Return cosine similarity scores for the `text_a`/`text_b` pairs in `data_df`.

    Raises KeyError when either required column is missing.
    """
    # Guard clause: bail out early when the required columns are absent.
    if "text_a" not in data_df.columns or "text_b" not in data_df.columns:
        raise KeyError(
            'Prediction data processing - Required columns not found!')
    if self.args.do_lower_case:
        data_df.loc[:, 'text_a'] = data_df['text_a'].str.lower()
        data_df.loc[:, 'text_b'] = data_df['text_b'].str.lower()
    # Compute embedding for text pairs
    embeddings1 = self.model.encode(list(data_df["text_a"]), convert_to_numpy=True)
    embeddings2 = self.model.encode(list(data_df["text_b"]), convert_to_numpy=True)
    return 1 - paired_cosine_distances(embeddings1, embeddings2)
def predict(self, to_predict, verbose=True):
    """Cosine similarity for each (text_1, text_2) pair in `to_predict`."""
    first_texts, second_texts = [], []
    # Single pass so `to_predict` may be any iterable of 2-tuples.
    for first, second in to_predict:
        first_texts.append(first)
        second_texts.append(second)
    embeddings1 = self.encode(first_texts, show_progress_bar=verbose,
                              convert_to_numpy=True)
    embeddings2 = self.encode(second_texts, show_progress_bar=verbose,
                              convert_to_numpy=True)
    return 1 - paired_cosine_distances(embeddings1, embeddings2)