def __init__(self, config):
    """Assemble the sub-modules of the dependency baseline model.

    Args:
        config: mapping indexed with the module-level keys ``DEVICE``,
            ``LSTM_HID``, ``USE_SEP_ENCODER`` and ``DROPOUT``. The same object
            is forwarded to ``EduEmbeddingModel`` and ``LMDecodingModel``.
    """
    super(DependencyBaselineModel, self).__init__()
    self.device = config[DEVICE]
    # Hidden width is twice the configured LSTM size
    # (presumably a bidirectional encoder -- TODO confirm).
    self.hid_dim = config[LSTM_HID] * 2
    self.num_lstm_pointer = 1
    self.config = config
    # Summed (not averaged) cross-entropy; targets equal to -100 are skipped.
    self.ptr_criterion = nn.CrossEntropyLoss(reduction="sum", ignore_index=-100)
    # Cosine similarity taken along dimension 2 of its inputs.
    self.link_predictor = CosineSimilarity(dim=2)
    # Two-layer MLP that maps a hid_dim vector to a single score.
    self.root_clf = nn.Sequential(nn.Linear(self.hid_dim, self.hid_dim), nn.GELU(), nn.Linear(self.hid_dim, 1))
    self.edu_embed_model = EduEmbeddingModel(config)
    self.lm_decoder = LMDecodingModel(config)
    # NOTE(review): the flattened source does not show where this `if` ends;
    # only the pointer network is assumed to be conditional -- confirm.
    if not self.config[USE_SEP_ENCODER]:
        self.pointer_net = PointerNet(self.hid_dim, \
                                      self.hid_dim, \
                                      self.num_lstm_pointer, \
                                      self.config[DROPOUT])
    # Learnable (1, hid_dim) vector initialised uniformly in [0, 1).
    self.root_embed = nn.Parameter(th.rand(1, self.hid_dim), requires_grad=True)
    self.alpha = 0.5
def evaluate_embeddings(embedding, vocab: Vocabulary):
    """Score an embedding table against the SimLex-999 word pairs.

    Every pair returned by ``read_simlex999()`` receives a predicted
    similarity: the cosine between the two rows of ``embedding.weight``, or
    ``0.`` when either word maps to index 1 in the ``'token_in'`` namespace
    (treated here as out-of-vocabulary).

    Args:
        embedding: object exposing a ``weight`` matrix indexable by token id.
        vocab: vocabulary providing ``get_token_index(word, namespace)``.

    Returns:
        The result of ``spearmanr`` between predicted and gold similarities.
    """
    cosine = CosineSimilarity(dim=0)
    simlex999 = read_simlex999()

    predicted = []
    missing = 0
    for first, second, _ in simlex999:
        first_id = vocab.get_token_index(first, 'token_in')
        known = first_id != 1
        if known:
            # The second word is only looked up when the first one is known.
            second_id = vocab.get_token_index(second, 'token_in')
            known = second_id != 1

        if known:
            score = cosine(embedding.weight[first_id],
                           embedding.weight[second_id]).item()
            predicted.append(score)
        else:
            predicted.append(0.)
            missing += 1

    assert len(predicted) == len(simlex999)
    print('# of OOV words: {} / {}'.format(missing, len(simlex999)))

    gold = [sim for _, _, sim in simlex999]
    return spearmanr(predicted, gold)
def __init__(self, data):
    """Create the two BERT embedders and the similarity used to compare them.

    Args:
        data: settings object; ``data.bert_path`` and ``data.requires_grad``
            are passed to both ``BertCharEmbedding`` instances and the object
            itself is kept as ``self.args``.
    """
    super(Bert_Comparing, self).__init__()
    # Separate embedders for questions and for paths; both are built from the
    # same path/flag but are distinct objects.
    self.question_bert_embedding = BertCharEmbedding(data.bert_path, data.requires_grad)
    self.path_bert_embedding = BertCharEmbedding(data.bert_path, data.requires_grad)
    self.args = data
    # Cosine similarity along dimension 1 of its inputs.
    self.similarity = CosineSimilarity(dim=1)
def computeTask_(index, symptom, combinedOutputFolder, meanEmb, similarityThreshold):
    """Return the tokens of one embedding shard that resemble ``meanEmb``.

    The shard ``<combinedOutputFolder>/<index + 6>.pkl`` is expected to hold a
    dict with the keys ``'id'``, ``'token'`` and ``'emb'``. The first two are
    indexed with an integer array, so they are assumed to be NumPy arrays
    aligned with the rows of ``'emb'`` -- TODO confirm against the writer.

    Args:
        index: shard number; the file actually read is ``index + 6``.
        symptom: kept for interface compatibility; not used by the computation.
        combinedOutputFolder: directory that contains the shard files.
        meanEmb: NumPy vector that every shard embedding is compared with.
        similarityThreshold: only rows whose rounded cosine similarity is
            strictly greater than this value are returned.

    Returns:
        A list of ``(token, similarity, id)`` tuples, in shard order.
    """
    cos = CosineSimilarity(dim=1, eps=1e-6)

    filename = os.path.join(combinedOutputFolder, f"{index+6}.pkl")
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # function at shard files produced by a trusted pipeline.
    with open(filename, 'rb') as handle:  # close the file deterministically
        subDict = pickle.load(handle)
    IDList = subDict['id']
    tokenList = subDict['token']
    embList = subDict['emb']

    # (1, dim) query broadcast against the (n, dim) shard -> n similarities.
    arrA = torch.from_numpy(meanEmb.reshape(1, -1))
    arrB = torch.from_numpy(embList)
    sim = cos(arrA, arrB).cpu().numpy().reshape(-1)
    sim = np.round(sim, 4)

    # Positions of the rows above the threshold (kept in a new name so the
    # ``index`` argument is not overwritten).
    selected = np.flatnonzero(sim > similarityThreshold)
    return list(zip(tokenList[selected], sim[selected], IDList[selected]))
def create_classes_data_frame(dataset_name, distance="cosine", tsne_dimension=2):
    """Create a new classes dataframe for the specified dataset.

    The dataset must be registered in the project settings (``DATA_SOURCES``).
    The data frame is pickled to ``classes.pickle`` inside the dataset's image
    directory before the function returns, to prevent re-calculating things.

    Args:
        dataset_name: the name of the dataset
        distance: which function is used to compare every class vector with
            every other one. ``'pairwise'`` selects the Euclidean
            ``PairwiseDistance``; any other value selects ``CosineSimilarity``
            (Default value = "cosine")
        tsne_dimension: the dimensions for the lower dimensional vector
            projections (Default value = 2)

    Returns:
        a pandas DataFrame. It is created with the columns "class", "vector"
        and "tsne", but the code below fills "classes", "vectors", "distances"
        and "tsne".
        NOTE(review): "class" and "vector" are never written -- confirm which
        column names downstream readers expect before renaming anything.
    """
    dataset_dir = DATA_SOURCES[dataset_name]["images"]
    paths = classes_set(dataset_dir)

    classes = pd.DataFrame(columns=["class", "vector", "tsne"])
    classes["classes"] = sorted(list(paths))

    tqdm.pandas(desc="Removing special characters.")
    classes["classes"] = classes["classes"].progress_apply(lambda cls: " ".join(re.split(r"[_\-]", cls)))

    tqdm.pandas(desc="Applying full clean.")
    classes["classes"] = classes["classes"].progress_apply(full_clean)

    tqdm.pandas(desc="Creating document vectors.")
    vectors = torch.tensor(np.vstack(classes["classes"].progress_apply(document_vector)))
    classes["vectors"] = vectors

    p_dist = PairwiseDistance(p=2) if distance == "pairwise" else CosineSimilarity()
    classes["distances"] = p_dist(  # distance from every node to every node
        vectors.repeat_interleave(vectors.shape[0], 0),  # each index repeated num_edges times
        vectors.repeat(vectors.shape[0], 1),  # the index range repeated num_edges times
    ).reshape(
        vectors.shape[0], -1
    )  # convert to 2D matrix with shape [vectors.shape[0], vectors.shape[0]]

    classes["tsne"] = torch.tensor(TSNE(n_components=tsne_dimension).fit_transform(vectors))

    # Write the cache through a context manager so the handle is flushed and
    # closed even if pickling fails part-way.
    with open(os.path.join(dataset_dir, "classes.pickle"), "wb") as handle:
        pickle.dump(classes, handle)
    return classes
def __init__(self, input_size: int, dropout: float = None): super().__init__(input_size, dropout, key='cosine-regression', supports_compressed_streamlines=False, loss_description='negative cosine similarity') # Loss will be applied on the last dimension. self.loss = CosineSimilarity(dim=-1)
def getSimilarWords(tokenizer, combinedOutputFolder, symptom, meanEmb,
                    similarityThreshold=0.3, numThreshold=150000, numComp=10000):
    """Collect tokens whose embedding is close to ``meanEmb`` across shards.

    Shards are read as ``<combinedOutputFolder>/<i>.pkl`` for
    ``i = 0 .. number of directory entries - 1``. Each shard is expected to
    hold a dict with the keys ``'id'``, ``'token'`` and ``'emb'``; the first
    two are indexed with an integer array, so they are assumed to be NumPy
    arrays aligned with the rows of ``'emb'`` -- TODO confirm.

    Args:
        tokenizer: kept for interface compatibility; not used.
        combinedOutputFolder: directory containing the numbered shard files.
        symptom: kept for interface compatibility; not used.
        meanEmb: NumPy vector every shard embedding is compared with.
        similarityThreshold: rows are kept when their rounded cosine
            similarity is strictly greater than this value.
        numThreshold: stop once the running examined count reaches this value.
        numComp: amount added to the examined count for every shard read.

    Returns:
        A list of ``(token, similarity, id)`` tuples in shard order.
    """
    output = []
    fileList = os.listdir(combinedOutputFolder)
    cos = CosineSimilarity(dim=1, eps=1e-6)

    # The query never changes, so move it to the module-level ``device`` once.
    # An explicit float32 dtype (instead of torch.cuda.FloatTensor) keeps the
    # function usable when ``device`` is not a CUDA device.
    arrA = torch.from_numpy(meanEmb.reshape(1, -1)).to(device=device, dtype=torch.float32)

    examineCount = 0
    for i in tqdm(range(len(fileList))):
        if examineCount >= numThreshold:
            break
        filename = os.path.join(combinedOutputFolder, f"{i}.pkl")
        # NOTE(security): pickle.load can execute arbitrary code; only read
        # shard files produced by a trusted pipeline.
        with open(filename, 'rb') as handle:  # do not leak one handle per shard
            subDict = pickle.load(handle)
        IDList = subDict['id']
        tokenList = subDict['token']
        embList = subDict['emb']

        arrB = torch.from_numpy(embList).to(device=device, dtype=torch.float32)
        sim = cos(arrA, arrB).cpu().numpy().reshape(-1)
        del arrB  # release the shard tensor before the next one is loaded
        sim = np.round(sim, 4)

        selected = np.flatnonzero(sim > similarityThreshold)
        out = list(zip(tokenList[selected], sim[selected], IDList[selected]))
        print(len(out))
        output += out
        examineCount += numComp
    return output
def loss_function(origin, target, random_1, random_2, random_3, random_4):
    """Negative mean log-probability of ``target`` among five candidates.

    The cosine similarity (along dim 1) between ``origin`` and each of the
    five candidates is arranged as one row per sample with ``target`` in
    column 0. A log-softmax is taken over each row and the column-0 entries
    are averaged and negated.

    Args:
        origin: anchor tensor; dim 1 is the feature dimension.
        target: the positive candidate, same shape as ``origin``.
        random_1, random_2, random_3, random_4: the other four candidates.

    Returns:
        A scalar tensor.
    """
    similarity = CosineSimilarity(dim=1, eps=1e-6)
    candidates = (target, random_1, random_2, random_3, random_4)

    # One column per candidate -> (batch_size, 5), positive first.
    scores = torch.stack([similarity(origin, item) for item in candidates], dim=1)

    log_probs = LogSoftmax(dim=1)(scores)
    return -log_probs[:, 0].mean()
def get_synonyms(token: str, embedding: Model, vocab: Vocabulary, num_synonyms: int = 10):
    """Return the ``num_synonyms`` vocabulary entries most similar to ``token``.

    Similarity is the cosine between the row of ``embedding.weight`` for
    ``token`` and the row for every index listed in the ``'token_in'``
    namespace of ``vocab`` (the query token itself is included).

    Returns:
        A list of ``(token, similarity)`` pairs, most similar first.
    """
    query_id = vocab.get_token_index(token, 'token_in')
    query_vec = embedding.weight[query_id]
    cosine = CosineSimilarity(dim=0)

    index_to_token = vocab.get_index_to_token_vocabulary('token_in')
    sims = Counter({
        candidate: cosine(query_vec, embedding.weight[index]).item()
        for index, candidate in index_to_token.items()
    })
    return sims.most_common(num_synonyms)
def __init__(self, input_size: int, dropout: float = None):
    """Create a two-layer projection to 3 outputs and a cosine loss.

    Args:
        input_size: number of input features of the first linear layer.
        dropout: forwarded unchanged to the parent constructor.
    """
    # Prepare the dropout, Relu, loop:
    super().__init__(dropout)

    # Layers: input_size -> ceil(input_size / 2) -> 3
    hidden_size = ceil(input_size / 2)
    h1 = Linear(input_size, hidden_size)
    h2 = Linear(hidden_size, 3)
    # NOTE(review): a plain Python list is stored here; whether the parent
    # class registers these layers as sub-modules is not visible -- confirm.
    self.layers = [h1, h2]

    # Loss will be applied on the last dimension.
    self.loss = CosineSimilarity(dim=-1)
def __init__(self, tokenizer, **kw):
    """Load a bert-tiny encoder/decoder pair sized to ``tokenizer``.

    Args:
        tokenizer: tokenizer whose length determines the size of both
            token-embedding tables; also stored on the instance.
        **kw: forwarded unchanged to the parent constructor.
    """
    super().__init__(**kw)
    self.encoder = BertModel.from_pretrained('prajjwal1/bert-tiny')
    self.decoder = AutoModelForCausalLM.from_pretrained(
        'prajjwal1/bert-tiny',
    )
    # The flag is set on the config after the decoder has been loaded.
    self.decoder.config.is_decoder = True
    self.tokenizer = tokenizer
    # Resize both embedding tables to the tokenizer's length.
    self.encoder.resize_token_embeddings(len(self.tokenizer))
    self.decoder.resize_token_embeddings(len(self.tokenizer))
    # Cosine similarity with the library's default arguments.
    self.cs = CosineSimilarity()
def __init__(
    self,
    collection_name: str,
    stopwords_list,
    text_transformer: TextTransformer,
    weighting_model: str = "tw-idf",
):
    """Build the underlying collection and keep the scoring settings.

    Args:
        collection_name: passed to ``Collection`` as its first argument.
        stopwords_list: passed to ``Collection`` and stored as
            ``self.stopwords``.
        text_transformer: passed to ``Collection`` and stored privately.
        weighting_model: name of the weighting scheme, passed to
            ``Collection`` and stored (default ``"tw-idf"``).
    """
    self.collection = Collection(collection_name, stopwords_list, weighting_model, text_transformer)
    self.weighting_model = weighting_model
    self.stopwords = stopwords_list
    self.__text_transformer = text_transformer
    # Cosine similarity along dimension 0 of its inputs.
    self.__cos = CosineSimilarity(dim=0, eps=1e-6)
class TripletMarginCosLoss(Function):
    """Triplet hinge loss computed on cosine similarities.

    For every triplet the penalty is
    ``max(0, margin - cos(anchor, positive) + cos(anchor, negative))``;
    the penalties are averaged over the batch.
    """

    def __init__(self, margin):
        super(TripletMarginCosLoss, self).__init__()
        self.margin = margin
        # Cosine similarity along dim 1 (the attribute keeps its legacy name).
        self.pdist = CosineSimilarity(dim=1, eps=1e-6)

    def forward(self, anchor, positive, negative):
        """Return the mean hinge penalty for a batch of triplets."""
        similarity = self.pdist.forward
        sim_positive = similarity(anchor, positive)
        sim_negative = similarity(anchor, negative)
        hinge = (self.margin - sim_positive + sim_negative).clamp(min=0.0)
        return hinge.mean()
def get_synonyms(self, token: str, num_synonyms: int = 30):
    """Return the ``num_synonyms`` target-vocabulary tokens closest to ``token``.

    Uses ``self._model.vocab`` and the rows of
    ``self._model.embedding_target.weight``. Every index listed in the
    ``'token_target'`` namespace is scored by cosine similarity against the
    row for ``token`` (which is itself part of the ranking).

    Returns:
        A list of ``(token, similarity)`` pairs, most similar first.
    """
    model = self._model
    vocab = model.vocab
    weights = model.embedding_target.weight

    query_vec = weights[vocab.get_token_index(token, 'token_target')]
    cosine = CosineSimilarity(dim=0)

    sims = Counter()
    entries = vocab.get_index_to_token_vocabulary('token_target')
    for row, candidate in entries.items():
        sims[candidate] = cosine(query_vec, weights[row]).item()
    return sims.most_common(num_synonyms)
def __init__(self, args, text_data, writer, summary_dir, out):
    """Store the run configuration and set up the configured attack.

    Args:
        args: settings object; ``args.attack`` selects the attack.
        text_data: stored as ``self.text_data``.
        writer: stored as ``self.writer``.
        summary_dir: stored as ``self.summary_dir``.
        out: stored as ``self.out``.
    """
    self.args = args
    self.text_data = text_data
    self.writer = writer
    self.summary_dir = summary_dir
    self.out = out
    # English stop-word list, converted to a set.
    self.stopwords = set(stopwords.words('english'))
    self.mode = None
    self.samples = None
    self.cosine_similarity = CosineSimilarity(dim=1, eps=1e-6)
    self.global_step = 0
    if self.args.attack == 'deepfool':
        self.attack = DeepFool(args=self.args, num_classes=2, max_iters=20)
    else:
        # NOTE(review): only a message is printed here; ``self.attack`` is
        # left undefined for any other attack name.
        print('Attack {} not recognized'.format(self.args.attack))
def __init__(self, vocab_size, embedding_dim):
    """Initializes model layers.

    Args:
        vocab_size (int): Number of tokens in corpus. This is used to init
            embeddings.
        embedding_dim (int): Dimension of embedding vector.
    """
    super().__init__()
    self._embedding_dim = embedding_dim
    # Two separate encoders built with identical arguments.
    self.encoder_in = Encoder(vocab_size, embedding_dim)
    self.encoder_out = Encoder(vocab_size, embedding_dim)
    # Square, bias-free linear map on embedding_dim-sized vectors.
    self.linear = Linear(embedding_dim, embedding_dim, bias=False)
    # Both operate along dimension 2 of their inputs.
    self.similarity = CosineSimilarity(dim=2)
    self.softmax = Softmax(dim=2)
def forward(self, input, hidden, encoder_outputs):
    """Run one decoding step with cosine-similarity attention.

    Shapes below follow the original author's notes and the reshapes in the
    code: ``encoder_outputs`` is ``(max_length, batch, hidden)`` and the GRU
    output for this single step is ``(1, batch, hidden)``.

    Args:
        input: token ids for the current step (reshaped to
            ``(1, self.batch_Size, -1)`` after embedding).
        hidden: previous GRU hidden state.
        encoder_outputs: encoder states to attend over.

    Returns:
        ``(log_probs, hidden, attn_weights)`` where ``log_probs`` is the
        log-softmax of the output layer, ``hidden`` the new GRU state and
        ``attn_weights`` the ``(batch, max_length)`` attention matrix.
    """
    step_input = self.embedding(input).view(1, self.batch_Size, -1)
    step_input = F.relu(self.dropout(step_input))
    output, hidden = self.gru(step_input, hidden)

    # Compare the step output with every encoder position.
    similarity = CosineSimilarity(dim=2, eps=1e-7)
    tiled_output = output.repeat(self.max_length, 1, 1)
    scores = similarity(tiled_output, encoder_outputs)  # (max_length, batch)

    # Normalise over positions, then put the batch dimension first.
    attn_weights = F.softmax(scores, dim=0).t()  # (batch, max_length)

    # (batch, 1, max_length) x (batch, max_length, hidden) -> (batch, 1, hidden)
    context = torch.bmm(attn_weights.unsqueeze(1),
                        encoder_outputs.permute(1, 0, 2))

    features = torch.cat((output[0], context.squeeze(1)), 1)  # (batch, 2 * hidden)
    log_probs = F.log_softmax(self.out(features), dim=1)
    return log_probs, hidden, attn_weights
def discovery(embedding, vocab, chord_a, chord_b, chord_c, num_output=10):
    """Rank vocabulary entries against the analogy vector ``b - a + c``.

    The rows of ``embedding.weight`` for the three chords are combined as
    ``vec_b - vec_a + vec_c`` and every entry of
    ``vocab.get_index_to_token_vocabulary()`` is scored by its cosine
    similarity to that vector.

    Returns:
        The ``num_output`` best ``(token, similarity)`` pairs, best first.
    """
    vec_a, vec_b, vec_c = (
        embedding.weight[vocab.get_token_index(chord)]
        for chord in (chord_a, chord_b, chord_c)
    )
    target = vec_b - vec_a + vec_c

    cosine = CosineSimilarity(dim=0)
    sims = Counter({
        name: cosine(target, embedding.weight[row]).item()
        for row, name in vocab.get_index_to_token_vocabulary().items()
    })
    return sims.most_common(num_output)
def evaluate(model, eval_feature_path, enrollment_path, eval_path, annotation_path):
    """Score every evaluation file against its group's enrolment embeddings.

    For each file listed by ``readTestPaths(eval_path)`` an embedding is
    computed and compared (cosine, dim 1) with the enrolment embeddings of the
    file's group; the largest similarity is kept per file. The resulting
    mapping is passed to ``acc`` together with ``annotation_path``.

    Returns:
        ``(accuracy, threshold)`` exactly as returned by ``acc``.
    """
    model.eval()
    _, grp_embeddings = enrollment(model, eval_feature_path, enrollment_path)
    # Mapping: group id -> list of file names.
    indices, _ = readTestPaths(eval_path)

    similarity_fn = CosineSimilarity(dim=1)
    best_scores = {}
    for group_id, file_names in indices.items():
        for file_name in file_names:
            # Embedding of the file under test.
            probe = calculateOneEmbedding(model, eval_feature_path, file_name)
            # One similarity per enrolment embedding of this group.
            scores = similarity_fn(probe, grp_embeddings[group_id])
            best_scores[file_name] = max(scores).item()

    accuracy, threshold = acc(best_scores, annotation_path)
    print('ACCURACY: ', accuracy)
    return accuracy, threshold
def verify(model, data_loader, sub_name, test=False):
    """Score image pairs by cosine similarity of their model outputs.

    The model is put in eval mode for the pass and switched back to train
    mode afterwards. Element ``[0]`` of the model's return value is used as
    the embedding. Inputs are moved to the module-level ``device``.

    Args:
        model: network called on each image batch.
        data_loader: yields ``(x, y)`` with ``x[0]``/``x[1]`` the two image
            batches when ``test`` is false, and ``x`` alone when it is true.
        sub_name: forwarded to ``write_submission`` in test mode.
        test: choose between validation (AUC) and submission writing.

    Returns:
        Validation mode: the ROC AUC, or ``-1`` if computing it raised (the
        error is printed). Test mode: the return value of
        ``write_submission(sub_name, similarities)``.
    """
    model.eval()
    cos_sim = CosineSimilarity(dim=1)
    # Each list starts with an empty float tensor so that the single
    # concatenation at the end gives the same dtype promotion, and the same
    # result for an empty loader, as concatenating batch by batch.
    sim_chunks = [torch.Tensor([])]

    if not test:
        label_chunks = [torch.Tensor([])]
        # Inference only: no autograd graph needs to be recorded.
        with torch.no_grad():
            for x, y in data_loader:
                out1 = model(x[0].to(device))[0].to("cpu")
                out2 = model(x[1].to(device))[0].to("cpu")
                sim_chunks.append(cos_sim(out1, out2))
                label_chunks.append(y.to("cpu"))
                del x, y, out1, out2
                torch.cuda.empty_cache()
        model.train()

        # One concatenation instead of re-copying the growing tensor per batch.
        cosine_similarity = torch.cat(sim_chunks, 0)
        true_similarity = torch.cat(label_chunks, 0)
        try:
            AUC = roc_auc_score(
                true_similarity.type(torch.DoubleTensor),
                cosine_similarity.type(torch.DoubleTensor).detach().numpy())
            return AUC
        except Exception as e:
            # Deliberate best effort: report the problem and return a sentinel.
            print(e)
            return -1

    with torch.no_grad():
        for i, x in enumerate(data_loader):
            out1 = model(x[0].to(device))[0].to("cpu")
            out2 = model(x[1].to(device))[0].to("cpu")
            sim_chunks.append(cos_sim(out1, out2))
            if i % 1000 == 0:
                print("Verification", i, end='\r')
    model.train()
    return write_submission(sub_name, torch.cat(sim_chunks, 0))
class TestEncoder:
    """Ranking checks for ``Encoder`` across its three input modes."""

    cosine = CosineSimilarity(dim=-1)

    def _check_expected_neighbours(self, embeddings) -> None:
        """Assert the hard-coded nearest-neighbour pairs (2 -> 3 and 6 -> 7).

        For each anchor the second entry of the top-2 cosine ranking against
        all embeddings must be the expected neighbour index.
        """
        for anchor, neighbour in ((2, 3), (6, 7)):
            ranking = torch.topk(self.cosine(embeddings[anchor], embeddings), k=2)
            assert ranking[-1][-1].item() == neighbour

    # The base model will take longer than the small model, which triggers a
    # test timing error. Turn off deadlines to avoid this.
    @settings(deadline=None)
    @given(sphereize=booleans())
    def test_encoder(self, inputs: List[str], inputs_filepath: Path, encoder: Encoder, sphereize: bool) -> None:
        # The relative ranking should not change if sphereize is True/False,
        # so the test runs with both settings.
        encoder._sphereize = sphereize

        # Three distinct checks, which should cover all use cases of Encoder.

        # 1. A List[str] input where batch_size is not None.
        batched = torch.from_numpy(encoder(inputs, batch_size=len(inputs)))
        self._check_expected_neighbours(batched)

        # 2. A str input where batch_size is None. With sphereize enabled the
        #    expected UserWarning must be raised for every call.
        per_text = []
        for text in inputs:
            if sphereize:
                with pytest.warns(UserWarning):
                    per_text.append(encoder(text, batch_size=None))
            else:
                per_text.append(encoder(text, batch_size=None))
        stacked = torch.as_tensor(per_text).squeeze(1)
        self._check_expected_neighbours(stacked)

        # 3. A filepath input that points to a file with one example per line.
        from_file = torch.from_numpy(encoder(inputs_filepath, batch_size=len(inputs)))
        self._check_expected_neighbours(from_file)
def forward(self, x1, x2, labels):
    """Mean contrastive penalty computed on cosine similarity.

    Per pair the penalty is
    ``(1 - label) * sim + label * (self.margin - sim)`` where ``sim`` is the
    cosine similarity of ``x1`` and ``x2`` along dim 1.

    Args:
        x1: first feature matrix with shape (batch_size, feat_dim).
        x2: second feature matrix, same shape as ``x1``.
        labels: per-pair labels with shape (batch_size).

    Returns:
        The penalty averaged over the batch.
    """
    similarity_fn = CosineSimilarity(dim=1)
    sim = similarity_fn(x1, x2)

    penalty_when_zero = (1 - labels) * sim
    penalty_when_one = labels * (self.margin - sim)
    return (penalty_when_zero + penalty_when_one).mean()
def get_top_k(query_embedding, queried_embeddings, k, distance):
    """Returns the scores and indices of the k nearest embeddings in the
    `queried_embeddings` tensor to the `query_embedding` tensor.

    Args:
        query_embedding: tensor with the embedding of the query image.
        queried_embeddings: tensor with the stacked embeddings of the queried
            dataset.
        k: the number of most similar images to be returned.
        distance: which function to be used for nearest neighbor computation.
            'pairwise' uses the Euclidean `PairwiseDistance`; any other value
            uses `CosineSimilarity`.

    Returns:
        the result of `torch.topk`: a 2-tuple of shape `[k]` tensors holding
        the scores and the indices of the k closest embeddings, closest
        first. For 'pairwise' the scores are distances in ascending order; for
        cosine they are similarities in descending order.
    """
    if distance == "pairwise":
        distances = PairwiseDistance(p=2)(queried_embeddings, query_embedding)
        # A small distance means "near", so the k smallest values are wanted;
        # topk's default (largest) would return the k farthest embeddings.
        return torch.topk(distances, k, largest=False)

    similarities = CosineSimilarity()(queried_embeddings, query_embedding)
    # A large similarity means "near": the default largest=True is correct.
    return torch.topk(similarities, k)
def verify(model, data_loader, sub_name, test=False):
    """Score image pairs by cosine similarity of their model outputs.

    The model is put in eval mode and is NOT switched back afterwards. Inputs
    are moved to the module-level ``device``; progress messages use the
    module-level ``batchsize``.

    Args:
        model: network called on each image batch; its return value is used
            directly as the embedding.
        data_loader: yields ``(x, y)`` with ``x[0]``/``x[1]`` the two image
            batches when ``test`` is false, and ``x`` alone when it is true.
        sub_name: forwarded to ``write_submission`` in test mode.
        test: choose between validation (AUC) and submission writing.

    Returns:
        Validation mode: the ROC AUC of the similarities against the labels.
        Test mode: the return value of
        ``write_submission(sub_name, similarities)``.
    """
    model.eval()
    cos_sim = CosineSimilarity(dim=1)
    # Each list starts with an empty float tensor so that the single
    # concatenation at the end gives the same dtype promotion, and the same
    # result for an empty loader, as concatenating batch by batch.
    sim_chunks = [torch.Tensor([])]

    if not test:
        label_chunks = [torch.Tensor([])]
        # Inference only: no autograd graph needs to be recorded.
        with torch.no_grad():
            for i, (x, y) in enumerate(data_loader):
                out1 = model(x[0].to(device)).to("cpu")
                out2 = model(x[1].to(device)).to("cpu")
                sim_chunks.append(cos_sim(out1, out2))
                label_chunks.append(y.to("cpu"))
                del x, y, out1, out2
                torch.cuda.empty_cache()
                if i % 10 == 0:
                    print("Verification on validation set:", i * batchsize, end='\r')

        # One concatenation instead of re-copying the growing tensor per batch.
        cosine_similarity = torch.cat(sim_chunks, 0)
        true_similarity = torch.cat(label_chunks, 0)
        AUC = roc_auc_score(true_similarity, cosine_similarity.detach().numpy())
        return AUC

    with torch.no_grad():
        for i, x in enumerate(data_loader):
            out1 = model(x[0].to(device)).to("cpu")
            out2 = model(x[1].to(device)).to("cpu")
            sim_chunks.append(cos_sim(out1, out2))
            if i % 10 == 0:
                print("Verification on test set:", i * batchsize, end='\r')
    return write_submission(sub_name, torch.cat(sim_chunks, 0))
def get_related(token: str, embedding: Model, vocab: Vocabulary, num_related: int = 20):
    """Given a token, return the ``num_related`` most similar vocabulary tokens.

    The row of ``embedding.weight`` for ``token`` is compared, by cosine
    similarity, with the row of every index in the ``'token_in'`` namespace
    (the token itself is part of the ranking).

    Returns:
        A list of ``(token, similarity)`` pairs, most similar first.
    """
    anchor_id = vocab.get_token_index(token, 'token_in')
    # Row of the embedding lookup table that belongs to the query token.
    anchor_vec = embedding.weight[anchor_id]

    # dim=0 because two plain 1-D vectors are compared at a time.
    cosine = CosineSimilarity(dim=0)

    def _score(row):
        return cosine(anchor_vec, embedding.weight[row]).item()

    vocabulary = vocab.get_index_to_token_vocabulary('token_in')
    sims = Counter(dict((word, _score(row)) for row, word in vocabulary.items()))
    return sims.most_common(num_related)
def get_eigenvector_decomposition_magnitude_indv(eigenvectors, eigenvalues, X, correction_mean):
    '''
    Whitened absolute cosine similarity of each sample to each eigenvector.

    ``X`` is first shifted by ``correction_mean``. For eigenvector ``i`` the
    absolute cosine similarity (dim 1) of every shifted sample to that vector
    is divided by ``eigenvalues[i] ** 0.5``.

    Returns:
        ``(ranks, whitened)`` where ``ranks`` is ``[0, 1, ..., n_vectors - 1]``
        and ``whitened`` stacks the per-eigenvector results along dim 1
        (one row per sample, one column per eigenvector).
    '''
    ranks = list(range(eigenvectors.size(0)))

    with torch.no_grad():
        num_samples = X.size(0)
        # Correct by the pre-calculated mean.
        centred = X - correction_mean.repeat(num_samples, 1)

        cos = CosineSimilarity(dim=1)
        columns = [
            torch.abs(cos(centred, eigenvectors[k].repeat(num_samples, 1)))
            / (eigenvalues[k] ** 0.5)
            for k in ranks
        ]
        whitened_cos_dists = torch.stack(columns, dim=1)

    return ranks, whitened_cos_dists
def predict(model, test_feature_path, enrollment_path, test_path, threshold):
    """Write membership decisions for every test file to ``results.csv``.

    Each file listed by ``readTestPaths(test_path)`` is embedded and compared
    (cosine, dim 1) with the enrolment embeddings of its group; the largest
    similarity is kept. A file is marked ``'Y'`` when that value is strictly
    greater than ``threshold`` and ``'N'`` otherwise. The CSV has the columns
    ``GroupID`` (running position // 100), ``FileID`` and ``IsMember``.

    The model is moved to the CPU first. Nothing is returned.
    """
    model.cpu()
    _, grp_embeddings = enrollment(model, test_feature_path, enrollment_path)
    # Mapping: group id -> list of file names.
    indices, _ = readTestPaths(test_path)

    similarity_fn = CosineSimilarity(dim=1)
    best_scores = {}
    for group_id, file_names in indices.items():
        for file_name in file_names:
            probe = calculateOneEmbedding(model, test_feature_path, file_name)
            scores = similarity_fn(probe, grp_embeddings[group_id])
            best_scores[file_name] = max(scores).item()

    # Column order matches the order in which the keys are inserted.
    results = pd.DataFrame({
        'GroupID': [position // 100 for position in range(len(best_scores))],
        'FileID': list(best_scores),
        'IsMember': ['Y' if score > threshold else 'N'
                     for score in best_scores.values()],
    })
    results.to_csv('results.csv', index=False)
def evaluate_embeddings(embedding, vocab: Vocabulary):
    """Compare embedding similarities with the SimLex-999 gold ratings.

    For every word pair returned by ``read_simlex999()`` the predicted
    similarity is the cosine between the two rows of ``embedding.weight``.
    A pair gets ``0.`` when either word maps to index 1 in the ``'token_in'``
    namespace, which this code treats as out-of-vocabulary (OOV).

    The OOV count and the Pearson correlation are printed.

    Returns:
        The result of ``spearmanr`` (rank-order correlation and the p-value
        of the test for non-correlation) between predicted and gold scores.
    """
    cosine = CosineSimilarity(dim=0)
    simlex999 = read_simlex999()

    def _predicted_similarity(first, second):
        """Cosine of the two word vectors, or None if either word is OOV."""
        first_id = vocab.get_token_index(first, 'token_in')
        if first_id == 1:
            return None
        second_id = vocab.get_token_index(second, 'token_in')
        if second_id == 1:
            return None
        return cosine(embedding.weight[first_id],
                      embedding.weight[second_id]).item()

    sims_pred = []
    oov_count = 0
    for word1, word2, _ in simlex999:
        score = _predicted_similarity(word1, word2)
        if score is None:
            oov_count += 1
            score = 0.
        sims_pred.append(score)

    # Every gold pair must have exactly one prediction.
    assert len(sims_pred) == len(simlex999)

    print('# of OOV words: {} / {}'.format(oov_count, len(simlex999)))
    print(pearsonr(sims_pred, [sim for _, _, sim in simlex999]))
    return spearmanr(sims_pred, [sim for _, _, sim in simlex999])
# Reference: scipy.stats.spearmanr(a, b=None, axis=0)
import numpy as np from itertools import islice from collections import deque import matplotlib import umap import matplotlib.pyplot as plt import seaborn as sns from mpl_toolkits.mplot3d import proj3d import matplotlib.cm as cm from torch.nn import CosineSimilarity from sty import fg, bg, ef, rs, RgbFg from sklearn.preprocessing import MinMaxScaler import syntok.segmenter as segmenter document = Document() ## Create a python-docx document cos = CosineSimilarity(dim=1, eps=1e-6) sent_level = False dynamic = True graph = False doc_embeddings = [] scores = [] stacked_embeddings = DocumentPoolEmbeddings([ WordEmbeddings('en'), #WordEmbeddings('glove'), #WordEmbeddings('extvec'),#ELMoEmbeddings('original'), #BertEmbeddings('bert-base-cased'), #FlairEmbeddings('news-forward-fast'), #FlairEmbeddings('news-backward-fast'), #OpenAIGPTEmbeddings()
def _clear_directory(path):
    """Delete every non-hidden entry directly inside ``path``."""
    import shutil  # local import: this helper is its only user

    for name in os.listdir(path):
        if name.startswith('.'):
            # Dot-files are left alone, as the former shell glob ``*`` did.
            continue
        entry = os.path.join(path, name)
        if os.path.isdir(entry) and not os.path.islink(entry):
            shutil.rmtree(entry)
        else:
            os.remove(entry)


def _dump_json(payload, path):
    """Serialise ``payload`` to ``path`` and close the file deterministically."""
    with open(path, 'w') as handle:
        json.dump(payload, handle)


def predict(model,
            eval_dataloader,
            output_dir,
            eval_fearures,
            args,
            cur_train_mean_loss=None,
            logger=None,
            compute_metrics=True,
            eval_script_path='../MeasEval/eval/measeval-eval.py',
            only_parts=''):
    """Run the model over ``eval_dataloader``, save predictions, return metrics.

    Steps:
      1. Collect per-example scores (model logits, or the cosine similarity
         of the two returned tensors when ``model.local_config['loss']`` is
         ``'cosine_similarity'``) and derive 0/1 predictions from them.
      2. Group predictions, gold labels, scores and lemmas by document id and
         position (the last dot-separated field of ``example.docId``).
      3. Empty ``output_dir`` (or create it) and write, per document, a tag
         file named after the document and a ``<docId>.scores`` file.
      4. If ``compute_metrics``: average the accumulated losses and add
         accuracy (and, for ids containing ``'scd'``, Spearman) metrics.

    The model is put in eval mode for the pass and in train mode before
    returning.

    Args:
        model: network exposing ``local_config`` (keys read here: ``loss``,
            ``use_cuda``, ``train_scd``) and returning ``(loss, syn_logits)``.
        eval_dataloader: yields 5-tuples of tensors (input ids, input mask,
            token type ids, labels, positions).
        output_dir: directory that receives the prediction files.
        eval_fearures: features aligned with the examples; each has an
            ``example`` with ``docId``, ``label``, ``score``, ``lemma``, ``grp``.
        args: unused; kept for interface compatibility.
        cur_train_mean_loss: optional mapping merged into the metrics.
        logger: unused; kept for interface compatibility.
        compute_metrics: when false an empty dict is returned.
        eval_script_path: unused; kept for interface compatibility.
        only_parts: ``'+'``-separated filter; when non-empty, a document is
            saved only if ``'<second docId field>.score'`` occurs in one part.

    Returns:
        A mapping of metric names to values (empty if not ``compute_metrics``).

    NOTE(review): the nesting of the final ``if``/``else`` statements is
    ambiguous in the flattened source. Here the non-English mean is computed
    once after the per-document loop, and ``metrics = {}`` is the ``else`` of
    ``if compute_metrics`` -- confirm against the original layout.
    """
    only_parts = [part for part in only_parts.split('+') if part]
    model.eval()
    device = torch.device(
        'cuda') if model.local_config['use_cuda'] else torch.device('cpu')
    uses_cosine = model.local_config['loss'] == 'cosine_similarity'

    metrics = defaultdict(float)
    nb_eval_steps = 0
    syns_preds = []
    for batch in tqdm(eval_dataloader,
                      total=len(eval_dataloader),
                      desc='validation ... '):
        batch = tuple([elem.to(device) for elem in batch])
        input_ids, input_mask, token_type_ids, b_syn_labels, b_positions = batch
        with torch.no_grad():
            loss, syn_logits = model(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=input_mask,
                                     input_labels={
                                         'syn_labels': b_syn_labels,
                                         'positions': b_positions
                                     })

        if compute_metrics:
            for key, value in loss.items():
                metrics[f'eval_{key}_loss'] += value.mean().item()
        nb_eval_steps += 1

        if not uses_cosine:
            syns_preds.append(syn_logits.detach().cpu().numpy())
        else:
            syns_preds.append(CosineSimilarity()(
                syn_logits[0], syn_logits[1]).detach().cpu().numpy())

    syns_scores = np.concatenate(syns_preds,
                                 axis=0)  # n_examples x 2 or n_examples
    if syns_scores.shape[-1] != 1 and not uses_cosine:
        syns_preds = np.argmax(syns_scores, axis=1)  # n_examples
    elif uses_cosine:
        syns_preds = np.zeros(syns_scores.shape, dtype=int)
        syns_preds[syns_scores >= 0.5] = 1
    else:
        syns_preds = np.zeros(syns_scores.shape, dtype=int)
        if model.local_config['train_scd']:
            syns_preds[syns_scores >= 3.0] = 1
        else:
            syns_preds[syns_scores > 0.5] = 1

    # document id -> position in document -> list of values
    predictions = defaultdict(lambda: defaultdict(list))
    golds = defaultdict(lambda: defaultdict(list))
    scores = defaultdict(lambda: defaultdict(list))
    gold_scores = defaultdict(lambda: defaultdict(list))
    lemmas = defaultdict(lambda: defaultdict(list))
    syn_ids_to_label = {0: 'F', 1: 'T'}
    for ex_feature, ex_syn_preds, ex_scores in zip(eval_fearures, syns_preds,
                                                   syns_scores):
        example = ex_feature.example
        docId = example.docId
        posInDoc = int(docId.split('.')[-1])
        docId = '.'.join(docId.split('.')[:-1])
        syn_pred = syn_ids_to_label[ex_syn_preds.item()]
        predictions[docId][posInDoc].append(syn_pred)
        golds[docId][posInDoc].append(example.label)
        # scores for positive class
        if uses_cosine:
            scores[docId][posInDoc].append(ex_scores)
        elif len(ex_scores) > 1:
            scores[docId][posInDoc].append(softmax(ex_scores)[-1])
        else:
            scores[docId][posInDoc].append(ex_scores[0])
        gold_scores[docId][posInDoc].append(example.score)
        lemmas[docId][posInDoc].append((example.lemma, example.grp))

    if os.path.exists(output_dir):
        # Pure-Python replacement for ``rm -r <dir>/*``: no shell is spawned,
        # so unusual characters in ``output_dir`` cannot be misinterpreted.
        _clear_directory(output_dir)
    else:
        os.makedirs(output_dir, exist_ok=True)

    print(f'saving predictions for: {only_parts}')
    for docId, doc_preds in predictions.items():
        doc_scores = scores[docId]
        if only_parts and all(
                f'{docId.split(".")[1]}.score' not in part
                for part in only_parts):
            continue
        print(f'saving predictions for part: {docId}')
        # A position is tagged 'F' as soon as one of its predictions is 'F'.
        prediction = [{
            'id': f'{docId}.{pos}',
            'tag': 'F' if 'F' in doc_preds[pos] else 'T'
        } for pos in sorted(doc_preds)]
        _dump_json(prediction, os.path.join(output_dir, docId))
        prediction = [{
            'id': f'{docId}.{pos}',
            'score': [str(x) for x in doc_scores[pos]]
        } for pos in sorted(doc_preds)]
        _dump_json(prediction, os.path.join(output_dir, f'{docId}.scores'))

    if compute_metrics:
        for key in metrics:
            metrics[key] /= nb_eval_steps
        mean_non_english = []
        for docId, doc_preds in predictions.items():
            if 'scd' in docId:
                doc_golds = gold_scores[docId]
                doc_lemmas = lemmas[docId]
                doc_scores = scores[docId]
                keys = sorted(list(doc_golds.keys()))
                unique_lemmas = sorted(
                    set([
                        doc_lemmas[key][0][0] for key in keys
                        if doc_lemmas[key][0][1] == 'COMPARE'
                    ]))
                y_true, y_pred = [], []
                y_sent_true, y_sent_pred = [], []
                for unique_lemma in unique_lemmas:
                    unique_lemma_keys = [
                        key for key in keys
                        if doc_lemmas[key][0][0] == unique_lemma
                        and doc_lemmas[key][0][1] == 'COMPARE'
                    ]
                    unique_word_scores_pred = [
                        np.array(doc_scores[key]).mean()
                        for key in unique_lemma_keys
                    ]
                    unique_word_scores_true = [
                        doc_golds[key][0] for key in unique_lemma_keys
                    ]
                    y_true.append(np.array(unique_word_scores_true).mean())
                    y_pred.append(np.array(unique_word_scores_pred).mean())
                    y_sent_true.extend(unique_word_scores_true)
                    y_sent_pred.extend(unique_word_scores_pred)
                # Per-lemma averages, then every individual position.
                metrics[f'spearman.{docId}.wordwise.score'], _ = spearmanr(
                    y_true, y_pred)
                metrics[f'spearman.{docId}.score'], _ = spearmanr(
                    y_sent_true, y_sent_pred)

                doc_golds = golds[docId]
                keys = list(doc_golds.keys())
                doc_golds = [doc_golds[key][0] for key in keys]
                doc_tags = [
                    'F' if 'F' in doc_preds[key] else 'T' for key in keys
                ]
                metrics[f'{docId}.accuracy'] = accuracy_score(
                    doc_golds, doc_tags)
            else:
                doc_golds = golds[docId]
                keys = list(doc_golds.keys())
                doc_golds = [doc_golds[key][0] for key in keys]
                doc_tags = [
                    'F' if 'F' in doc_preds[key] else 'T' for key in keys
                ]
                metrics[f'accuracy.{docId}.score'] = accuracy_score(
                    doc_golds, doc_tags)
                if 'en-en' not in docId:
                    mean_non_english.append(
                        metrics[f'accuracy.{docId}.score'])
        if mean_non_english:
            # The key is built from the last document id seen in the loop.
            metrics[f'accuracy.{docId.split(".")[0]}.nen-nen.score'] = sum(
                mean_non_english) / len(mean_non_english)

        if cur_train_mean_loss is not None:
            metrics.update(cur_train_mean_loss)
    else:
        metrics = {}

    model.train()

    return metrics