def do_prepare(self, params, prepare):
    """Choose the similarity function, then delegate to the ``prepare`` hook.

    :param params: run configuration supporting ``in`` and attribute access;
        an optional ``similarity`` entry overrides the default scorer
    :param prepare: callable invoked as ``prepare(params, self.samples)``
    :return: whatever ``prepare`` returns
    """
    # The option is looked up by its key name and read from ``params``.
    if 'similarity' in params:
        self.similarity = params.similarity
    else:
        # Default similarity is cosine
        self.similarity = lambda s1, s2: cosine(np.nan_to_num(s1),
                                                np.nan_to_num(s2))
    return prepare(params, self.samples)
def run(self, batcher, params):
    """Score every dataset and correlate system scores with the gold scores.

    For each dataset in ``self.datasets`` both inputs are encoded batch by
    batch with ``batcher``; each encoded pair is scored with ``cosine`` after
    ``np.nan_to_num``; the scores are then correlated with the gold scores.

    :param batcher: callable used as ``batcher(batch, params)``; its result is
        indexed by row and must expose ``shape``
    :param params: configuration object providing ``batch_size``
    :return: dict mapping each dataset to its ``pearson``/``spearman`` results,
        plus an ``'all'`` entry with the unweighted means of the coefficients
    """
    results = {}
    for dataset in self.datasets:
        first, second, gold = self.data[dataset]
        predicted = []
        for start in range(0, len(gold), params.batch_size):
            stop = start + params.batch_size
            left, right = first[start:stop], second[start:stop]

            # we assume that the get_batch function already throws out the
            # faulty ones; empty or misaligned batches are simply skipped
            if len(left) != len(right) or len(left) == 0:
                continue

            enc_left = batcher(left, params)
            enc_right = batcher(right, params)
            predicted.extend(
                cosine(np.nan_to_num(enc_left[row]),
                       np.nan_to_num(enc_right[row]))
                for row in range(enc_right.shape[0])
            )

        correlations = {'pearson': pearsonr(predicted, gold),
                        'spearman': spearmanr(predicted, gold)}
        results[dataset] = correlations
        logging.debug('{0} : pearson = {1}, spearman = {2}'.format(
            dataset, correlations['pearson'], correlations['spearman']))

    # Snapshot the per-dataset entries before the 'all' summary is inserted.
    per_dataset = list(results.values())
    avg_pearson = np.mean([entry['pearson'][0] for entry in per_dataset])
    avg_spearman = np.mean([entry['spearman'][0] for entry in per_dataset])
    summary = {'pearson': avg_pearson, 'spearman': avg_spearman}
    results['all'] = summary
    logging.debug('Results (all) : Pearson = {0}, Spearman = {1}\n'.format(
        summary['pearson'], summary['spearman']))

    return results
def scoring_fn(i, j):
    """
    Score a pair of triples by comparing their entries in ``marg``.
    :param i: triple index
    :param j: triple index
    :return: result of ``cosine`` applied to ``marg[i]`` and ``marg[j]``
    """
    first = marg[i]
    second = marg[j]
    return cosine(first, second)
def getdistinctivefeatures(lang1, lang2, phonemeMap):
    """
    Average the pairwise phoneme similarity between two languages.

    Every phoneme of ``lang1`` is paired with every phoneme of ``lang2``. A
    pair scores ``1 - utils.cosine(...)`` of the two ``phonemeMap`` entries
    when both phonemes are keys of ``phonemeMap`` and 0 otherwise. Computed
    pair scores are memoised in the externally defined ``phonedist`` cache.
    Contrast this with getF1.

    NOTE: the original author flagged this function as not working correctly.

    :param lang1: a set of Phonemes
    :param lang2: a set of Phonemes
    :param phonemeMap: mapping from a phoneme's ``Phoneme`` value to the value
        handed to ``utils.cosine``
    :return: the Distinctive Features score for these languages (mean over all
        pairs), or -1 if either language is empty.
    """
    if len(lang1) == 0:
        print("ERROR: first lang is empty or doesn't exist")
        return -1
    if len(lang2) == 0:
        print("ERROR: second lang is empty or doesn't exist")
        return -1

    # loop over all pairs.
    total = 0
    for p in lang1:
        pu1 = p.Phoneme
        for p2 in lang2:
            pu2 = p2.Phoneme
            if pu1 in phonemeMap and pu2 in phonemeMap:
                # Order-independent cache key for the pair.
                ps = tuple(sorted([pu1, pu2]))
                if ps in phonedist:
                    sim = phonedist[ps]
                else:
                    sim = 1 - utils.cosine(phonemeMap[pu1], phonemeMap[pu2])
                    phonedist[ps] = sim
            else:
                # At least one phoneme has no entry: the pair contributes 0.
                sim = 0
            total += sim

    return total / float(len(lang1) * len(lang2))
def score_fn(term, description, verb_weight=1, head_weight=1):
    """Score ``term`` against a weighted sum of the description's vectors.

    ``description`` is a ``(which, triple)`` pair. ``which`` selects the weight
    layout: ``'SBJ'`` gives ``[verb_weight, head_weight, 1]`` and ``'OBJ'``
    gives ``[verb_weight, 1, head_weight]``. Any other value raises
    ``ValueError``. Returns ``cosine`` of ``vec[term]`` and the combined vector.
    """
    role, tokens = description
    if role not in ('SBJ', 'OBJ'):
        raise ValueError(role)

    if role == 'SBJ':
        weights = [verb_weight, head_weight, 1]
    else:
        weights = [verb_weight, 1, head_weight]

    combined = sum(vec[tok] * weight for tok, weight in zip(tokens, weights))
    return cosine(vec[term], combined)
def compare(lang1, lang2):
    """
    Compare two languages by name using their phonology features.

    The names are resolved with ``comparefeats``; when both resolve, the
    result of ``utils.cosine`` on their ``phon_feats()`` is returned.
    :param lang1: name of first lang (eg English)
    :param lang2: name of second lang
    :return: the distance of the languages, or -1 if one or both langs not found.
    """
    first, second = comparefeats(lang1, lang2)
    if not (first and second):
        print("One or both langs not found: {0}, {1}".format(first, second))
        return -1
    return utils.cosine(first.phon_feats(), second.phon_feats())
def scoring_fn(i, j):
    """Compare triple ``j`` with a copy whose verb comes from triple ``i``.

    The verb of triple ``i`` and the verb, subject and object of triple ``j``
    are mapped to vectors (optionally via ``lookup``/``pred_name`` first).
    The two verb+subject+object sums are then passed to ``cosine``.
    """
    # Get tokens
    verb_a = raw_triples[i][0]
    verb_b, subj, obj = raw_triples[j]
    tokens = [verb_a, verb_b, subj, obj]

    if with_lookup:
        # Verbs go through the 'v' table, the arguments through the 'n' table.
        kinds = ['v', 'v', 'n', 'n']
        tokens = [pred_name[lookup[kind][tok]]
                  for kind, tok in zip(kinds, tokens)]

    # Lookup vectors
    vec_a, vec_b, vec_s, vec_o = [vec[model.vocab[tok].index]
                                  for tok in tokens]
    return cosine(vec_a + vec_s + vec_o, vec_b + vec_s + vec_o)
def calculate_distance_offset(pairs, df, offset, offset2=None):  # CONVERTED TO calculate_distance_mode
    """
    From a set of pairs, calculate the distance of the offset of the pair in
    relation to the global offset.

    Parameters:
    -----------
    pairs: list
        list containing tuples of IDs of norms [(id1,id2),(id3,id4)...]
    df: object exposing ``id2embed(id)``
        source of the embeddings for the ids (described as a dataframe of ids
        and sentence embeddings by the original author)
    offset: np.array
        vector containing the global offset (offset of all conflicts)
    offset2: np.array, optional
        second reference offset; when given and non-empty, distances to it are
        appended to every entry

    Returns:
    --------
    list
        one entry per pair: the tuple ``(cos, euc)``, or the list
        ``[cos, euc, cos2, euc2]`` when ``offset2`` is used
    """
    # ``offset2`` defaults to None, so it must be checked before len() is used.
    use_second = offset2 is not None and len(offset2) > 0

    vdist = []
    pb = progressbar.ProgressBar(len(pairs))
    for arr in pairs:
        emb1 = df.id2embed(arr[0])
        emb2 = df.id2embed(arr[1])
        local_offset = emb1 - emb2

        # cosine (similar:0->2:not_similar)
        cos = utils.cosine(local_offset, offset)
        # euclidean distance (similar:0->inf:not_similar)
        euc = utils.euclidean(local_offset, offset)

        if use_second:
            cos2 = utils.cosine(local_offset, offset2)
            euc2 = utils.euclidean(local_offset, offset2)
            vdist.append([cos, euc, cos2, euc2])
        else:
            vdist.append((cos, euc))
        pb.update()
    return vdist
def compare(lang1, lang2):
    """
    Compare two languages by name using their phonology features.

    The names are resolved with ``comparefeats``; when both resolve, the
    result of ``utils.cosine`` on their ``phon_feats()`` is returned.
    :param lang1: name of first lang (eg English)
    :param lang2: name of second lang
    :return: the distance of the languages, or -1 if one or both langs not found.
    """
    first, second = comparefeats(lang1, lang2)
    if not (first and second):
        print("One or both langs not found: {0}, {1}".format(first, second))
        return -1
    return utils.cosine(first.phon_feats(), second.phon_feats())
def run(self, batcher, params):
    """Score every dataset and report plain and sample-weighted correlations.

    For each dataset in ``self.datasets`` both inputs are encoded batch by
    batch with ``batcher``; each encoded pair is scored with ``cosine`` after
    ``np.nan_to_num``; the scores are then correlated with the gold scores.

    :param batcher: callable used as ``batcher(batch, params)``; its result is
        indexed by row and must expose ``shape``
    :param params: configuration object providing ``batch_size``
    :return: dict mapping each dataset to ``pearson``, ``spearman`` and
        ``nsamples``, plus an ``'all'`` entry holding the ``mean`` and the
        ``nsamples``-weighted ``wmean`` of both coefficients
    """
    results = {}
    for dataset in self.datasets:
        first, second, gold = self.data[dataset]
        predicted = []
        for start in range(0, len(gold), params.batch_size):
            stop = start + params.batch_size
            left, right = first[start:stop], second[start:stop]

            # we assume that the get_batch function already throws out the
            # faulty ones; empty or misaligned batches are simply skipped
            if len(left) != len(right) or len(left) == 0:
                continue

            enc_left = batcher(left, params)
            enc_right = batcher(right, params)
            predicted.extend(
                cosine(np.nan_to_num(enc_left[row]),
                       np.nan_to_num(enc_right[row]))
                for row in range(enc_right.shape[0])
            )

        pearson = pearsonr(predicted, gold)
        spearman = spearmanr(predicted, gold)
        results[dataset] = {'pearson': pearson,
                            'spearman': spearman,
                            'nsamples': len(predicted)}
        logging.debug('%s : pearson = %.4f, spearman = %.4f' %
                      (dataset, pearson[0], spearman[0]))

    # Snapshot the per-dataset entries before the 'all' summary is inserted.
    entries = list(results.values())
    weights = [entry['nsamples'] for entry in entries]
    list_prs = np.array([entry['pearson'][0] for entry in entries])
    list_spr = np.array([entry['spearman'][0] for entry in entries])

    avg_pearson = np.average(list_prs)
    avg_spearman = np.average(list_spr)
    wavg_pearson = np.average(list_prs, weights=weights)
    wavg_spearman = np.average(list_spr, weights=weights)

    results['all'] = {
        'pearson': {'mean': avg_pearson, 'wmean': wavg_pearson},
        'spearman': {'mean': avg_spearman, 'wmean': wavg_spearman},
    }
    logging.debug('ALL (weighted average) : Pearson = %.4f, Spearman = %.4f' %
                  (wavg_pearson, wavg_spearman))
    logging.debug('ALL (average) : Pearson = %.4f, Spearman = %.4f\n' %
                  (avg_pearson, avg_spearman))

    return results
def neighbours(self, word, size = 10):
    """
    Yield ``(word, score)`` pairs for the nearest words found by the KDTree.

    Candidates come from ``self.kdt.query``. Each is scored with
    ``utils.cosine`` against the query vector; the query word itself scores
    1.0. Negative scores are replaced by their absolute value and scores are
    capped at 1.0. Pairs are yielded in descending score order.
    """
    word = word.strip()
    query_vec = self.word_vec(word)
    (distances,), (points,) = self.kdt.query(array([query_vec]), k = size, return_distance = True)
    assert len(distances) == len(points), "distances and points should be in same shape."

    words = []
    scores = {}
    for idx, _dist in zip(points, distances):
        candidate = self.index2word[idx]
        if candidate == word:
            score = 1.0
        else:
            score = utils.cosine(query_vec, self.syn0[idx])
            if score < 0:
                score = abs(score)
        words.append(candidate)
        scores[candidate] = min(score, 1.0)

    ranked = sorted(words, key=scores.get, reverse=True)
    for candidate in ranked:
        yield candidate, scores[candidate]
def calc_word_similarity(self, test_file, embed_vec):
    """Evaluate word vectors on a word-similarity file and print a report.

    Each line of ``test_file`` is split into two words and a score. Pairs with
    both words in ``embed_vec`` are scored with ``cosine``. The report prints
    the counts of found and not-found lines and ``rho(label, pred)``.

    Arguments:
        test_file {str} -- similarity test file e.g wordsim-240 and wordsim-297
        embed_vec {Keyedvectors} -- A pre-load gensim word vectors
    """
    pred = []
    label = []
    found = 0

    with open(test_file, 'r', encoding='utf8') as fr:
        lines = list(fr)

    for line in lines:
        first, second, gold = line.split()
        if not (first in embed_vec and second in embed_vec):
            continue
        found += 1
        pred.append(cosine(embed_vec[first], embed_vec[second]))
        label.append(float(gold))

    # Report under the bare file name: no directories, no '.txt'.
    file_name = test_file.rsplit("/", 1)[-1].replace('.txt', '')
    print(f"Test File: {file_name}")
    print(f"Numbers of words Found: {found}")
    print(f"Numbers of words Not Found: {len(lines) - found}")
    print(f"Spearman's Rank Coeficient: {rho(label, pred)}")
# construction of the query vector parsed_dir = "C:\\Users\\nbonardo\\movies_parsed" n_files = len([name for name in os.listdir(parsed_dir) if os.path.isfile(os.path.join(parsed_dir, name))]) queryVector = dict() nWordsQ = len(term_ids) for term_id in term_ids: term_idf = 1 + math.log(n_files / len(idx[str(term_id)])) queryVector[term_id] = term_idf / nWordsQ #print("Query vector", queryVector) # calculation of the cosine similarity between each document vector of the result and the quere vector # storing the result (as tuple) in a heap structure h = [] for doc in final: cos = utils.cosine(queryVector, docVectors[doc]) #print("Cosine similarity of doc", doc, "=", cos, utils.getMovieTitle(doc)) heapq.heappush(h, (cos, doc)) # output of the top-K movies according to cosine similarity K = 5 K = min(K, len(final)) print("*** TOP-", K, " results ***") print(" Id | Title | Cosine Similarity") print("-----+------------------------------------------+------------------") for _ in range(K): movieTup = heapq.heappop(h) #print(movieTup[1], utils.getMovieTitle(movieTup[1]), movieTup[0]) print("%4s | %40s | %.6s" % (movieTup[1]+1, utils.getMovieTitle(movieTup[1]), movieTup[0])) elif search_engine == '3':
def relevance(self, title, content, sentences):
    """Compute the relevance of each sentence to the title and the body text.

    Each score is ``self.weight`` times the ``cosine`` of the sentence and
    ``content`` plus ``1 - self.weight`` times the ``cosine`` of the sentence
    and ``title``. Scores are returned in the order of ``sentences``.
    """
    return [
        self.weight * cosine(sentence, content)
        + (1 - self.weight) * cosine(sentence, title)
        for sentence in sentences
    ]
# F6: Proper Noun for Si in sents: Si_propnouns = np.intersect1d(Si, propernoun) F6.append(len(Si_propnouns) / len(Si)) # F7: Similarities Between Sentences vocab = sorted(set(flat)) TF = get_TF(sents, vocab) sim_SiSj = [] for i, Si in enumerate(TF): temp = [] for j, Sj in enumerate(TF): if i == j: continue temp.append(cosine(Si, Sj)) sim_SiSj.append(sum(temp)) max_simSiSj = max(sim_SiSj) for sim_Si in sim_SiSj: F7.append(sim_Si / max_simSiSj) # F8: Term Weight TFIDF = get_TFIDF(sents, vocab) sum_TFIDF = [] for tfidf in TFIDF: sum_TFIDF.append(sum(tfidf)) max_sum_TFIDF = max(sum_TFIDF) for sum_tfidf in sum_TFIDF:
# Classify the tokenised query; unsqueeze(0) adds a leading batch dimension.
input_ids = query_tokens['input_ids']
attention_mask = query_tokens['attention_mask']
ids = torch.tensor(input_ids).unsqueeze(0)
mask = torch.tensor(attention_mask).unsqueeze(0)
pred = classifier(ids, mask)

# Keep the three highest-scoring classes and map them to topic names.
top_val, top_idx = torch.topk(pred[0], 3, dim=1)
pred_categories = model_classes[top_idx].tolist()[0]
topics = [cat_map[category] for category in pred_categories]

### Encode query to embedding and return top item
# encode user query
query_embedding = sentence_transformer.encode(query)

# filter df to relevant categories and grab embeddings
in_predicted_category = df['categories'].isin(pred_categories)
relevant_docs = df[in_predicted_category]
relevant_embeddings = embeddings[relevant_docs.index]

# Calculate cosine similarity of user query and relevant embeddings
sims = [cosine(query_embedding, doc) for doc in relevant_embeddings]

# Rank documents by descending score and take the first one.
top_matches = np.argsort(sims)[::-1]
best_position = top_matches[0]
top_item = relevant_docs.iloc[best_position]

print(f'User Query: {query}\n')
print(f'Predicted Topics {topics}\n')
print(f'Recomended Paper:\n {top_item.title}\n')
print(f'Abstract:\n{top_item.abstract}\n')
def compute_loss(self, input_image, pca_render, gcn_render, pca_texture, gcn_texture, proj_color, pca_color, gcn_color, input_feat, gcn_feat, regularization, get_inter=False):
    """Adds to the inference model the layers required to generate loss.

    Builds the individual loss terms, combines a subset of them into the data
    loss, adds the scaled regularization term and registers a scalar summary
    for every term.

    Only ``project_loss``, ``perception_loss``, ``sym_loss`` and ``var_loss``
    (weighted by ``self.ph_ren_lambda``) and ``refine_loss`` (weighted by
    ``self.ph_ref_lambda``) enter the optimised loss. ``pca_render_loss`` and
    ``gcn_render_loss`` are only summarised and returned.

    ``get_inter`` is referenced only by commented-out code and has no effect.

    :return: tuple ``(loss, pca_render_loss, gcn_render_loss, project_loss,
        refine_loss, perception_loss, var_loss, sym_loss)``
    """
    # NOTE(review): the scope/summary nesting below is the presumed layout;
    # confirm against the original indentation.
    with tf.name_scope('loss'):
        with tf.name_scope('data_loss'):
            # Channels from index 3 onward are treated as masks
            # (presumably an alpha/skin channel; TODO confirm).
            skin_mask = self._erosion2d(input_image[..., 3:])
            gcn_render_mask = tf.round(gcn_render[..., 3:]) * skin_mask

            # Masked absolute error, summed and normalised by the mask sum.
            # pca_render_loss = tf.losses.mean_squared_error(
            pca_render_loss = tf.losses.absolute_difference(
                predictions=pca_render[..., :3] * gcn_render_mask,
                labels=input_image[..., :3] * gcn_render_mask,
                reduction=tf.losses.Reduction.SUM) / tf.reduce_sum(gcn_render_mask)

            # gcn_render_loss = tf.losses.mean_squared_error(
            gcn_render_loss = tf.losses.absolute_difference(
                predictions=gcn_render[..., :3] * gcn_render_mask,
                labels=input_image[..., :3] * gcn_render_mask,
                reduction=tf.losses.Reduction.SUM) / tf.reduce_sum(gcn_render_mask)

            # Where proj_color[..., 3:] is set, gcn_color is compared with the
            # projected colour; elsewhere it is compared with pca_color.
            # project_loss_image = tf.losses.mean_squared_error(
            project_loss_image = tf.losses.absolute_difference(
                predictions=gcn_color * proj_color[..., 3:],
                labels=proj_color[..., :3] * proj_color[..., 3:],
                reduction=tf.losses.Reduction.MEAN)

            # project_loss_pca = tf.losses.mean_squared_error(
            project_loss_pca = tf.losses.absolute_difference(
                predictions=gcn_color * (1 - proj_color[..., 3:]),
                labels=pca_color * (1 - proj_color[..., 3:]),
                reduction=tf.losses.Reduction.MEAN)

            project_loss = project_loss_image + 0.3 * project_loss_pca

            # refine_loss = tf.losses.mean_squared_error(
            refine_loss = tf.losses.absolute_difference(predictions=gcn_texture, labels=pca_texture, reduction=tf.losses.Reduction.MEAN)

            # One minus the mean of utils.cosine over the two feature inputs.
            perception_loss = 1 - tf.reduce_mean(utils.cosine(input_feat, gcn_feat))

            # Variance of each of the three texture channels over the vertices
            # selected by self.bfm.skin_index.
            var_losses = []
            gcn_skin_texture = tf.gather(gcn_texture, self.bfm.skin_index, axis=1)
            for i in range(3):
                _, variance = tf.nn.moments(gcn_skin_texture[..., i], axes=1)
                var_losses.append(variance)
            var_loss = tf.reduce_mean(var_losses)

            # Texture difference between the vertices indexed by left_index
            # and right_index; 1e-16 keeps the sqrt argument strictly positive.
            sym_diff = tf.gather(gcn_texture, self.bfm.left_index, axis=1) - tf.gather(gcn_texture, self.bfm.right_index, axis=1)
            sym_loss = tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(sym_diff) + 1e-16, axis=-1)))

            # adj_tensor = tf.constant(self.adjacent.reshape(
            #     [1, self.num_vert, self.num_vert, 1]),
            #     dtype=tf.int32,
            #     shape=[1, self.num_vert, self.num_vert, 1])
            # coo = self.adjacent.tocoo()
            # indices = np.mat([0, self.adjacent.row, self.adjacent.col, 0]).transpose()
            # values = np.ones_like(self.adjacent.data, np.float32)
            # adj_tensor = tf.SparseTensor(indices, values, self.adjacent.shape)
            # # adj_tensor = tf.SparseTensor(self.adjacent.indices,
            # #                              np.clip(self.adjacent.data, 0, 1),
            # #                              self.adjacent.shape)
            # expand = tf.ones([1, self.num_vert, self.num_vert, 3], dtype=tf.float32)
            # expand = expand * tf.expand_dims(gcn_texture, axis=1)
            # exp_trans = tf.transpose(expand, [0, 2, 1, 3])
            # # vertical = tf.ones([self.num_vert, self.num_vert, 3], dtype=tf.float32)
            # # vertical = vertical * tf.expand_dims(gcn_texture, axis=2)
            # smooth_loss = tf.abs((expand - exp_trans) * adj_tensor)
            # test = tf.sparse_to_dense(smooth_loss.indices, )
            #TODO: need attention
            # data_loss = self.ph_ref_lambda * refine_loss + self.ph_ren_lambda * (
            #     gcn_render_loss + 0.2 * project_loss +
            #     0.2 * perception_loss) + 0.1 * sym_loss
            data_loss = self.ph_ref_lambda * refine_loss + self.ph_ren_lambda * (
                project_loss + 0.2 * perception_loss + 0.5 * sym_loss + 0.01 * var_loss)

            # if not get_inter:
            #     self.skin_mask = skin_mask
            #     self.gcn_render_mask = gcn_render_mask
            #     self.gcn_render_image = gcn_render[..., :3]
            #     self.input_image_rgb = input_image[..., :3]
            #     self.pca_render_image = pca_render[..., :3]

        with tf.name_scope('regularization'):
            # The incoming factor is scaled by the sum of self.regularizers.
            regularization *= tf.add_n(self.regularizers)

        loss = data_loss + regularization

        tf.summary.scalar('loss/data_loss', data_loss)
        tf.summary.scalar('loss/pca_render_loss', pca_render_loss)
        tf.summary.scalar('loss/gcn_render_loss', gcn_render_loss)
        tf.summary.scalar('loss/project_loss', project_loss)
        tf.summary.scalar('loss/refine_loss', refine_loss)
        tf.summary.scalar('loss/perception_loss', perception_loss)
        tf.summary.scalar('loss/var_loss', var_loss)
        tf.summary.scalar('loss/sym_loss', sym_loss)
        tf.summary.scalar('loss/regularization', regularization)
        logger.info('Successfully Computed Losses')

        return loss, pca_render_loss, gcn_render_loss, project_loss, refine_loss, perception_loss, var_loss, sym_loss
def sim(a, b):
    "Return the cosine of the vectors stored in vec for a and b"
    first = vec[a]
    second = vec[b]
    return cosine(first, second)