def show_similar(embeds, labels, n_examples=10, n_nearby=6):
    """Print nearest-neighbour sentences for a random window of queries.

    Picks a random contiguous window of `n_examples` rows from `embeds`,
    finds their `n_nearby` nearest neighbours via `_find_nearest`, and
    prints each query sentence with its neighbours and distances.

    :param embeds: 2-D array of sentence embeddings (rows align with labels)
    :param labels: sequence of sentence strings, one per embedding row
    :param n_examples: number of consecutive query sentences to sample
    :param n_nearby: number of neighbours to report per query
    """
    # Gather a random set of queries (sentence embeddings that we'll compare
    # against the full embedding matrix).
    query_label_idx = random.randint(0, len(labels) - n_examples)
    query_vectors = embeds[query_label_idx:query_label_idx + n_examples, :]
    # Find indices for the embeddings that are nearest to the queries (timed).
    t = time.time()
    indices, dists = _find_nearest(query_vectors, embeds, n_nearby,
                                   batch_size=1000000)
    # Converted from Python 2 `print` statements — the rest of the project
    # uses Python 3 syntax (f-strings), where the old form is a SyntaxError.
    print(indices.shape, dists.shape)
    t = time.time() - t
    query_sentences = labels[query_label_idx:query_label_idx + dists.shape[0]]
    for query_num, query_sentence in enumerate(query_sentences):
        print("*******************************************************************")
        print(query_sentence)
        dist_row = dists[query_num, :]
        index_row = indices[query_num, :]
        for dist, idx in zip(dist_row, index_row):
            print(dist, labels[idx])
    print('Took {} seconds ({} s/per query)'.format(t, t / n_examples))


def _show_similar_gensim(embeds, labels, n_samples=10):
    """Gensim-based alternative to `show_similar` (not called anywhere).

    This was previously dead code sitting after a bare `return` inside
    `show_similar`; it is kept as a separate, documented helper instead.
    TODO: gensim is much faster but doesn't handle duplicate entries very
    well, and goes OOM.
    """
    kv = KeyedVectors(embeds.shape[-1])
    kv.add(labels, embeds)
    random_labels = random.sample(labels, n_samples)
    for label in random_labels:
        print(label)
        for tup in kv.most_similar(label):
            print(tup)
        print('--------------------------------------------------')
def is_correct(kv: KeyedVectors, analogy: Analogy, at: int = 1) -> bool:
    """Check whether an analogy is solved within the top-`at` predictions.

    The analogy entries may hold several alternative surface forms joined
    by '/'; the analogy counts as correct if any target form appears among
    the top-`at` most similar words.
    """
    positives = analogy[0].split('/') + analogy[2].split('/')
    negatives = analogy[1].split('/')
    targets = analogy[3].split('/')
    ranked = kv.most_similar(positive=positives, negative=negatives, topn=at)
    predicted = {word for word, _score in ranked}
    # True iff at least one acceptable target was predicted.
    return not predicted.isdisjoint(targets)
def plot_word_combined(word: str, embedding: KeyedVectors, pca_model: PCA,
                       modifiers: [str]):
    """Build a report string of neighbours for each `word + modifier` form.

    For every modifier, collects the embedding of `word + mod` and its 10
    nearest neighbours, then renders one line per modifier listing the
    collected words.

    :param word: base word to combine with each modifier
    :param embedding: word vectors used for lookup and similarity
    :param pca_model: kept for interface compatibility; no longer used (see
        note below)
    :param modifiers: suffixes appended to `word`
    :return: multi-line report string
    """
    res = []
    # Fixed: the index from enumerate() was unused in this loop.
    for mod in modifiers:
        words_vectors = {}
        words_vectors[word + mod] = embedding[word + mod]
        similar = embedding.most_similar(positive=[word + mod], topn=10)
        for sim_word, _ in similar:
            words_vectors[sim_word] = embedding[sim_word]
        res.append(words_vectors)
    # Apply dimensionality reduction before plotting.
    # Not in use anymore because it didn't help much when analyzing the
    # outputs. Instead we just produce an output string.
    #words_vectors_dim_reduced = {word: pca_model.transform(words_vectors[word].reshape(1, -1)) for word in words_vectors}
    res_string = ""
    for i, dct in enumerate(res):
        res_string += "Word:" + word + modifiers[i]
        res_string += str(list(dct.keys()))
        res_string += '\n'
    return res_string
elif (args.entities == 'outin' and args.elentities == 'inout' or args.entities == 'inout' and args.elentities == 'outin'): entity_vec = entityv.word_vec(entity, use_norm=args.norm) positive.append( np.concatenate( (entity_vec[model.vector_size:], entity_vec[:model.vector_size])) * score) else: raise Exception("Configuration is not supported") else: print( 'entity {} doesn\'t have an embedding'.format(entity)) if not positive: print( 'No vocab tokens for query {}: {}! Using zero vector for "positive".' .format(qid, ' '.join(qtokens))) positive.append(np.zeros(entityv.vector_size)) for i, (entity, score) in enumerate( entityv.most_similar(positive=positive, topn=1000)): print(qid, 'Q0', entity, i + 1, score, 'kewer', sep=' ', file=out_file)
def most_similar(expr, wv: KeyedVectors):
    """Parse `expr` and return the 3 most similar keys from `wv`."""
    parsed = parse(expr)
    return wv.most_similar(parsed, topn=3)
lower_cas.append(i) # print(lower_cas) for i in vip: lower_cas.append(i) # print(lower_cas) lower_ca = [] for i in lower_cas: pattern = re.compile(r'\d[:/]\d[:/]\d\d\d\d') matches = pattern.finditer(i) count = 0 for match in matches: count += 1 if count == 0: lower_ca.append(i) final_lst = [] final_lst.append(lower_ca) print(final_lst) model = Word2Vec(final_lst, size=1, window=5, min_count=1, workers=4) word_vectors = model.wv fname = get_tmpfile("vectors.kv") # word_vectors.save(fname) # print(model.predict_output_word(['dsds'])) # print(model.most_similar('issu')) # tok=['dsds','gsd'] word_vectors = KeyedVectors.load(fname, mmap='r') # print(KeyedVectors.most_similar_to_given(self=word_vectors, entity1=tok,entities_list=['PriSize Estimation', 'Other Tools', 'AVMCommonPortal_L2'])) print(KeyedVectors.most_similar(self=word_vectors, positive=['sender']))
# text += print_info_length(corpus_labels, lines_corpus_splitted, "corpus docs" + conf, "words", True) text += print_info_length(queries_labels, lines_queries_splitted, "queries" + conf, "words", True) text += '\n' + str(corpus_model) print("done.") w1 = "night" outv = KeyedVectors(300) outv.vocab = corpus_model.wv.vocab # same outv.index2word = corpus_model.wv.index2word # same outv.syn0 = corpus_model.syn1neg # different text += '\nIN EMBEDDINGS COMPARISON:\n' + str( corpus_model.wv.most_similar(positive=[corpus_model[w1]], topn=6)) print("IN-IN done.") text += '\nOUT EMBEDDINGS COMPARISON:\n' + str( outv.most_similar(positive=[outv[w1]], topn=6)) print("OUT-OUT done.") text += '\nIN-OUT EMBEDDINGS COMPARISON:\n' + str( corpus_model.wv.most_similar(positive=[outv[w1]], topn=6)) print("IN-OUT done.") text += '\nOUT-IN EMBEDDINGS COMPARISON:\n' + str( outv.most_similar(positive=[corpus_model[w1]], topn=6)) print("OUT-IN done.") with open("data_analysis/data_analysis" + conf + ".txt", 'w') as file: file.write(text)
elif not (entity.startswith('entity:') or entity.startswith('relation:')): wordv_entities.append(entity) wordv_weights.append(embedding) print('entities:', entityv_entities[:4]) print('words:', wordv_entities[:4]) entityv = KeyedVectors(entityv_weights[0].shape[0]) entityv.add(entityv_entities, entityv_weights) wordv = KeyedVectors(wordv_weights[0].shape[0]) wordv.add(wordv_entities, wordv_weights) wordv.init_sims() print(entityv.most_similar(positive=[wordv['detroit']])) with open(args.outfile, 'w') as out_file: for qid, qtokens in queries.items(): if args.el: if args.elremove: for entity in qid_entities[qid]['entities']: if '<{}>'.format(entity) in entityv: qtokens = list( set(qtokens) - set(qid_entities[qid]['surface_tokens'][entity])) else: print( 'not removing tokens for entity {} because it doesn\'t have an embedding' .format(entity)) elif args.elremoveall and qid_entities[qid]['entities']:
def cluster_balls(
    model: KeyedVectors,
    root: str = None,
    max_size: int = None,
    min_score: float = None,
):
    """
    Cluster a model's keys by applying a revisited Radial Ball Mapper
    algorithm.

    A root key should be specified in case a point of interest is known.
    Not specifying any root key, a random one is picked from the model.
    If not otherwise specified, a `max_size` of 30 is used by default.
    If not otherwise specified, `min_score` defaults to the smaller of the
    best neighbour similarity and the mean of the neighbours' scores minus
    a gap of 0.10 (note: the code uses 0.10, not 0.05).

    Parameters
    ----------
    model : KeyedVectors
        Word2Vec model which stores all keys and vectors.
    root : str
        Point of interest from which to start clustering balls, by default
        None.
    max_size : int, optional
        Maximum size of a ball in terms of number of keys, by default None.
    min_score : float, optional
        Minimum similarity threshold for starting a cluster, by default
        None.

    Returns
    -------
    List[List[str]]
        Clusters of keys (sets, root cluster first), or None when `root`
        is unknown or has no neighbours.
    """
    if root is None:
        rand_i = randrange(0, len(model.index_to_key))
        root = model.index_to_key[rand_i]
    elif root not in model:
        # Unknown root: nothing to cluster (implicitly returns None).
        return
    max_size = max_size or 30
    neighs = model.most_similar(root, topn=max_size)
    if not neighs:
        return
    if min_score is None:
        mean = _get_neighs_mean_score(model, neighs)
        min_score = min(neighs[0][1], mean - 0.10)
    clusters = []
    root_cluster = {root}
    # seen maps key -> (cluster it currently belongs to, score that placed
    # it there); the root gets a sentinel score of 1 so it is never stolen.
    seen = {root: (root_cluster, 1)}
    for n, s in neighs:
        if n in seen:
            continue
        if s >= min_score:
            # Close enough to the root: join the root ball directly.
            root_cluster.add(n)
            seen.setdefault(n, (root_cluster, s))
            continue
        # Too far from the root: open a secondary ball around n.
        cluster = set()
        min_sub_score = min_score + 0.10
        for nn, ss in model.most_similar(n, topn=max_size):
            if nn in seen:
                c, b = seen[nn]
                # Never steal from the root ball, nor from a stronger claim.
                if c == root_cluster or b >= ss:
                    continue
            if ss >= min_sub_score:
                if nn in seen:
                    # Reassign: remove from the weaker previous cluster.
                    prev_cluster = seen[nn][0]
                    prev_cluster.remove(nn)
                cluster.add(nn)
                seen[nn] = (cluster, ss)
        cluster.add(n)
        seen.setdefault(n, (cluster, 1))
        clusters.append(cluster)
        if len(cluster) < 3:
            continue
        # Prune at most one outlier from each sufficiently large ball.
        intruder = _get_intruder(model, cluster)
        if intruder is None:
            continue
        del seen[intruder]
        cluster.remove(intruder)
    clusters.insert(0, root_cluster)
    return clusters
class PoemsModel:
    """Semantic model over a corpus of (Russian) poems.

    Wraps a word2vec KeyedVectors model plus per-poem word bags and
    embedding matrices, and offers similarity queries between free-text
    queries and poems.
    """

    # Shared morphological analyzer for lemmatization/POS tagging.
    morph_analyzer = pymorphy2.MorphAnalyzer()

    def __init__(self, poems_model_file='', w2v_file=''):
        self.w2v = KeyedVectors()
        self.poems = []  # [str, str, ...]
        self.bags = []  # [[str, str, ...], ...]
        self.vocab = {}  # {word: count, ...}
        self.matrices = []  # [np.ndarray, ...]
        self.grammar_map = grammar_map_POS_TAGS
        if w2v_file:
            self.load_w2v_model(w2v_file)
        if poems_model_file:
            self.read(poems_model_file)

    def load_w2v_model(self, file_name: str) -> None:
        """Load binary word2vec vectors from `file_name` into self.w2v."""
        print("loading w2v_model...")
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        self.w2v = KeyedVectors.load_word2vec_format(file_name, binary=True,
                                                     encoding='utf-8')
        print("word2vec model '%s' loaded" % file_name)

    def canonize_words(self, words: list) -> list:
        """Lemmatize words, drop function words/names/stop words, and append
        the POS suffix expected by the word2vec vocabulary."""
        # Fixed: 'их' appeared twice in the original tuple.
        stop_words = ('быть', 'мой', 'наш', 'ваш', 'их', 'его', 'её',
                      'этот', 'тот', 'где', 'который', 'либо', 'нибудь',
                      'нет', 'да')
        normalized = []
        for w in words:
            forms = self.morph_analyzer.parse(w.lower())
            try:
                # Prefer the highest-scoring parse; tie-break on the parse
                # method recorded in methods_stack.
                form = max(forms,
                           key=lambda x: (x.score, x.methods_stack[0][2]))
            except Exception:
                # Fall back to the first parse when the key fails
                # (e.g. empty methods_stack); print it for debugging.
                form = forms[0]
                print(form)
            if not (form.tag.POS in ['PREP', 'CONJ', 'PRCL', 'NPRO', 'NUMR']
                    or 'Name' in form.tag
                    or 'UNKN' in form.tag
                    or form.normal_form in stop_words):  # 'ADJF'
                norm_word = form.normal_form.replace("ё", "е")
                normalized.append(norm_word +
                                  self.grammar_map.get(form.tag.POS, ''))
        return normalized

    def semantic_associations(self, bag: list, topn=10) -> list:
        """Return up to `topn` words associated with the in-vocabulary part
        of `bag`, or [] when nothing is in vocabulary."""
        positive_lst = [w for w in bag if w in self.w2v.vocab]
        if len(positive_lst) > 0:
            assoc_lst = self.w2v.most_similar(positive=positive_lst, topn=topn)
            return [a[0] for a in assoc_lst]
        else:
            print('empty association for bag:', bag)
            return []

    def bag_to_matrix(self, bag: list):
        """Stack the vectors of all in-vocabulary words of `bag`;
        returns an empty array when none are known."""
        mx = []
        for word in bag:
            try:
                mx.append(self.w2v[word])
            except KeyError:
                # Fixed: was a bare `except:`; only out-of-vocabulary
                # lookups should be skipped here.
                pass
        return np.vstack(mx) if len(mx) > 0 else np.array([])

    @staticmethod
    def read_poems(file_name: str) -> list:
        """Read a UTF-8 file of poems separated by blank lines; returns a
        list of lower-cased poems."""
        # Fixed: the file handle was never closed.
        with open(file_name, encoding='utf-8') as file:
            lines = file.readlines()
        poems = []
        poem = ""
        for line in lines:
            if len(line.strip()) == 0:
                if len(poem.strip()) > 0:
                    poems.append(poem.lower())
                poem = ""
            else:
                poem += line
        # Fixed: the final poem was silently dropped when the file did not
        # end with a blank line.
        if len(poem.strip()) > 0:
            poems.append(poem.lower())
        return poems

    @staticmethod
    def remove_punctuation(text: str) -> str:
        """Replace punctuation characters with spaces."""
        return re.sub(
            r""",|\.|!|\?|;|"|@|#|%|&|\*|\\|/|:|\+|-|'|\(|\)|\[|\]""", ' ',
            text)

    def make_bags(self, texts: list) -> (list, dict):
        """Build per-text bags of canonized words plus a corpus vocabulary
        with occurrence counts."""
        bags = []
        vocabulary = {}
        for txt in texts:
            bag = []  # {}
            clear_txt = self.remove_punctuation(txt)
            words = self.canonize_words(clear_txt.split())
            for w in words:
                if w not in bag:
                    bag.append(w)
                    # bag[w] = bag.get(w, 0) + 1
                # NOTE(review): counts every occurrence; confirm whether
                # document frequency (inside the `if`) was intended.
                vocabulary[w] = vocabulary.get(w, 0) + 1
            bags.append(bag)
        return bags, vocabulary

    def compile(self, poems_file: str = "", w2v_file: str = "",
                poems_reader: Callable[[str], list] = None) -> None:
        """Build the model from raw poems and/or a word2vec file."""
        if poems_file:
            if poems_reader is None:
                poems_reader = self.read_poems
            self.poems = poems_reader(poems_file)
            print('poem count:', len(self.poems))
            print('making word bags...')
            self.bags, self.vocab = self.make_bags(self.poems)
        if w2v_file:
            self.load_w2v_model(w2v_file)
        print("model is compiled")

    def read(self, file_name: str) -> None:
        """Load poems/bags/vocab from a pickle and rebuild the matrices."""
        with open(file_name, mode='rb') as file:
            print('reading pickle poems model...')
            data = pickle.load(file)
        self.poems = data['poems']
        self.bags = data['bags']
        self.vocab = data['vocab']
        print("vectorizing model...")
        self.matrices = [self.bag_to_matrix(bag) for bag in self.bags]
        print('model is loaded')

    def write(self, file_name: str) -> None:
        """Pickle poems/bags/vocab to `file_name` (matrices are rebuilt on
        read)."""
        with open(file_name, mode='wb') as file:
            data = {
                'poems': self.poems,
                'bags': self.bags,
                'vocab': self.vocab,
            }
            pickle.dump(data, file)

    def most_similar(self, positive="", negative="", topn=10) -> list:
        """Run word2vec most_similar over canonized positive/negative
        phrases; returns () when `positive` is empty."""
        pos_bag = self.canonize_words(positive.split())
        neg_bag = self.canonize_words(negative.split())
        return self.w2v.most_similar(pos_bag, neg_bag, topn) \
            if len(positive) > 0 else ()

    def semantic_levels(self, base_word: str, portions=(0.2, 0.3, 0.5),
                        vocab_count=100) -> (dict, np.ndarray):
        """Split the `vocab_count` nearest words of `base_word` into
        concentric levels sized by `portions`, down-weighting outer levels.

        `portions` default changed from a list to an equivalent tuple to
        avoid a mutable default argument.
        """
        if base_word not in self.w2v.vocab:
            return {}, []
        levels = dict()
        levels[0] = {base_word}
        for level in range(1, len(portions) + 1):
            levels[level] = set()
        matrix = [self.w2v.word_vec(base_word)]
        similars = self.w2v.most_similar(base_word, topn=vocab_count)
        notch = 0
        for level, portion in enumerate(portions):
            next_notch = notch + int(vocab_count * portion)
            for i in range(notch, next_notch):
                word = similars[i][0]
                levels[level + 1].add(word)
                # Outer levels contribute progressively weaker vectors.
                matrix.append(self.w2v.word_vec(word) * (1 - portions[level]))
            notch = next_notch
        return levels, np.vstack(matrix)

    @staticmethod
    @numba.jit
    def semantic_similarity_fast_log(mx1: np.ndarray,
                                     mx2: np.ndarray) -> float:
        """Size-normalized dot-product similarity with a log size boost."""
        return np.sum(np.dot(mx1, mx2.T)) * np.log10(mx2.size) / (mx2.size + mx1.size) \
            if mx1.size > 0 and mx2.size > 0 else 0.0

    @staticmethod
    @numba.jit
    def semantic_similarity_fast(mx1: np.ndarray, mx2: np.ndarray) -> float:
        """Size-normalized dot-product similarity."""
        return np.sum(np.dot(mx1, mx2.T)) / (mx2.size + mx1.size) \
            if mx1.size > 0 and mx2.size > 0 else 0.0

    def similar_poems_idx(self, query, topn=5) -> list:  # [(poem_idx, sim)]
        """Return indices of the `topn` poems most similar to `query`
        (a string or a pre-built matrix)."""
        query_mx = query
        if type(query) == str:
            clear_query = self.remove_punctuation(query)
            query_bag = self.canonize_words(clear_query.split())
            query_mx = self.bag_to_matrix(query_bag)
        if len(query_mx) == 0:
            return []
        similars = [(i, self.semantic_similarity_fast_log(query_mx, mx))
                    for i, mx in enumerate(self.matrices)]
        # heapq.nlargest avoids sorting the full list.
        return heapq.nlargest(topn, similars, key=lambda x: x[1])

    def similar_poems(self, query, topn=5) -> list:  # [(poem, sim)]
        """Return the `topn` (poem text, similarity) pairs for `query`."""
        return [(self.poems[idx], sim)
                for idx, sim in self.similar_poems_idx(query, topn)]
class VectorSpaceModel(object):
    """Base class for models that represent words as vectors.

    For now, this really is just a wrapper around the Gensim
    KeyedVectors / Word2Vec class.
    """

    def __init__(self, name=None):
        # Human-readable model name (e.g. derived from the vectors filename).
        self.name = name
        self.m = KeyedVectors()
        return

    @classmethod
    def load(cls, filename, modelname=None, **kwargs):
        """Load a model, dispatching on the file extension (.pkl → pickle)."""
        if filename.endswith('.pkl'):
            model = cls.load_pickle(filename, modelname=modelname, **kwargs)
        else:
            model = cls.load_w2v(filename, modelname=modelname, **kwargs)
        return model

    @classmethod
    def load_pickle(cls, filename, **kwargs):
        """Load a pickled model from `filename`."""
        debug("Loading pickled model from file {:}".format(filename))
        # Fixed: pickle.load() requires a file object, not a path string —
        # the original call raised a TypeError at runtime.
        with open(filename, 'rb') as f:
            model = pickle.load(f)
        return model

    @classmethod
    def load_w2v(cls, filename, modelname=None, **kwargs):
        """Load the model from disk."""
        debug("Loading word2vec model from file {:}".format(filename))
        if filename.endswith(".bin"):
            m = KeyedVectors.load_word2vec_format(filename, binary=True)
        else:
            m = KeyedVectors.load_word2vec_format(filename)
        model = cls()
        model.m = m
        if modelname is None:
            modelname = os.path.basename(filename)
            # Fixed: the unescaped pattern '.bin' treats '.' as a wildcard
            # and matched anywhere in the name; anchor it to a literal
            # '.bin' suffix.
            modelname = re.sub(r'\.bin$', '', modelname)
        model.name = modelname
        return model

    def save_pickle(self, filename):
        """Pickle this model to `filename`."""
        debug("Saving model {:} to pickle file {:}".format(self.name, filename))
        # Fixed: pickle.dump() requires a file object, not a path string.
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
        return

    def __getitem__(self, word):
        """Return the vector for `word`."""
        return (self.m[word])

    def most_similar(self, query, k=5):
        """Return the most similar words to the query.

        `query` can be either a string or a vector. If it is a string,
        then its vector will be looked up in the current VSM.
        """
        if type(query) is str:
            results = self.m.most_similar(query, topn=k)
        else:
            results = self.m.similar_by_vector(query, topn=k)
        return results

    def __repr__(self):
        # NOTE(review): `syn0` is the pre-gensim-4 attribute name (newer
        # gensim exposes `vectors`) — confirm the pinned gensim version.
        return "<VectorSpaceModel {:} with {:,} vectors>".format(
            repr(self.name), self.m.syn0.shape[0])
def get_evaluation_df(name, doc_model, hf_dataset, aspect, fold) -> Tuple[DataFrame, Dict]:
    """Evaluate `doc_model` document vectors against one test fold.

    Builds a test-only KeyedVectors index from `doc_model`, retrieves
    nearest neighbours per test paper, and computes IR metrics (precision,
    recall, MAP/MRR components, NDCG) for each cut-off in the module-level
    `top_ks` (NOTE(review): `top_ks`, `get_avg_precision`,
    `get_reciprocal_rank` and `compute_dcg_at_k` are defined elsewhere in
    this module — confirm).

    :param name: run name written into the result rows
    :param doc_model: KeyedVectors-like model of document embeddings
    :param hf_dataset: HuggingFace dataset identifier
    :param aspect: aspect used to select the test split
    :param fold: fold used to select the test split
    :return: (metrics DataFrame, mapping seed_id -> retrieved doc ids)
    """
    # Init dataframe
    metrics = [
        'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        'precision', 'recall', 'avg_p', 'reciprocal_rank', 'ndcg'
    ]
    df = pd.DataFrame([], columns=['name', 'aspect', 'fold', 'top_k'] + metrics)
    # Dataset
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir='./data/nlp_cache',
                           split=get_test_split(aspect, fold))
    logger.info(f'Test samples: {len(test_ds):,}')
    # Unique paper IDs in test set
    test_paper_ids = set(test_ds['from_paper_id']).union(
        set(test_ds['to_paper_id']))
    logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
    logger.info(f'Examples: {list(test_paper_ids)[:10]}')
    # Relevance mapping: symmetric relatedness from positive ('y') pairs.
    doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
    for row in test_ds:
        if row['label'] == 'y':
            a = row['from_paper_id']
            b = row['to_paper_id']
            doc_id2related_ids[a].add(b)
            doc_id2related_ids[b].add(a)
    # Filter for documents in test set
    test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
    test_doc_ids = []
    test_doc_vectors = []
    missed_doc_ids = 0
    for doc_id in doc_model.vocab:
        if doc_id in test_paper_ids:
            vec = doc_model.get_vector(doc_id)
            if len(vec) != doc_model.vector_size:
                raise ValueError(
                    f'Test document as invalid shape: {doc_id} => {vec.shape}'
                )
            test_doc_ids.append(doc_id)
            test_doc_vectors.append(vec)
        else:
            missed_doc_ids += 1
            # logger.warning(f'Document ID is not part of test set: {doc_id} ({type(doc_id)})')
    if len(test_doc_ids) != len(test_doc_vectors):
        raise ValueError(
            f'Test document IDs does not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}'
        )
    logger.info(
        f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})'
    )
    logger.info(f'Test document vectors: {len(test_doc_vectors)}')
    test_doc_model.add(test_doc_ids, test_doc_vectors)
    # Normalize so most_similar uses cosine similarity on unit vectors.
    test_doc_model.init_sims(replace=True)
    logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')
    # Actual evaluation
    # k2eval_rows = defaultdict(list)
    seed_ids_without_recommendations = []
    max_top_k = max(top_ks)
    eval_rows = {top_k: defaultdict(list)
                 for top_k in top_ks}  # top_k => metric_name => list of value
    seed_id2ret_docs = {}
    for seed_id in tqdm(
            test_paper_ids,
            desc=f'Evaluation ({name},aspect={aspect},fold={fold})'):
        try:
            rel_docs = doc_id2related_ids[seed_id]
            # Retrieve once at the largest cut-off, then slice per top_k.
            max_ret_docs = [
                d for d, score in test_doc_model.most_similar(seed_id,
                                                              topn=max_top_k)
            ]
            seed_id2ret_docs[seed_id] = max_ret_docs
            for top_k in top_ks:
                ret_docs = max_ret_docs[:top_k]
                rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))
                if ret_docs and rel_docs:
                    # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                    precision = rel_ret_docs_count / len(ret_docs)
                    # Recall = No. of relevant documents retrieved / No. of total relevant documents
                    recall = rel_ret_docs_count / len(rel_docs)
                    # Avg. precision (for MAP)
                    avg_p = get_avg_precision(ret_docs, rel_docs)
                    # Reciprocal rank (for MRR)
                    reciprocal_rank = get_reciprocal_rank(ret_docs, rel_docs)
                    # NDCG@k
                    predicted_relevance = [
                        1 if ret_doc_id in rel_docs else 0
                        for ret_doc_id in ret_docs
                    ]
                    true_relevances = [1] * len(rel_docs)
                    ndcg_value = compute_dcg_at_k(
                        predicted_relevance, top_k) / compute_dcg_at_k(
                            true_relevances, top_k)
                    # Save metrics
                    eval_rows[top_k]['retrieved_docs'].append(len(ret_docs))
                    eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                    eval_rows[top_k]['relevant_retrieved_docs'].append(
                        rel_ret_docs_count)
                    eval_rows[top_k]['precision'].append(precision)
                    eval_rows[top_k]['recall'].append(recall)
                    eval_rows[top_k]['avg_p'].append(avg_p)
                    eval_rows[top_k]['reciprocal_rank'].append(reciprocal_rank)
                    eval_rows[top_k]['ndcg'].append(ndcg_value)
        except (IndexError, ValueError, KeyError) as e:
            # Seeds missing from the vector index (or malformed data) are
            # skipped and counted, not fatal.
            seed_ids_without_recommendations.append(seed_id)
            logger.warning(
                f'Cannot retrieve recommendations for #{seed_id}: {e}')
    logger.info(
        f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
    )
    # Summarize evaluation: one row per top_k, mean over per-seed values.
    for top_k in top_ks:
        try:
            row = [name, aspect, fold, top_k]
            for metric in metrics:
                # mean over all metrics
                values = eval_rows[top_k][metric]
                if len(values) > 0:
                    row.append(np.mean(values))
                else:
                    row.append(None)
            df.loc[len(df)] = row
        except ValueError as e:
            logger.error(f'Cannot summarize row: {top_k} {fold} {metrics} {e}')
    return df, seed_id2ret_docs
def evaluate_vectors(hf_dataset: str, aspect: str, input_path: str, name: str,
                     folds: Union[str, list], top_ks: Union[str, list],
                     output_path: str):
    """Evaluate pre-computed document vectors over one or more test folds.

    Run with: $ ./eval_cli.py evaluate_vectors paperswithcode_aspects task ./output/pwc_doc_id2st.txt --name=sentence_transformers --folds=1,2,3,4 --top_ks=5,10,25,50 --output_path=./output/eval.csv

    :param hf_dataset: HuggingFace dataset identifier
    :param aspect: aspect used to select the test split
    :param input_path: word2vec-format file with document vectors
    :param name: run name written into the result rows
    :param folds: fold ids, as list or comma-separated string
    :param top_ks: cut-off values, as list or comma-separated string
    :param output_path: CSV file the summary rows are appended to
    :return: None (results are written to `output_path`)
    """
    # Normalize CLI-style arguments.
    if isinstance(folds, str):
        folds = folds.split(',')
    elif isinstance(folds, int):
        folds = [folds]
    if isinstance(top_ks, str):
        # Fixed: the values must be ints — string top-ks made max() order
        # lexicographically ('5' > '25') and list slicing below raise an
        # uncaught TypeError.
        top_ks = [int(k) for k in top_ks.split(',')]
    elif isinstance(top_ks, int):
        top_ks = [top_ks]
    logger.info(f'Folds: {folds}')
    logger.info(f'Top-Ks: {top_ks}')
    if len(folds) < 1:
        logger.error('No folds provided')
        return
    if len(top_ks) < 1:
        logger.error('No top-k values provided')
        return
    # Load documents
    doc_model = KeyedVectors.load_word2vec_format(input_path)
    logger.info(f'Document vectors: {doc_model.vectors.shape}')
    # Normalize vectors so most_similar uses cosine on unit vectors.
    doc_model.init_sims(replace=True)
    # Init dataframe
    metrics = [
        'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        'precision', 'recall', 'avg_p', 'reciprocal_rank'
    ]
    df = pd.DataFrame([], columns=['name', 'fold', 'top_k'] + metrics)
    # Iterate over folds
    for fold in folds:
        logger.info(f'Current fold: {fold}')
        # Dataset
        test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                               name='relations',
                               cache_dir='./data/nlp_cache',
                               split=get_test_split(aspect, fold))
        logger.info(f'Test samples: {len(test_ds):,}')
        # Unique paper IDs in test set
        test_paper_ids = set(test_ds['from_paper_id']).union(
            set(test_ds['to_paper_id']))
        logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
        logger.info(f'Examples: {list(test_paper_ids)[:10]}')
        # Relevance mapping: symmetric relatedness from positive ('y') pairs.
        doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
        for row in test_ds:
            if row['label'] == 'y':
                a = row['from_paper_id']
                b = row['to_paper_id']
                doc_id2related_ids[a].add(b)
                doc_id2related_ids[b].add(a)
        # Filter for documents in test set
        test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
        test_doc_ids = []
        test_doc_vectors = []
        missed_doc_ids = 0
        for doc_id in doc_model.vocab:
            if doc_id in test_paper_ids:
                vec = doc_model.get_vector(doc_id)
                if len(vec) != doc_model.vector_size:
                    # Fixed typo in message: "as" -> "has".
                    raise ValueError(
                        f'Test document has invalid shape: {doc_id} => {vec.shape}'
                    )
                test_doc_ids.append(doc_id)
                test_doc_vectors.append(vec)
            else:
                missed_doc_ids += 1
        if len(test_doc_ids) != len(test_doc_vectors):
            raise ValueError(
                f'Test document IDs does not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}'
            )
        logger.info(
            f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})'
        )
        logger.info(f'Test document vectors: {len(test_doc_vectors)}')
        test_doc_model.add(test_doc_ids, test_doc_vectors)
        test_doc_model.init_sims(replace=True)
        logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')
        # Actual evaluation
        seed_ids_without_recommendations = []
        max_top_k = max(top_ks)
        eval_rows = {top_k: defaultdict(list)
                     for top_k in top_ks}  # top_k => metric_name => list of value
        for seed_id in tqdm(test_paper_ids, desc=f'Evaluation (fold={fold})'):
            try:
                rel_docs = doc_id2related_ids[seed_id]
                # Retrieve once at the largest cut-off, slice per top_k.
                max_ret_docs = [
                    d for d, score in test_doc_model.most_similar(
                        seed_id, topn=max_top_k)
                ]
                for top_k in top_ks:
                    ret_docs = max_ret_docs[:top_k]
                    rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))
                    if ret_docs and rel_docs:
                        # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                        precision = rel_ret_docs_count / len(ret_docs)
                        # Recall = No. of relevant documents retrieved / No. of total relevant documents
                        recall = rel_ret_docs_count / len(rel_docs)
                        # Avg. precision (for MAP)
                        avg_p = get_avg_precision(ret_docs, rel_docs)
                        # Reciprocal rank (for MRR)
                        reciprocal_rank = get_reciprocal_rank(
                            ret_docs, rel_docs)
                        # NDCG@k intentionally disabled here (see the
                        # aspect-aware get_evaluation_df variant).
                        # Save metrics
                        eval_rows[top_k]['retrieved_docs'].append(
                            len(ret_docs))
                        eval_rows[top_k]['relevant_docs'].append(
                            len(rel_docs))
                        eval_rows[top_k]['relevant_retrieved_docs'].append(
                            rel_ret_docs_count)
                        eval_rows[top_k]['precision'].append(precision)
                        eval_rows[top_k]['recall'].append(recall)
                        eval_rows[top_k]['avg_p'].append(avg_p)
                        eval_rows[top_k]['reciprocal_rank'].append(
                            reciprocal_rank)
            except (IndexError, ValueError, KeyError) as e:
                # Seeds missing from the index are counted, not fatal.
                seed_ids_without_recommendations.append(seed_id)
                logger.warning(
                    f'Cannot retrieve recommendations for #{seed_id}: {e}')
        logger.info(
            f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
        )
        # Summarize evaluation: one row per top_k, mean over per-seed values.
        for top_k in top_ks:
            try:
                row = [name, fold, top_k]
                for metric in metrics:
                    # mean over all metrics
                    values = eval_rows[top_k][metric]
                    if len(values) > 0:
                        row.append(np.mean(values))
                    else:
                        row.append(None)
                df.loc[len(df)] = row
            except ValueError as e:
                logger.error(
                    f'Cannot summarize row: {top_k} {fold} {metrics} {e}')
    logger.info(f'Writing {len(df)} rows to {output_path}')
    if os.path.exists(output_path):
        # Append new rows to evaluation file
        df.to_csv(output_path, mode='a', header=False, index=False)
    else:
        # Write new files
        df.to_csv(output_path, header=True, index=False)
    logger.info('Done')
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """
    This is called during training to evaluate the model.
    It returns a score for the evaluation with a higher score indicating a better result.

    :param model: the model to evaluate
    :param output_path: path where predictions and metrics are written to
    :param epoch: the epoch where the evaluation takes place. This is used for the file prefixes.
        If this is -1, then we assume evaluation on test data.
    :param steps: the steps in the current epoch at time of the evaluation. This is used for the
        file prefixes. If this is -1, then we assume evaluation at the end of the epoch.
    :return: a score for the evaluation with a higher score indicating a better result
    """
    # idx2paper_id = {}
    # paper_id2idx = {}
    # texts = []
    # paper_ids = []
    #
    # # get document texts
    # for idx, paper_id in enumerate(self.test_paper_ids):
    #     idx2paper_id[idx] = paper_id
    #     paper_id2idx[paper_id] = idx
    #     doc = self.doc_id2doc[paper_id]
    #     texts.append(doc['title'] + ': ' + doc['abstract'])
    #     paper_ids.append(paper_id)
    logger.info('Encode test documents...')
    # Encode all pre-tokenized test documents with the model under training.
    embeddings = model.encode(self.tokenized_texts,
                              is_pretokenized=True,
                              batch_size=self.batch_size,
                              show_progress_bar=self.show_progress_bar,
                              convert_to_numpy=True)
    # Filter for documents in test set: build a KeyedVectors index over the
    # freshly encoded embeddings keyed by paper id.
    test_doc_model = KeyedVectors(
        vector_size=model.get_sentence_embedding_dimension())
    #for idx, embedding in enumerate(embeddings):
    #    test_doc_model.add([idx2paper_id[idx]], [embedding])
    test_doc_model.add(self.paper_ids, embeddings.tolist())
    # Normalize so most_similar uses cosine similarity on unit vectors.
    test_doc_model.init_sims(replace=True)
    logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')
    # Init dataframe
    metrics = [
        'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        'precision', 'recall', 'avg_p', 'reciprocal_rank', 'ndcg'
    ]
    df = pd.DataFrame([], columns=['epoch', 'steps', 'top_k'] + metrics)
    max_top_k = max(self.top_ks)
    eval_rows = {top_k: defaultdict(list)
                 for top_k in self.top_ks}  # top_k => metric_name => list of value
    seed_ids_without_recommendations = []
    for seed_id in tqdm(self.test_paper_ids, desc=f'Evaluation'):
        try:
            rel_docs = self.doc_id2related_ids[seed_id]
            # Retrieve once at the largest cut-off, then slice per top_k.
            max_ret_docs = [
                d for d, score in test_doc_model.most_similar(seed_id,
                                                              topn=max_top_k)
            ]
            for top_k in self.top_ks:
                ret_docs = max_ret_docs[:top_k]
                rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))
                if ret_docs and rel_docs:
                    # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                    precision = rel_ret_docs_count / len(ret_docs)
                    # Recall = No. of relevant documents retrieved / No. of total relevant documents
                    recall = rel_ret_docs_count / len(rel_docs)
                    # Avg. precision (for MAP)
                    avg_p = get_avg_precision(ret_docs, rel_docs)
                    # Reciprocal rank (for MRR)
                    reciprocal_rank = get_reciprocal_rank(ret_docs, rel_docs)
                    # NDCG@k
                    predicted_relevance = [
                        1 if ret_doc_id in rel_docs else 0
                        for ret_doc_id in ret_docs
                    ]
                    true_relevances = [1] * len(rel_docs)
                    ndcg_value = self.compute_dcg_at_k(
                        predicted_relevance, top_k) / self.compute_dcg_at_k(
                            true_relevances, top_k)
                    # Save metrics
                    eval_rows[top_k]['retrieved_docs'].append(len(ret_docs))
                    eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                    eval_rows[top_k]['relevant_retrieved_docs'].append(
                        rel_ret_docs_count)
                    eval_rows[top_k]['precision'].append(precision)
                    eval_rows[top_k]['recall'].append(recall)
                    eval_rows[top_k]['avg_p'].append(avg_p)
                    eval_rows[top_k]['reciprocal_rank'].append(
                        reciprocal_rank)
                    eval_rows[top_k]['ndcg'].append(ndcg_value)
        except (IndexError, ValueError, KeyError) as e:
            # Seeds missing from the index (or malformed) are counted, not
            # fatal for the whole evaluation.
            seed_ids_without_recommendations.append(seed_id)
            logger.warning(
                f'Cannot retrieve recommendations for #{seed_id}: {e}')
    logger.info(
        f'Completed with {len(eval_rows[self.top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
    )
    # Summarize evaluation: one row per top_k, mean over per-seed values.
    for top_k in self.top_ks:
        try:
            row = [epoch, steps, top_k]
            for metric in metrics:
                # mean over all metrics
                values = eval_rows[top_k][metric]
                if len(values) > 0:
                    row.append(np.mean(values))
                else:
                    row.append(None)
            df.loc[len(df)] = row
        except ValueError as e:
            logger.error(f'Cannot summarize row: {top_k} {metrics} {e}')
    output_csv_path = os.path.join(output_path, self.csv_file)
    logger.info(f'Writing {len(df)} rows to {output_csv_path}')
    logger.info(f'Results:\n{df.to_markdown()}')
    if os.path.exists(output_csv_path):
        # Append new rows to evaluation file
        df.to_csv(output_csv_path, mode='a', header=False, index=False)
    else:
        # Write new files
        df.to_csv(output_csv_path, header=True, index=False)
    # Return score from main metric — taken from the FIRST row, i.e. the
    # smallest configured top_k.
    if len(df) > 0:
        main_score = df.iloc[0][self.main_metric]
        logger.info(
            f'Evaluation completed: {self.main_metric} = {main_score}')
        return main_score
    else:
        logger.warning('No evaluation rows available... score = 0')
        return 0