Example #1
def show_similar(embeds, labels, n_examples=10, n_nearby=6):
    # Gather a random set of queries (sentence embeddings that we'll compare against)
    query_label_idx = random.randint(0, len(labels) - n_examples)
    query_vectors = embeds[query_label_idx:query_label_idx + n_examples, :]
    # Find indices for the embeddings that are nearest to the queries
    t = time.time()
    indices, dists = _find_nearest(query_vectors,
                                   embeds,
                                   n_nearby,
                                   batch_size=1000000)
    print(indices.shape, dists.shape)
    t = time.time() - t
    query_sentences = labels[query_label_idx:query_label_idx + dists.shape[0]]
    for query_num, query_sentence in enumerate(query_sentences):
        print "*******************************************************************"
        print query_sentence
        dist_row = dists[query_num, :]
        index_row = indices[query_num, :]
        for dist, idx in zip(dist_row, index_row):
            print(dist, labels[idx])
    print('Took {} seconds ({} s per query)'.format(t, t / n_examples))
    return

    # TODO: gensim is much faster but doesn't handle duplicate entries very well, and goes OOM
    kv = KeyedVectors(embeds.shape[-1])
    kv.add(labels, embeds)
    random_labels = random.sample(labels, 10)
    for label in random_labels:
        print(label)
        for tup in kv.most_similar(label):
            print(tup)
        print('--------------------------------------------------')
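`_find_nearest` is not defined in this snippet. A minimal brute-force sketch that matches the call above (batched cosine similarity over the full embedding matrix; the batching scheme and the exact return semantics are assumptions) could look like this:

import numpy as np

def _find_nearest(query_vectors, embeds, n_nearby, batch_size=1000000):
    # Normalize once so plain dot products become cosine similarities.
    queries = query_vectors / np.linalg.norm(query_vectors, axis=1, keepdims=True)
    idx_parts, sim_parts = [], []
    for start in range(0, embeds.shape[0], batch_size):
        batch = embeds[start:start + batch_size]
        batch = batch / np.linalg.norm(batch, axis=1, keepdims=True)
        sims = queries @ batch.T                          # (n_queries, batch)
        top = np.argsort(-sims, axis=1)[:, :n_nearby]     # best columns per query
        idx_parts.append(top + start)                     # map back to global indices
        sim_parts.append(np.take_along_axis(sims, top, axis=1))
    # Merge per-batch candidates and keep the overall top n_nearby per query.
    indices = np.concatenate(idx_parts, axis=1)
    sims = np.concatenate(sim_parts, axis=1)
    order = np.argsort(-sims, axis=1)[:, :n_nearby]
    return (np.take_along_axis(indices, order, axis=1),
            np.take_along_axis(sims, order, axis=1))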
Example #2
def is_correct(kv: KeyedVectors, analogy: Analogy, at: int = 1) -> bool:
    pos = analogy[0].split('/') + analogy[2].split('/')
    neg = analogy[1].split('/')
    tgt = analogy[3].split('/')

    sim_words = set(
        w for w, _ in kv.most_similar(positive=pos, negative=neg, topn=at))
    return any(t in sim_words for t in tgt)
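A hypothetical call, assuming `Analogy` is a 4-tuple of '/'-separated surface forms; the model path and words below are illustrative only:

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)  # assumed path
analogy = ('king', 'man', 'woman', 'queen/queens')  # a - b + c ~ d, with alternative target forms
print(is_correct(kv, analogy, at=5))  # True if any target form appears in the top 5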
Example #3
def plot_word_combined(word: str, embedding: KeyedVectors, pca_model: PCA,
                       modifiers: [str]):
    res = []
    for i, mod in enumerate(modifiers):
        words_vectors = {}
        words_vectors[word + mod] = embedding[word + mod]
        similar = embedding.most_similar(positive=[word + mod], topn=10)
        for sim_word, _ in similar:
            words_vectors[sim_word] = embedding[sim_word]
        res.append(words_vectors)

    # Apply dimensionality reduction before plotting
    # not in use anymore because it didn't help much when analyzing the outputs
    # Instead we just produce an output string
    #words_vectors_dim_reduced = {word: pca_model.transform(words_vectors[word].reshape(1, -1)) for word in words_vectors}
    res_string = ""
    for i, dct in enumerate(res):
        res_string += "Word:" + word + modifiers[i]
        res_string += str(list(dct.keys()))
        res_string += '\n'
    return res_string
Example #4
                    elif (args.entities == 'outin'
                          and args.elentities == 'inout'
                          or args.entities == 'inout'
                          and args.elentities == 'outin'):
                        entity_vec = entityv.word_vec(entity,
                                                      use_norm=args.norm)
                        positive.append(
                            np.concatenate(
                                (entity_vec[model.vector_size:],
                                 entity_vec[:model.vector_size])) * score)
                    else:
                        raise Exception("Configuration is not supported")
                else:
                    print(
                        'entity {} doesn\'t have an embedding'.format(entity))
        if not positive:
            print(
                'No vocab tokens for query {}: {}! Using zero vector for "positive".'
                .format(qid, ' '.join(qtokens)))
            positive.append(np.zeros(entityv.vector_size))
        for i, (entity, score) in enumerate(
                entityv.most_similar(positive=positive, topn=1000)):
            print(qid,
                  'Q0',
                  entity,
                  i + 1,
                  score,
                  'kewer',
                  sep=' ',
                  file=out_file)
Example #5
File: ex9.py  Project: mat-hek/pjn
def most_similar(expr, wv: KeyedVectors):
    return wv.most_similar(parse(expr), topn=3)
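`parse` comes from the surrounding project and is not shown. A minimal stand-in that simply treats the expression as a list of query words (the real implementation may lemmatize or weight terms) is:

def parse(expr: str) -> list:
    # Naive placeholder: split the expression into plain query words.
    return expr.split()

# Hypothetical usage with an already loaded KeyedVectors instance `wv`:
# print(most_similar('king queen', wv))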
Example #6
            lower_cas.append(i)

# print(lower_cas)
for i in vip:
    lower_cas.append(i)
# print(lower_cas)
lower_ca = []
for i in lower_cas:
    pattern = re.compile(r'\d[:/]\d[:/]\d\d\d\d')
    matches = pattern.finditer(i)
    count = 0
    for match in matches:
        count += 1
    if count == 0:
        lower_ca.append(i)

final_lst = []
final_lst.append(lower_ca)
print(final_lst)

model = Word2Vec(final_lst, size=1, window=5, min_count=1, workers=4)
word_vectors = model.wv
fname = get_tmpfile("vectors.kv")
# word_vectors.save(fname)
# print(model.predict_output_word(['dsds']))
# print(model.most_similar('issu'))
# tok=['dsds','gsd']
word_vectors = KeyedVectors.load(fname, mmap='r')
# print(KeyedVectors.most_similar_to_given(self=word_vectors, entity1=tok,entities_list=['PriSize Estimation', 'Other Tools', 'AVMCommonPortal_L2']))
print(KeyedVectors.most_similar(self=word_vectors, positive=['sender']))
Example #7
# text += print_info_length(corpus_labels, lines_corpus_splitted, "corpus docs" + conf, "words", True)
text += print_info_length(queries_labels, lines_queries_splitted,
                          "queries" + conf, "words", True)

text += '\n' + str(corpus_model)

print("done.")

w1 = "night"

outv = KeyedVectors(300)
outv.vocab = corpus_model.wv.vocab  # same
outv.index2word = corpus_model.wv.index2word  # same
outv.syn0 = corpus_model.syn1neg  # different

text += '\nIN EMBEDDINGS COMPARISON:\n' + str(
    corpus_model.wv.most_similar(positive=[corpus_model[w1]], topn=6))
print("IN-IN done.")
text += '\nOUT EMBEDDINGS COMPARISON:\n' + str(
    outv.most_similar(positive=[outv[w1]], topn=6))
print("OUT-OUT done.")
text += '\nIN-OUT EMBEDDINGS COMPARISON:\n' + str(
    corpus_model.wv.most_similar(positive=[outv[w1]], topn=6))
print("IN-OUT done.")
text += '\nOUT-IN EMBEDDINGS COMPARISON:\n' + str(
    outv.most_similar(positive=[corpus_model[w1]], topn=6))
print("OUT-IN done.")

with open("data_analysis/data_analysis" + conf + ".txt", 'w') as file:
    file.write(text)
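The snippet above uses pre-4.0 gensim attribute names (`vocab`, `index2word`, `syn0`). A sketch of the same IN/OUT construction for gensim >= 4.0, assuming `corpus_model` is a Word2Vec model trained with negative sampling:

outv = KeyedVectors(corpus_model.wv.vector_size)
outv.add_vectors(corpus_model.wv.index_to_key, corpus_model.syn1neg)  # output (context) embeddings
print(outv.most_similar(positive=[corpus_model.wv[w1]], topn=6))      # OUT-OUT comparison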
Example #8
        elif not (entity.startswith('entity:')
                  or entity.startswith('relation:')):
            wordv_entities.append(entity)
            wordv_weights.append(embedding)

print('entities:', entityv_entities[:4])
print('words:', wordv_entities[:4])

entityv = KeyedVectors(entityv_weights[0].shape[0])
entityv.add(entityv_entities, entityv_weights)

wordv = KeyedVectors(wordv_weights[0].shape[0])
wordv.add(wordv_entities, wordv_weights)
wordv.init_sims()

print(entityv.most_similar(positive=[wordv['detroit']]))

with open(args.outfile, 'w') as out_file:
    for qid, qtokens in queries.items():
        if args.el:
            if args.elremove:
                for entity in qid_entities[qid]['entities']:
                    if '<{}>'.format(entity) in entityv:
                        qtokens = list(
                            set(qtokens) -
                            set(qid_entities[qid]['surface_tokens'][entity]))
                    else:
                        print(
                            'not removing tokens for entity {} because it doesn\'t have an embedding'
                            .format(entity))
            elif args.elremoveall and qid_entities[qid]['entities']:
Example #9
File: clusters.py  Project: zxlzr/spikex
def cluster_balls(
    model: KeyedVectors,
    root: str = None,
    max_size: int = None,
    min_score: float = None,
):
    """
    Cluster a model's keys by applying a revisited Radial Ball Mapper algorithm.

    A root key should be specified when a point of interest is known;
    if no root key is given, a random one is picked from the model.

    If not otherwise specified, a `max_size` of 30 is used by default.

    If not otherwise specified, a `min_score` calculated as the mean of all best
    similarities minus a gap of 0.10 (capped at the top neighbour's score) is used by default.

    Parameters
    ----------
    model : KeyedVectors
        Word2Vec model which stores all keys and vectors.
    root : str
        Point of interest from which to start clustering balls, by default None.
    max_size : int, optional
        Maximum size of a ball in terms of number of keys, by default None.
    min_score : float, optional
        Minimum similarity threshold for starting a cluster, by default None.

    Returns
    -------
    List[List[str]]
        Clusters of keys
    """
    if root is None:
        rand_i = randrange(0, len(model.index_to_key))
        root = model.index_to_key[rand_i]
    elif root not in model:
        return
    max_size = max_size or 30
    neighs = model.most_similar(root, topn=max_size)
    if not neighs:
        return
    if min_score is None:
        mean = _get_neighs_mean_score(model, neighs)
        min_score = min(neighs[0][1], mean - 0.10)
    clusters = []
    root_cluster = {root}
    seen = {root: (root_cluster, 1)}
    for n, s in neighs:
        if n in seen:
            continue
        if s >= min_score:
            root_cluster.add(n)
            seen.setdefault(n, (root_cluster, s))
            continue
        cluster = set()
        min_sub_score = min_score + 0.10
        for nn, ss in model.most_similar(n, topn=max_size):
            if nn in seen:
                c, b = seen[nn]
                if c == root_cluster or b >= ss:
                    continue
            if ss >= min_sub_score:
                if nn in seen:
                    prev_cluster = seen[nn][0]
                    prev_cluster.remove(nn)
                cluster.add(nn)
                seen[nn] = (cluster, ss)
        cluster.add(n)
        seen.setdefault(n, (cluster, 1))
        clusters.append(cluster)
        if len(cluster) < 3:
            continue
        intruder = _get_intruder(model, cluster)
        if intruder is None:
            continue
        del seen[intruder]
        cluster.remove(intruder)
    clusters.insert(0, root_cluster)
    return clusters
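A hypothetical call; the model path and root key are illustrative, and the helpers `_get_neighs_mean_score` and `_get_intruder` referenced above also come from the original module:

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)  # assumed path
clusters = cluster_balls(model, root='apple', max_size=20)
for cluster in clusters or []:
    print(sorted(cluster))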
Example #10
class PoemsModel:
    morph_analyzer = pymorphy2.MorphAnalyzer()

    def __init__(self, poems_model_file='', w2v_file=''):
        self.w2v = KeyedVectors()

        self.poems = []  # [str, str, ...]
        self.bags = []  # [[str, str, ...], ...]
        self.vocab = {}  # {word: count, ...}
        self.matrices = []  # [np.ndarray, ...]

        self.grammar_map = grammar_map_POS_TAGS

        if w2v_file:
            self.load_w2v_model(w2v_file)
        if poems_model_file:
            self.read(poems_model_file)

    def load_w2v_model(self, file_name: str) -> None:
        print("loading w2v_model...")
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        self.w2v = KeyedVectors.load_word2vec_format(file_name,
                                                     binary=True,
                                                     encoding='utf-8')
        print("word2vec model '%s' loaded" % file_name)

    def canonize_words(self, words: list) -> list:
        stop_words = ('быть', 'мой', 'наш', 'ваш', 'их', 'его', 'её', 'их',
                      'этот', 'тот', 'где', 'который', 'либо', 'нибудь', 'нет',
                      'да')

        normalized = []
        for w in words:
            forms = self.morph_analyzer.parse(w.lower())
            try:
                form = max(forms,
                           key=lambda x: (x.score, x.methods_stack[0][2]))
            except Exception:
                form = forms[0]
                print(form)
            if not (form.tag.POS in ['PREP', 'CONJ', 'PRCL', 'NPRO', 'NUMR']
                    or 'Name' in form.tag or 'UNKN' in form.tag
                    or form.normal_form in stop_words):  # 'ADJF'
                norm_word = form.normal_form.replace("ё", "е")
                normalized.append(norm_word +
                                  self.grammar_map.get(form.tag.POS, ''))
        return normalized

    def semantic_associations(self, bag: list, topn=10) -> list:
        positive_lst = [w for w in bag if w in self.w2v.vocab]
        if len(positive_lst) > 0:
            assoc_lst = self.w2v.most_similar(positive=positive_lst, topn=topn)
            return [a[0] for a in assoc_lst]
        else:
            print('empty association for bag:', bag)
            return []

    def bag_to_matrix(self, bag: list):
        mx = []
        for i in range(len(bag)):
            try:
                mx.append(self.w2v[bag[i]])
            except KeyError:  # skip words missing from the w2v vocabulary
                pass
        return np.vstack(mx) if len(mx) > 0 else np.array([])

    @staticmethod
    def read_poems(file_name: str) -> list:
        file = open(file_name, encoding='utf-8')
        lines = file.readlines()
        poems = []
        poem = ""
        for line in lines:
            if len(line.strip()) == 0:
                if len(poem.strip()) > 0:
                    poems.append(poem.lower())
                    poem = ""
            else:
                poem += line
        return poems

    @staticmethod
    def remove_punctuation(text: str) -> str:
        return re.sub(
            r""",|\.|!|\?|;|"|@|#|%|&|\*|\\|/|:|\+|-|'|\(|\)|\[|\]""", ' ',
            text)

    def make_bags(self, texts: list) -> (list, dict):
        bags = []
        vocabulary = {}
        for txt in texts:
            bag = []  # {}
            clear_txt = self.remove_punctuation(txt)
            words = self.canonize_words(clear_txt.split())
            for w in words:
                if w not in bag:
                    bag.append(w)  # bag[w] = bag.get(w, 0) + 1
                vocabulary[w] = vocabulary.get(w, 0) + 1
            bags.append(bag)
        return bags, vocabulary

    def compile(self,
                poems_file: str = "",
                w2v_file: str = "",
                poems_reader: Callable[[str], list] = None) -> None:
        if poems_file:
            if poems_reader is None:
                poems_reader = self.read_poems
            self.poems = poems_reader(poems_file)
            print('poem count:', len(self.poems))

        print('making word bags...')
        self.bags, self.vocab = self.make_bags(self.poems)

        if w2v_file:
            self.load_w2v_model(w2v_file)
        print("model is compiled")

    def read(self, file_name: str) -> None:
        with open(file_name, mode='rb') as file:
            print('reading pickle poems model...')
            data = pickle.load(file)
            self.poems = data['poems']
            self.bags = data['bags']
            self.vocab = data['vocab']

            print("vectorizing model...")
            self.matrices = [self.bag_to_matrix(bag) for bag in self.bags]

            print('model is loaded')

    def write(self, file_name: str) -> None:
        with open(file_name, mode='wb') as file:
            data = {
                'poems': self.poems,
                'bags': self.bags,
                'vocab': self.vocab,
            }
            pickle.dump(data, file)

    def most_similar(self, positive="", negative="", topn=10) -> list:
        pos_bag = self.canonize_words(positive.split())
        neg_bag = self.canonize_words(negative.split())
        return self.w2v.most_similar(pos_bag, neg_bag,
                                     topn) if len(positive) > 0 else ()

    def semantic_levels(self,
                        base_word: str,
                        portions=[0.2, 0.3, 0.5],
                        vocab_count=100) -> (dict, np.ndarray):
        if base_word not in self.w2v.vocab:
            return {}, []
        levels = dict()
        levels[0] = {base_word}
        for level in range(1, len(portions) + 1):
            levels[level] = set()
        matrix = [self.w2v.word_vec(base_word)]

        similars = self.w2v.most_similar(base_word, topn=vocab_count)

        notch = 0
        for level, portion in enumerate(portions):
            next_notch = notch + int(vocab_count * portion)
            for i in range(notch, next_notch):
                word = similars[i][0]
                levels[level + 1].add(word)
                matrix.append(self.w2v.word_vec(word) * (1 - portions[level]))
            notch = next_notch

        return levels, np.vstack(matrix)

    @staticmethod
    @numba.jit
    def semantic_similarity_fast_log(mx1: np.ndarray,
                                     mx2: np.ndarray) -> float:
        return np.sum(np.dot(mx1, mx2.T)) * np.log10(mx2.size) / (mx2.size + mx1.size) \
               if mx1.size > 0 and mx2.size > 0 else 0.0

    @staticmethod
    @numba.jit
    def semantic_similarity_fast(mx1: np.ndarray, mx2: np.ndarray) -> float:
        return np.sum(np.dot(mx1, mx2.T)) / (mx2.size + mx1.size) \
               if mx1.size > 0 and mx2.size > 0 else 0.0

    def similar_poems_idx(self, query, topn=5) -> list:  # [(poem_idx, sim)]
        query_mx = query
        if type(query) == str:
            clear_query = self.remove_punctuation(query)
            query_bag = self.canonize_words(clear_query.split())
            query_mx = self.bag_to_matrix(query_bag)
        if len(query_mx) == 0:
            return []
        similars = [(i, self.semantic_similarity_fast_log(query_mx, mx))
                    for i, mx in enumerate(self.matrices)]
        # similars.sort(key=lambda x: x[1], reverse=True)
        return heapq.nlargest(topn, similars, key=lambda x: x[1])

    def similar_poems(self, query, topn=5) -> list:  # [(poem, sim)]
        return [(self.poems[idx], sim)
                for idx, sim in self.similar_poems_idx(query, topn)]
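A hypothetical end-to-end use of PoemsModel. Note that `compile` builds only the word bags, so the matrices used by `similar_poems` have to be built separately unless the model is loaded via `read`; the file names and the query are assumptions:

pm = PoemsModel()
pm.compile(poems_file='poems.txt', w2v_file='ruscorpora.bin')    # assumed files
pm.matrices = [pm.bag_to_matrix(bag) for bag in pm.bags]         # vectorize, as read() does
for poem, score in pm.similar_poems('весна и тишина', topn=3):
    print(round(score, 3), poem.splitlines()[0])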
Example #11
class VectorSpaceModel(object):

    """Base class for models that represent words as vectors.

    For now, this really is just a wrapper around the Gensim KeyedVectors / Word2Vec class.

    """

    def __init__(self, name=None):
        self.name = name
        self.m = KeyedVectors()
        return

    @classmethod
    def load(cls, filename, modelname=None, **kwargs):
        if filename.endswith('.pkl'):
            model = cls.load_pickle(filename, modelname=modelname, **kwargs)
        else:
            model = cls.load_w2v(filename, modelname=modelname, **kwargs)
        return model

    @classmethod
    def load_pickle(cls, filename, **kwargs):
        debug("Loading pickled model from file {:}".format(filename))
        with open(filename, 'rb') as f:
            model = pickle.load(f)
        return model

    @classmethod
    def load_w2v(cls, filename, modelname=None, **kwargs):
        """Load the model from disk."""
        debug("Loading word2vec model from file {:}".format(filename))
        if filename.endswith(".bin"):
            m = KeyedVectors.load_word2vec_format(filename, binary=True)
        else:
            m = KeyedVectors.load_word2vec_format(filename)
        model = cls()
        model.m = m
        if modelname is None:
            modelname = os.path.basename(filename)
            modelname = re.sub('.bin', '', modelname)
        model.name = modelname
        return model

    def save_pickle(self, filename):
        debug("Saving model {:} to pickle file {:}".format(self.name, filename))
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
        return

    def __getitem__(self, word):
        return self.m[word]

    def most_similar(self, query, k=5):
        """Return the most similar words to the query. `query` can be either a string or a
        vector. If it is a string, then its vector will be looked up in the current VSM.
        """
        if type(query) is str:
            results = self.m.most_similar(query, topn=k)
        else:
            results = self.m.similar_by_vector(query, topn=k)
        return results

    def __repr__(self):
        return "<VectorSpaceModel {:} with {:,} vectors>".format(repr(self.name), self.m.syn0.shape[0])
Example #12
File: eval_cli.py  Project: j5bd/q
    def get_evaluation_df(name, doc_model, hf_dataset, aspect,
                          fold) -> Tuple[DataFrame, Dict]:
        # Init dataframe
        metrics = [
            'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
            'precision', 'recall', 'avg_p', 'reciprocal_rank', 'ndcg'
        ]
        df = pd.DataFrame([],
                          columns=['name', 'aspect', 'fold', 'top_k'] +
                          metrics)

        # Dataset
        test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                               name='relations',
                               cache_dir='./data/nlp_cache',
                               split=get_test_split(aspect, fold))

        logger.info(f'Test samples: {len(test_ds):,}')

        # Unique paper IDs in test set
        test_paper_ids = set(test_ds['from_paper_id']).union(
            set(test_ds['to_paper_id']))

        logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
        logger.info(f'Examples: {list(test_paper_ids)[:10]}')

        # Relevance mapping
        doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
        for row in test_ds:
            if row['label'] == 'y':
                a = row['from_paper_id']
                b = row['to_paper_id']
                doc_id2related_ids[a].add(b)
                doc_id2related_ids[b].add(a)

        # Filter for documents in test set
        test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
        test_doc_ids = []
        test_doc_vectors = []
        missed_doc_ids = 0

        for doc_id in doc_model.vocab:
            if doc_id in test_paper_ids:
                vec = doc_model.get_vector(doc_id)
                if len(vec) != doc_model.vector_size:
                    raise ValueError(
                        f'Test document has invalid shape: {doc_id} => {vec.shape}'
                    )

                test_doc_ids.append(doc_id)
                test_doc_vectors.append(vec)
            else:
                missed_doc_ids += 1
                # logger.warning(f'Document ID is not part of test set: {doc_id} ({type(doc_id)})')

        if len(test_doc_ids) != len(test_doc_vectors):
            raise ValueError(
                f'Test document ID count does not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}'
            )

        logger.info(
            f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})'
        )
        logger.info(f'Test document vectors: {len(test_doc_vectors)}')

        test_doc_model.add(test_doc_ids, test_doc_vectors)
        test_doc_model.init_sims(replace=True)

        logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

        # Actual evaluation
        # k2eval_rows = defaultdict(list)
        seed_ids_without_recommendations = []
        max_top_k = max(top_ks)
        eval_rows = {top_k: defaultdict(list)
                     for top_k in top_ks
                     }  # top_k => metric_name => list of value

        seed_id2ret_docs = {}

        for seed_id in tqdm(
                test_paper_ids,
                desc=f'Evaluation ({name},aspect={aspect},fold={fold})'):
            try:
                rel_docs = doc_id2related_ids[seed_id]
                max_ret_docs = [
                    d
                    for d, score in test_doc_model.most_similar(seed_id,
                                                                topn=max_top_k)
                ]
                seed_id2ret_docs[seed_id] = max_ret_docs

                for top_k in top_ks:
                    ret_docs = max_ret_docs[:top_k]
                    rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                    if ret_docs and rel_docs:
                        # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                        precision = rel_ret_docs_count / len(ret_docs)

                        # Recall = No. of relevant documents retrieved / No. of total relevant documents
                        recall = rel_ret_docs_count / len(rel_docs)

                        # Avg. precision (for MAP)
                        avg_p = get_avg_precision(ret_docs, rel_docs)

                        # Reciprocal rank (for MRR)
                        reciprocal_rank = get_reciprocal_rank(
                            ret_docs, rel_docs)

                        # # NDCG@k
                        predicted_relevance = [
                            1 if ret_doc_id in rel_docs else 0
                            for ret_doc_id in ret_docs
                        ]
                        true_relevances = [1] * len(rel_docs)
                        ndcg_value = compute_dcg_at_k(
                            predicted_relevance, top_k) / compute_dcg_at_k(
                                true_relevances, top_k)

                        # Save metrics
                        eval_rows[top_k]['retrieved_docs'].append(
                            len(ret_docs))
                        eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                        eval_rows[top_k]['relevant_retrieved_docs'].append(
                            rel_ret_docs_count)
                        eval_rows[top_k]['precision'].append(precision)
                        eval_rows[top_k]['recall'].append(recall)
                        eval_rows[top_k]['avg_p'].append(avg_p)
                        eval_rows[top_k]['reciprocal_rank'].append(
                            reciprocal_rank)
                        eval_rows[top_k]['ndcg'].append(ndcg_value)

            except (IndexError, ValueError, KeyError) as e:
                seed_ids_without_recommendations.append(seed_id)

                logger.warning(
                    f'Cannot retrieve recommendations for #{seed_id}: {e}')

        logger.info(
            f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
        )

        # Summarize evaluation
        for top_k in top_ks:
            try:
                row = [name, aspect, fold, top_k]
                for metric in metrics:
                    # mean over all metrics
                    values = eval_rows[top_k][metric]
                    if len(values) > 0:
                        row.append(np.mean(values))
                    else:
                        row.append(None)

                df.loc[len(df)] = row

            except ValueError as e:
                logger.error(
                    f'Cannot summarize row: {top_k} {fold} {metrics} {e}')

        return df, seed_id2ret_docs
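Examples #12-#14 call `get_avg_precision`, `get_reciprocal_rank` and `compute_dcg_at_k` without defining them. Based on the metric descriptions in the inline comments, a plausible minimal sketch of these helpers is:

import math

def get_avg_precision(ret_docs, rel_docs):
    # Average precision: mean of precision@i over the ranks i at which a relevant doc was retrieved.
    hits, precisions = 0, []
    for i, doc_id in enumerate(ret_docs, start=1):
        if doc_id in rel_docs:
            hits += 1
            precisions.append(hits / i)
    return sum(precisions) / len(precisions) if precisions else 0.0

def get_reciprocal_rank(ret_docs, rel_docs):
    # Reciprocal rank: 1 / rank of the first relevant retrieved document, 0 if none.
    for i, doc_id in enumerate(ret_docs, start=1):
        if doc_id in rel_docs:
            return 1.0 / i
    return 0.0

def compute_dcg_at_k(relevances, k):
    # Binary-relevance DCG@k: sum of rel_i / log2(i + 1) over the first k positions.
    return sum(rel / math.log2(i + 1) for i, rel in enumerate(relevances[:k], start=1))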
Example #13
File: eval_cli.py  Project: j5bd/q
def evaluate_vectors(hf_dataset: str, aspect: str, input_path: str, name: str,
                     folds: Union[str, list], top_ks: Union[str, list],
                     output_path: str):
    """

    Run with: $ ./eval_cli.py evaluate_vectors paperswithcode_aspects task ./output/pwc_doc_id2st.txt --name=sentence_transformers --folds=1,2,3,4 --top_ks=5,10,25,50 --output_path=./output/eval.csv

    :param aspect:
    :param folds:
    :param top_ks:
    :param name:
    :param hf_dataset:
    :param input_path:
    :param output_path:
    :return:
    """

    if isinstance(folds, str):
        folds = folds.split(',')
    elif isinstance(folds, int):
        folds = [folds]

    if isinstance(top_ks, str):
        top_ks = [int(k) for k in top_ks.split(',')]
    elif isinstance(top_ks, int):
        top_ks = [top_ks]

    logger.info(f'Folds: {folds}')
    logger.info(f'Top-Ks: {top_ks}')

    if len(folds) < 1:
        logger.error('No folds provided')
        return

    if len(top_ks) < 1:
        logger.error('No top-k values provided')
        return

    # Load documents
    doc_model = KeyedVectors.load_word2vec_format(input_path)
    logger.info(f'Document vectors: {doc_model.vectors.shape}')

    # Normalize vectors
    doc_model.init_sims(replace=True)

    # Init dataframe
    metrics = [
        'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        'precision', 'recall', 'avg_p', 'reciprocal_rank'
    ]
    df = pd.DataFrame([], columns=['name', 'fold', 'top_k'] + metrics)

    # Iterate over folds
    for fold in folds:
        logger.info(f'Current fold: {fold}')

        # Dataset
        test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                               name='relations',
                               cache_dir='./data/nlp_cache',
                               split=get_test_split(aspect, fold))

        logger.info(f'Test samples: {len(test_ds):,}')

        # Unique paper IDs in test set
        test_paper_ids = set(test_ds['from_paper_id']).union(
            set(test_ds['to_paper_id']))

        logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
        logger.info(f'Examples: {list(test_paper_ids)[:10]}')

        # Relevance mapping
        doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
        for row in test_ds:
            if row['label'] == 'y':
                a = row['from_paper_id']
                b = row['to_paper_id']
                doc_id2related_ids[a].add(b)
                doc_id2related_ids[b].add(a)

        # Filter for documents in test set
        test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
        test_doc_ids = []
        test_doc_vectors = []
        missed_doc_ids = 0

        for doc_id in doc_model.vocab:
            if doc_id in test_paper_ids:
                vec = doc_model.get_vector(doc_id)
                if len(vec) != doc_model.vector_size:
                    raise ValueError(
                        f'Test document has invalid shape: {doc_id} => {vec.shape}'
                    )

                test_doc_ids.append(doc_id)
                test_doc_vectors.append(vec)
            else:
                missed_doc_ids += 1
                # logger.warning(f'Document ID is not part of test set: {doc_id} ({type(doc_id)})')

        if len(test_doc_ids) != len(test_doc_vectors):
            raise ValueError(
                f'Test document ID count does not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}'
            )

        logger.info(
            f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})'
        )
        logger.info(f'Test document vectors: {len(test_doc_vectors)}')

        test_doc_model.add(test_doc_ids, test_doc_vectors)
        test_doc_model.init_sims(replace=True)

        logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

        # Actual evaluation
        # k2eval_rows = defaultdict(list)
        seed_ids_without_recommendations = []
        max_top_k = max(top_ks)
        eval_rows = {top_k: defaultdict(list)
                     for top_k in top_ks
                     }  # top_k => metric_name => list of value

        for seed_id in tqdm(test_paper_ids, desc=f'Evaluation (fold={fold})'):
            try:
                rel_docs = doc_id2related_ids[seed_id]
                max_ret_docs = [
                    d
                    for d, score in test_doc_model.most_similar(seed_id,
                                                                topn=max_top_k)
                ]
                for top_k in top_ks:
                    ret_docs = max_ret_docs[:top_k]
                    rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                    if ret_docs and rel_docs:
                        # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                        precision = rel_ret_docs_count / len(ret_docs)

                        # Recall = No. of relevant documents retrieved / No. of total relevant documents
                        recall = rel_ret_docs_count / len(rel_docs)

                        # Avg. precision (for MAP)
                        avg_p = get_avg_precision(ret_docs, rel_docs)

                        # Reciprocal rank (for MRR)
                        reciprocal_rank = get_reciprocal_rank(
                            ret_docs, rel_docs)

                        # # NDCG@k
                        # predicted_relevance = [1 if ret_doc_id in rel_docs else 0 for ret_doc_id in ret_docs]
                        # true_relevances = [1] * len(rel_docs)
                        # ndcg_value = self.compute_dcg_at_k(predicted_relevance, top_k) / self.compute_dcg_at_k(true_relevances, top_k)

                        # Save metrics
                        eval_rows[top_k]['retrieved_docs'].append(
                            len(ret_docs))
                        eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                        eval_rows[top_k]['relevant_retrieved_docs'].append(
                            rel_ret_docs_count)
                        eval_rows[top_k]['precision'].append(precision)
                        eval_rows[top_k]['recall'].append(recall)
                        eval_rows[top_k]['avg_p'].append(avg_p)
                        eval_rows[top_k]['reciprocal_rank'].append(
                            reciprocal_rank)

            except (IndexError, ValueError, KeyError) as e:
                seed_ids_without_recommendations.append(seed_id)

                logger.warning(
                    f'Cannot retrieve recommendations for #{seed_id}: {e}')

        logger.info(
            f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
        )

        # Summarize evaluation
        for top_k in top_ks:
            try:
                row = [name, fold, top_k]
                for metric in metrics:
                    # mean over all metrics
                    values = eval_rows[top_k][metric]
                    if len(values) > 0:
                        row.append(np.mean(values))
                    else:
                        row.append(None)

                df.loc[len(df)] = row

            except ValueError as e:
                logger.error(
                    f'Cannot summarize row: {top_k} {fold} {metrics} {e}')

            #
            #
            # df = pd.DataFrame(k2eval_rows[top_k],
            #                   columns=['seed_id', 'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
            #                            'precision', 'recall', 'avg_p', 'reciprocal_rank'])
            #
            # print(df.mean())
            #
            # print(df.mean().to_frame().transpose().iloc[0])

    logger.info(f'Writing {len(df)} rows to {output_path}')

    if os.path.exists(output_path):
        # Append new rows to evaluation file
        df.to_csv(output_path, mode='a', header=False, index=False)
    else:
        # Write new files
        df.to_csv(output_path, header=True, index=False)

    logger.info('Done')
Example #14
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        """
        This is called during training to evaluate the model.
        It returns a score for the evaluation with a higher score indicating a better result.

        :param model:
            the model to evaluate
        :param output_path:
            path where predictions and metrics are written to
        :param epoch:
            the epoch where the evaluation takes place.
            This is used for the file prefixes.
            If this is -1, then we assume evaluation on test data.
        :param steps:
            the steps in the current epoch at time of the evaluation.
            This is used for the file prefixes.
            If this is -1, then we assume evaluation at the end of the epoch.
        :return: a score for the evaluation with a higher score indicating a better result
        """
        # idx2paper_id = {}
        # paper_id2idx = {}
        # texts = []
        # paper_ids = []
        #
        # # get document texts
        # for idx, paper_id in enumerate(self.test_paper_ids):
        #     idx2paper_id[idx] = paper_id
        #     paper_id2idx[paper_id] = idx
        #
        #     doc = self.doc_id2doc[paper_id]
        #     texts.append(doc['title'] + ': ' + doc['abstract'])
        #     paper_ids.append(paper_id)

        logger.info('Encode test documents...')
        embeddings = model.encode(self.tokenized_texts,
                                  is_pretokenized=True,
                                  batch_size=self.batch_size,
                                  show_progress_bar=self.show_progress_bar,
                                  convert_to_numpy=True)

        # Filter for documents in test set
        test_doc_model = KeyedVectors(
            vector_size=model.get_sentence_embedding_dimension())

        #for idx, embedding in enumerate(embeddings):
        #    test_doc_model.add([idx2paper_id[idx]], [embedding])
        test_doc_model.add(self.paper_ids, embeddings.tolist())

        test_doc_model.init_sims(replace=True)
        logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

        # Init dataframe
        metrics = [
            'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
            'precision', 'recall', 'avg_p', 'reciprocal_rank', 'ndcg'
        ]
        df = pd.DataFrame([], columns=['epoch', 'steps', 'top_k'] + metrics)

        max_top_k = max(self.top_ks)
        eval_rows = {top_k: defaultdict(list)
                     for top_k in self.top_ks
                     }  # top_k => metric_name => list of value
        seed_ids_without_recommendations = []

        for seed_id in tqdm(self.test_paper_ids, desc=f'Evaluation'):
            try:
                rel_docs = self.doc_id2related_ids[seed_id]
                max_ret_docs = [
                    d
                    for d, score in test_doc_model.most_similar(seed_id,
                                                                topn=max_top_k)
                ]

                for top_k in self.top_ks:
                    ret_docs = max_ret_docs[:top_k]
                    rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                    if ret_docs and rel_docs:
                        # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                        precision = rel_ret_docs_count / len(ret_docs)

                        # Recall = No. of relevant documents retrieved / No. of total relevant documents
                        recall = rel_ret_docs_count / len(rel_docs)

                        # Avg. precision (for MAP)
                        avg_p = get_avg_precision(ret_docs, rel_docs)

                        # Reciprocal rank (for MRR)
                        reciprocal_rank = get_reciprocal_rank(
                            ret_docs, rel_docs)

                        # NDCG@k
                        predicted_relevance = [
                            1 if ret_doc_id in rel_docs else 0
                            for ret_doc_id in ret_docs
                        ]
                        true_relevances = [1] * len(rel_docs)
                        ndcg_value = self.compute_dcg_at_k(
                            predicted_relevance,
                            top_k) / self.compute_dcg_at_k(
                                true_relevances, top_k)

                        # Save metrics
                        eval_rows[top_k]['retrieved_docs'].append(
                            len(ret_docs))
                        eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                        eval_rows[top_k]['relevant_retrieved_docs'].append(
                            rel_ret_docs_count)
                        eval_rows[top_k]['precision'].append(precision)
                        eval_rows[top_k]['recall'].append(recall)
                        eval_rows[top_k]['avg_p'].append(avg_p)
                        eval_rows[top_k]['reciprocal_rank'].append(
                            reciprocal_rank)
                        eval_rows[top_k]['ndcg'].append(ndcg_value)

            except (IndexError, ValueError, KeyError) as e:
                seed_ids_without_recommendations.append(seed_id)
                logger.warning(
                    f'Cannot retrieve recommendations for #{seed_id}: {e}')

        logger.info(
            f'Completed with {len(eval_rows[self.top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
        )

        # Summarize evaluation
        for top_k in self.top_ks:
            try:
                row = [epoch, steps, top_k]
                for metric in metrics:
                    # mean over all metrics
                    values = eval_rows[top_k][metric]
                    if len(values) > 0:
                        row.append(np.mean(values))
                    else:
                        row.append(None)

                df.loc[len(df)] = row

            except ValueError as e:
                logger.error(f'Cannot summarize row: {top_k} {metrics} {e}')

        output_csv_path = os.path.join(output_path, self.csv_file)

        logger.info(f'Writing {len(df)} rows to {output_csv_path}')
        logger.info(f'Results:\n{df.to_markdown()}')

        if os.path.exists(output_csv_path):
            # Append new rows to evaluation file
            df.to_csv(output_csv_path, mode='a', header=False, index=False)
        else:
            # Write new files
            df.to_csv(output_csv_path, header=True, index=False)

        # Return score from main metric
        if len(df) > 0:
            main_score = df.iloc[0][self.main_metric]
            logger.info(
                f'Evaluation completed: {self.main_metric} = {main_score}')
            return main_score
        else:
            logger.warning('No evaluation rows available... score = 0')
            return 0