Exemplo n.º 1
0
 def _from_kvs_to_wvs(self, kvs: KeyedVectors) -> WordVectors:
     """Convert a ``KeyedVectors`` object into a ``WordVectors`` object.

     Every key of ``kvs.vocab`` is looked up with ``kvs.get_vector`` and the
     result is copied into a new NumPy array.

     :param kvs: Source vectors; must expose ``name``, ``vector_size``,
         ``vocab`` and ``get_vector``.
     :return: ``WordVectors`` built from the source name, the vector size
         and a word -> array mapping.
     """
     name, size = kvs.name, kvs.vector_size
     vectors_by_word = {}
     for token in kvs.vocab.keys():
         vectors_by_word[token] = np.array(kvs.get_vector(token))
     return WordVectors(name, size, vectors_by_word)
Exemplo n.º 2
0
    def texts_to_vectors(self, wordvectors: KeyedVectors, descriptor_name: str, use_idf=False):
        """
        Map each document's words to vectors contained in wordvectors
        and then calculate a sentence vector by averaging all the vectors of the document.

        Documents are the strings in ``self.x``; each is tokenized by splitting on a
        single space. Words without a vector in ``wordvectors`` are skipped.
        With ``use_idf`` the mean is weighted by each word's IDF; ``self.dct`` and
        ``self.tfidf`` are built from ``self.x`` on first use if they are still ``None``.

        :param wordvectors: KeyedVector object with the loaded vectors
        :param descriptor_name: Name of the descriptor (not used in the computation;
            kept so existing callers keep working)
        :param use_idf: If true compute an IDF-weighted mean, otherwise a simple mean
        :return: List with one vector per document of ``self.x``, in the same order.
            A document with no usable word (or a total weight of zero) yields a
            zero vector of length ``wordvectors.vector_size``.
        """
        import numpy as np  # local import: only needed for the zero-vector fallback

        if use_idf:
            if self.dct is None:
                self.dct = Dictionary([x.split(" ") for x in self.x])
            if self.tfidf is None:
                self.tfidf = TfidfModel([self.dct.doc2bow(document.split(" ")) for document in self.x])

        new_x = []
        vectorized_counter = 0
        not_vectorized_counter = 0
        empty_documents = 0
        for document in self.x:
            document_vector_accum = None
            weight_accum = 0
            for word in document.split(" "):
                try:
                    vector = wordvectors.get_vector(word)
                except KeyError:
                    # Word has no embedding: count it and move on.
                    not_vectorized_counter += 1
                    continue

                if use_idf:
                    try:
                        idf = self.tfidf.idfs[self.dct.token2id[word]]
                    except KeyError:
                        print("warning: idf not found for {}".format(word))
                        continue
                else:
                    idf = 1  # simple mean

                # `vector * idf` allocates a new array, so the in-place add below
                # never touches the array owned by `wordvectors`.
                if document_vector_accum is None:
                    document_vector_accum = vector * idf
                else:
                    document_vector_accum += vector * idf
                weight_accum += idf
                vectorized_counter += 1

            if document_vector_accum is None or weight_accum == 0:
                # Nothing to average (previously `None / 0` raised TypeError, or a
                # zero total weight produced NaNs). Emit a zero vector so the output
                # stays aligned one-to-one with self.x.
                empty_documents += 1
                new_x.append(np.zeros(wordvectors.vector_size))
            else:
                new_x.append(document_vector_accum / weight_accum)

        if empty_documents:
            print("warning: {} document(s) had no usable word vectors; zero vectors used".format(empty_documents))
        print("info: done converting. vectorized {}; skipped {}".format(vectorized_counter, not_vectorized_counter))
        return new_x
Exemplo n.º 3
0
    def test_load_multi(self):
        """Concatenate the vectors of several models into one ``KeyedVectors``.

        Loads two word2vec-format files from ``TEST_DIR``, then for every key of the
        first model stacks the vectors of ``[wv, wv, wv, glove]`` side by side (zeros
        where a model lacks the key) and adds the result to a new model. Finally
        checks the dimensionality of a combined vector.
        """
        in_w2v_fp = TEST_DIR / 'doc2vec_w2v.txt'

        wv = KeyedVectors.load_word2vec_format(in_w2v_fp)
        glove = KeyedVectors.load_word2vec_format(
            TEST_DIR / 'glove.6B.200d.w2vformat.1k.txt')

        models = [wv, wv, wv, glove]

        # Builtin sum keeps this a plain int (np.sum would return a NumPy scalar).
        target_vector_size = sum(m.vector_size for m in models)

        self.assertEqual(models[0].vector_size * 3 + 200, target_vector_size)

        # Build new keyed vector model
        model = KeyedVectors(vector_size=target_vector_size)

        # Iterate over all words (in first model)
        for doc_id in models[0].index2word:
            # Stack vectors from all models
            models_vec = []

            for m in models:
                # Containment on the model itself rather than on the index2word list,
                # which would be scanned linearly for every key and every model.
                if doc_id in m:
                    models_vec.append(m.get_vector(doc_id))
                else:
                    print(f'WARNING: {doc_id} does not exist in {m}')
                    models_vec.append(np.zeros(m.vector_size))

            vec = np.hstack(models_vec)

            model.add(doc_id, vec)

        self.assertEqual(300 + 200, model.get_vector('0').shape[0])
0
def check_embedding_coverage(vocabulary: Dict[str, int],
                             keyed_vectors: KeyedVectors):
    """See what words from the vocabulary are not represented in the word vectors.

    Output information about the OOV (out of vocabulary) terms: prints the share
    of distinct words and the share of all word occurrences that have a vector.

    :param vocabulary: Dictionary with words as keys and frequencies of the words in the corpus as values.
        Any other iterable of words is also accepted; occurrences are then counted.
    :param keyed_vectors: gensim.model.KeyedVectors instance containing the word vectors.
    :return: List of ``(word, frequency)`` tuples for the OOV words, sorted by
        descending frequency.
    """
    if not isinstance(vocabulary, dict):
        # The previous non-dict fallback added the word string to an int and
        # always raised TypeError; count occurrences instead.
        from collections import Counter
        vocabulary = Counter(vocabulary)

    oov = {}  # Out of vocabulary dictionary: word -> frequency
    covered_vocab = 0  # Number of distinct words that have a vector
    covered_words = 0  # Number of word occurrences that have a vector
    oov_words = 0  # Number of word occurrences without a vector

    for word in tqdm(vocabulary, desc="Words checked"):
        frequency = vocabulary[word]
        try:
            keyed_vectors.get_vector(word)
        except KeyError:
            oov[word] = frequency
            oov_words += frequency
        else:
            # Only the count is needed, so the vector itself is not kept.
            covered_vocab += 1
            covered_words += frequency

    vocab_size = len(vocabulary)
    total_words = covered_words + oov_words
    # Guard the ratios so an empty vocabulary reports 0% instead of raising.
    found_vocab_vectors = covered_vocab / vocab_size if vocab_size else 0.0
    found_vocab_all_text = covered_words / total_words if total_words else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(found_vocab_vectors))
    print('Found embeddings for  {:.2%} of all text'.format(
        found_vocab_all_text))
    # Ascending sort then reversal keeps the original tie ordering.
    sorted_oov = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_oov