Example #1
    def compact_word_vectors(self,
                             vocab,
                             filename=None,
                             array=None,
                             top=20000):
        """
        Retrieve pretrained word vectors for our vocabulary.
        The returned word array has row indices corresponding to the
        compact index of a word, and columns corresponding to the word
        vector.

        Examples
        --------
        >>> import numpy.linalg as nl
        >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
        >>> word_indices = np.zeros(50).astype('int32')
        >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
        >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
        >>> word_indices[40:46] = 7  # 'cold' appears 6 times
        >>> word_indices[46:] = 3  # 'hot' appears 4 times
        >>> corpus = Corpus()
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> v, s, f = corpus.compact_word_vectors(vocab)
        >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
        >>> vocab[corpus.compact_to_loose[2]]
        'shuttle'
        >>> vocab[corpus.compact_to_loose[3]]
        'astronomy'
        >>> vocab[corpus.compact_to_loose[4]]
        'cold'
        >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
        >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
        >>> sim_shuttle_astro > sim_shuttle_cold
        True

        :param vocab: (dict) Dictionary where keys are the loose index, and values are
            the word string.
        :param filename: (str) Filename of word2vec-format vectors, loaded via gensim.
        :param array: (ndarray[float]) Optional preallocated array filled in
            place of the randomly initialized one.
        :param top: (int) Only the first `top` compact indices are looked up.
        :return:
            data: (ndarray[float]) Array such that data[compact_index, :] = word_vector
            s: (int) Count of words matched to a pretrained vector.
            f: (int) Count of words for which no vector was found.
        """
        n_words = len(self.compact_to_loose)

        # Old gensim (< 1.0) API; newer releases expose this loader as
        # gensim.models.KeyedVectors.load_word2vec_format.
        from gensim.models.word2vec import Word2Vec

        model = Word2Vec.load_word2vec_format(filename, binary=True)
        n_dim = model.syn0.shape[1]
        # Start from random vectors whose mean and standard deviation match
        # the pretrained embeddings, so words without a pretrained vector
        # still look statistically typical.
        data = np.random.normal(size=(n_words, n_dim)).astype('float32')
        data -= data.mean()
        data += model.syn0.mean()
        data /= data.std()
        data *= model.syn0.std()
        if array is not None:
            data = array
            # n_words = data.shape[0]

        keys_raw = list(model.vocab.keys())
        keys = [s.encode('ascii', 'ignore') for s in keys_raw]
        lens = [len(s) for s in keys_raw]
        choices = np.array(keys, dtype='S')
        lengths = np.array(lens, dtype='int32')
        s, f = 0, 0
        # Look the word up as-is, with spaces replaced by underscores, and
        # title-cased with underscores (common word2vec phrase spellings).
        rep0 = lambda w: w
        rep1 = lambda w: w.replace(' ', '_')
        rep2 = lambda w: w.title().replace(' ', '_')
        reps = [rep0, rep1, rep2]
        for compact in np.arange(top):
            loose = self.compact_to_loose.get(compact, None)
            if loose is None:
                continue

            word = vocab.get(loose, None)
            if word is None:
                continue

            word = word.strip()
            vector = None
            for rep in reps:
                clean = rep(word)
                if clean in model.vocab:
                    vector = model[clean]
                    break

            if vector is None:
                # Fall back to the nearest word2vec key by Damerau-Levenshtein
                # distance, searching only keys within three characters of the
                # same length.
                try:
                    word = str(word)
                    idx = lengths >= len(word) - 3
                    idx &= lengths <= len(word) + 3
                    sel = choices[idx]
                    d = damerau_levenshtein_distance_withNPArray(word, sel)
                    choice = np.array(keys_raw)[idx][np.argmin(d)]
                    vector = model[choice]
                    print(compact, word, ' --> ', choice)
                except IndexError:
                    pass

            if vector is None:
                f += 1
                continue

            s += 1
            data[compact, :] = vector[:]

        return data, s, f
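
A minimal usage sketch for the method above, assuming `Corpus` is the class it belongs to, gensim and pyxdameraulevenshtein are installed, and `vocab` and `word_indices` are built as in the doctest; the vector filename is a placeholder, not a real path.

corpus = Corpus()
corpus.update_word_count(word_indices)
corpus.finalize()
# 'vectors.bin' stands in for a word2vec-format binary file.
data, n_matched, n_missed = corpus.compact_word_vectors(
    vocab, filename='vectors.bin')
print('matched %d words, no vector found for %d' % (n_matched, n_missed))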
Example #2
    def compact_word_vectors(self, vocab, filename=None, array=None,
                             top=20000):
        """ Retrieve pretrained word spectors for our vocabulary.
        The returned word array has row indices corresponding to the
        compact index of a word, and columns correponding to the word
        vector.

        Arguments
        ---------
        vocab : dict
            Dictionary where keys are the loose index, and values are
            the word string.

        filename : str
            Filename of word2vec-format vectors, loaded via gensim.

        array : numpy float array, optional
            Preallocated array filled in place of the randomly
            initialized one.

        top : int
            Only the first `top` compact indices are looked up.

        Returns
        -------
        data : numpy float array
            Array such that data[compact_index, :] = word_vector
        s : int
            Count of words matched to a pretrained vector.
        f : int
            Count of words for which no vector was found.

        Examples
        --------
        >>> import numpy.linalg as nl
        >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
        >>> word_indices = np.zeros(50).astype('int32')
        >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
        >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
        >>> word_indices[40:46] = 7  # 'cold' appears 6 times
        >>> word_indices[46:] = 3  # 'hot' appears 4 times
        >>> corpus = Corpus()
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> v, s, f = corpus.compact_word_vectors(vocab)
        >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
        >>> vocab[corpus.compact_to_loose[2]]
        'shuttle'
        >>> vocab[corpus.compact_to_loose[3]]
        'astronomy'
        >>> vocab[corpus.compact_to_loose[4]]
        'cold'
        >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
        >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
        >>> sim_shuttle_astro > sim_shuttle_cold
        True
        """
        n_words = len(self.compact_to_loose)
        from gensim.models.word2vec import Word2Vec
        model = Word2Vec.load_word2vec_format(filename, binary=True)
        n_dim = model.syn0.shape[1]
        data = np.random.normal(size=(n_words, n_dim)).astype('float32')
        data -= data.mean()
        data += model.syn0.mean()
        data /= data.std()
        data *= model.syn0.std()
        if array is not None:
            data = array
            n_words = data.shape[0]
        keys_raw = model.vocab.keys()
        keys = [s.encode('ascii', 'ignore') for s in keys_raw]
        lens = [len(s) for s in model.vocab.keys()]
        choices = np.array(keys, dtype='S')
        lengths = np.array(lens, dtype='int32')
        s, f = 0, 0
        rep0 = lambda w: w
        rep1 = lambda w: w.replace(' ', '_')
        rep2 = lambda w: w.title().replace(' ', '_')
        reps = [rep0, rep1, rep2]
        for compact in np.arange(top):
            loose = self.compact_to_loose.get(compact, None)
            if loose is None:
                continue
            word = vocab.get(loose, None)
            if word is None:
                continue
            word = word.strip()
            vector = None
            for rep in reps:
                clean = rep(word)
                if clean in model.vocab:
                    vector = model[clean]
                    break
            if vector is None:
                try:
                    word = unicode(word)
                    idx = lengths >= len(word) - 3
                    idx &= lengths <= len(word) + 3
                    sel = choices[idx]
                    d = damerau_levenshtein_distance_withNPArray(word, sel)
                    choice = np.array(keys_raw)[idx][np.argmin(d)]
                    # choice = difflib.get_close_matches(word, choices)[0]
                    vector = model[choice]
                    print compact, word, ' --> ', choice
                except IndexError:
                    pass
            if vector is None:
                f += 1
                continue
            s += 1
            data[compact, :] = vector[:]
        return data, s, f
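
Both examples share the same fallback step: when a vocabulary word has no exact, underscored, or title-cased match in the word2vec vocabulary, keys within three characters of the same length are ranked by Damerau-Levenshtein distance and the closest key's vector is used. A standalone sketch of that step, assuming pyxdameraulevenshtein is installed (`closest_key` is a hypothetical helper name, not part of either example):

import numpy as np
from pyxdameraulevenshtein import damerau_levenshtein_distance

def closest_key(word, keys, slack=3):
    """Return the key nearest to `word` by Damerau-Levenshtein distance."""
    # Restrict candidates to keys of roughly the same length.
    candidates = [k for k in keys if abs(len(k) - len(word)) <= slack]
    if not candidates:
        return None
    dists = [damerau_levenshtein_distance(word, k) for k in candidates]
    return candidates[int(np.argmin(dists))]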
Example #3
# Demo script for pyxdameraulevenshtein; the *_withNPArray helpers below
# existed in older releases of the package.
import time

import numpy as np
from pyxdameraulevenshtein import (
    damerau_levenshtein_distance,
    normalized_damerau_levenshtein_distance,
    damerau_levenshtein_distance_withNPArray,
    normalized_damerau_levenshtein_distance_withNPArray)

# generateWord() is assumed to be defined elsewhere in the original script;
# it returns a random word for benchmarking.

print('#normalized edit distances (low ratio means words are similar):')
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('smtih', 'smith', normalized_damerau_levenshtein_distance('smtih', 'smith')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('snapple', 'apple', normalized_damerau_levenshtein_distance('snapple', 'apple')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('testing', 'testtn', normalized_damerau_levenshtein_distance('testing', 'testtn')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('saturday', 'sunday', normalized_damerau_levenshtein_distance('saturday', 'sunday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('Saturday', 'saturday', normalized_damerau_levenshtein_distance('Saturday', 'saturday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('orange', 'pumpkin', normalized_damerau_levenshtein_distance('orange', 'pumpkin')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f #unicode example\n" % ('Sjöstedt', 'Sjostedt', normalized_damerau_levenshtein_distance('Sjöstedt', 'Sjostedt')))  # unicode example

#
print('#distance from a reference to an array:')
l_arrayLength = 100000
myArray = np.array([generateWord() for i in range(l_arrayLength)], dtype='S')
myRef = generateWord()
startV = time.time()
myRes = damerau_levenshtein_distance_withNPArray(myRef, myArray)
endV = time.time()
startR = time.time()
myExpected = [damerau_levenshtein_distance(myRef, w) for w in myArray]
endR = time.time()
assert len(myRes) == l_arrayLength
assert (myRes == myExpected).all()
print("Source \"%s\" against Array[%d]" % (myRef, len(myArray)))
print("Array computation took %f s versus %f s" % (endV - startV, endR - startR))
#
print("")
print('#normalized distance from a reference to an array:')
myRes = normalized_damerau_levenshtein_distance_withNPArray(myRef, myArray)
myExpected = [normalized_damerau_levenshtein_distance(myRef, w) for w in myArray]
assert len(myRes) == l_arrayLength
assert (myRes == myExpected).all()
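
The normalized variant maps the raw edit distance into [0.0, 1.0]; assuming the library divides by the length of the longer string, a quick consistency check (reusing the imports at the top of the script) looks like this:

a, b = 'saturday', 'sunday'
raw = damerau_levenshtein_distance(a, b)
norm = normalized_damerau_levenshtein_distance(a, b)
# The two values should agree if normalization divides by the longer length.
print(raw / float(max(len(a), len(b))), norm)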