예제 #1
0
    def init_ngrams(self, update=False):
        """Compute the ngrams of every vocabulary word and set up the vocab/ngram matrices.

        Each ngram is hashed with `ft_hash` and reduced modulo `self.bucket`;
        ngrams that land in the same bucket share a single row of
        `self.wv.syn0_ngrams`. `self.wv.ngrams` maps an ngram to its row and
        `self.wv.hash2index` maps a bucket id to its row.

        Parameters
        ----------
        update : bool
            If False, all structures are rebuilt from scratch: the matrices
            are allocated with `empty`/`ones`, reduced to the rows of the
            buckets actually used, and `self.reset_ngram_weights()` is called.
            If True, rows for vocabulary words and ngram buckets that were not
            present before are drawn uniformly from
            ``(-1 / vector_size, 1 / vector_size)`` and appended to the
            existing matrices. This branch reads `self.old_vocab_len` and
            `self.old_hash2index_len`, which are not set here and must be
            provided by the caller.

        """
        if not update:
            self.wv.ngrams = {}
            self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size),
                                       dtype=REAL)
            self.syn0_vocab_lockf = ones(
                (len(self.wv.vocab), self.vector_size), dtype=REAL)

            self.wv.syn0_ngrams = empty((self.bucket, self.vector_size),
                                        dtype=REAL)
            self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size),
                                          dtype=REAL)

            # Collect the distinct ngrams directly in a set instead of
            # building a list with duplicates first.
            all_ngrams = set()
            for w in self.wv.vocab:
                self.wv.ngrams_word[w] = compute_ngrams(
                    w, self.min_n, self.max_n)
                all_ngrams.update(self.wv.ngrams_word[w])

            self.num_ngram_vectors = len(all_ngrams)
            logger.info("Total number of ngrams is %d", len(all_ngrams))

            self.wv.hash2index = {}
            ngram_indices = []
            new_hash_count = 0
            for ngram in all_ngrams:
                # Reduce to a bucket id before the lookup so that ngrams
                # colliding in one bucket share one row instead of each
                # receiving its own copy of that bucket row.
                ngram_hash = ft_hash(ngram) % self.bucket
                if ngram_hash not in self.wv.hash2index:
                    ngram_indices.append(ngram_hash)
                    self.wv.hash2index[ngram_hash] = new_hash_count
                    new_hash_count += 1
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

            # Keep only the rows of the buckets that are actually in use.
            self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices,
                                                           axis=0)
            self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices,
                                                                 axis=0)
            self.reset_ngram_weights()
        else:
            new_ngrams = set()
            for w in self.wv.vocab:
                self.wv.ngrams_word[w] = compute_ngrams(
                    w, self.min_n, self.max_n)
                new_ngrams.update(
                    ng for ng in self.wv.ngrams_word[w]
                    if ng not in self.wv.ngrams
                )

            logger.info("Number of new ngrams is %d", len(new_ngrams))
            new_hash_count = 0
            for ngram in new_ngrams:
                # Same bucket reduction as in the non-update branch.
                ngram_hash = ft_hash(ngram) % self.bucket
                if ngram_hash not in self.wv.hash2index:
                    # New buckets are numbered after the pre-existing rows.
                    self.wv.hash2index[
                        ngram_hash] = new_hash_count + self.old_hash2index_len
                    new_hash_count += 1
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

            # NOTE(review): this seeds NumPy's global RNG, which also affects
            # any other user of np.random in the process.
            rand_obj = np.random
            rand_obj.seed(self.seed)
            num_new_words = len(self.wv.vocab) - self.old_vocab_len
            num_new_buckets = len(self.wv.hash2index) - self.old_hash2index_len

            # uniform() returns float64; cast to REAL so that vstack keeps
            # the dtype of the existing matrices.
            new_vocab_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (num_new_words, self.vector_size)).astype(REAL)
            new_vocab_lockf_rows = ones(
                (num_new_words, self.vector_size), dtype=REAL)
            new_ngram_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (num_new_buckets, self.vector_size)).astype(REAL)
            new_ngram_lockf_rows = ones(
                (num_new_buckets, self.vector_size), dtype=REAL)

            self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
            self.syn0_vocab_lockf = vstack(
                [self.syn0_vocab_lockf, new_vocab_lockf_rows])
            self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
            self.syn0_ngrams_lockf = vstack(
                [self.syn0_ngrams_lockf, new_ngram_lockf_rows])
 def testHash(self):
     """Check `fasttext.ft_hash` against fixed reference values.

     According to the original author's note, the reference values were
     obtained from the original C implementation.
     """
     reference_hashes = (
         ('test', 2949673445),
         ('word', 1788406269),
     )
     for text, expected in reference_hashes:
         self.assertEqual(fasttext.ft_hash(text), expected)
예제 #3
0
    def init_ngrams(self, update=False):
        """Compute the ngrams of every vocabulary word and set up the vocab/ngram matrices.

        Each ngram is hashed with `ft_hash` and reduced modulo `self.bucket`; ngrams that land in
        the same bucket share a single row of `self.wv.syn0_ngrams`. `self.wv.ngrams` maps an ngram
        to its row and `self.wv.hash2index` maps a bucket id to its row.

        Parameters
        ----------
        update : bool
            If False, all structures are rebuilt from scratch: the matrices are allocated with
            `empty`/`ones`, reduced to the rows of the buckets actually used, and
            `self.reset_ngram_weights()` is called.
            If True, rows for vocabulary words and ngram buckets that were not present before are
            drawn uniformly from ``(-1 / vector_size, 1 / vector_size)`` and appended to the existing
            matrices. This branch reads `self.old_vocab_len` and `self.old_hash2index_len`, which are
            not set here and must be provided by the caller.

        """
        if not update:
            self.wv.ngrams = {}
            self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL)
            self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL)

            self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL)
            self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL)

            # Collect the distinct ngrams directly in a set instead of building a list with duplicates.
            all_ngrams = set()
            for w in self.wv.vocab:
                self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
                all_ngrams.update(self.wv.ngrams_word[w])

            self.num_ngram_vectors = len(all_ngrams)
            logger.info("Total number of ngrams is %d", len(all_ngrams))

            self.wv.hash2index = {}
            ngram_indices = []
            new_hash_count = 0
            for ngram in all_ngrams:
                # Reduce to a bucket id before the lookup so that ngrams colliding in one bucket
                # share one row instead of each receiving its own copy of that bucket row.
                ngram_hash = ft_hash(ngram) % self.bucket
                if ngram_hash not in self.wv.hash2index:
                    ngram_indices.append(ngram_hash)
                    self.wv.hash2index[ngram_hash] = new_hash_count
                    new_hash_count += 1
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

            # Keep only the rows of the buckets that are actually in use.
            self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
            self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0)
            self.reset_ngram_weights()
        else:
            new_ngrams = set()
            for w in self.wv.vocab:
                self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
                new_ngrams.update(ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams)

            logger.info("Number of new ngrams is %d", len(new_ngrams))
            new_hash_count = 0
            for ngram in new_ngrams:
                # Same bucket reduction as in the non-update branch.
                ngram_hash = ft_hash(ngram) % self.bucket
                if ngram_hash not in self.wv.hash2index:
                    # New buckets are numbered after the pre-existing rows.
                    self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
                    new_hash_count += 1
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

            # NOTE(review): this seeds NumPy's global RNG, which also affects any other user of
            # np.random in the process.
            rand_obj = np.random
            rand_obj.seed(self.seed)
            num_new_words = len(self.wv.vocab) - self.old_vocab_len
            num_new_buckets = len(self.wv.hash2index) - self.old_hash2index_len
            low, high = -1.0 / self.vector_size, 1.0 / self.vector_size

            # uniform() returns float64; cast to REAL so that vstack keeps the dtype of the
            # existing matrices.
            new_vocab_rows = rand_obj.uniform(low, high, (num_new_words, self.vector_size)).astype(REAL)
            new_vocab_lockf_rows = ones((num_new_words, self.vector_size), dtype=REAL)
            new_ngram_rows = rand_obj.uniform(low, high, (num_new_buckets, self.vector_size)).astype(REAL)
            new_ngram_lockf_rows = ones((num_new_buckets, self.vector_size), dtype=REAL)

            self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
            self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows])
            self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
            self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows])
    def init_ngrams(self, update=False):
        """Build the ngram lookup tables and vector matrices for the current vocabulary.

        Every ngram is hashed with `ft_hash` modulo `self.bucket`; ngrams that
        fall into the same bucket share one row of the ngram matrix.

        Parameters
        ----------
        update : bool
            If False, the tables and matrices are created from scratch, only
            the rows of buckets in use are kept, and
            `self.reset_ngram_weights()` is called.
            If True, rows for newly added vocab words and newly used buckets
            are sampled from a uniform distribution and appended to the
            existing vocab word and ngram matrices.

        """
        wv = self.wv
        dim = self.vector_size

        if update:
            # Gather the ngrams of the vocabulary that are not registered yet.
            unseen = []
            for word in wv.vocab:
                wv.ngrams_word[word] = compute_ngrams(
                    word, self.min_n, self.max_n)
                unseen.extend(
                    ng for ng in wv.ngrams_word[word] if ng not in wv.ngrams)
            unseen = list(set(unseen))
            logger.info("Number of new ngrams is %d", len(unseen))

            # Buckets seen for the first time are numbered after the old rows.
            added = 0
            for ngram in unseen:
                bucket_id = ft_hash(ngram) % self.bucket
                if bucket_id not in wv.hash2index:
                    wv.hash2index[bucket_id] = added + self.old_hash2index_len
                    added += 1
                wv.ngrams[ngram] = wv.hash2index[bucket_id]

            rng = np.random
            rng.seed(self.seed)
            bound = 1.0 / dim
            vocab_shape = (len(wv.vocab) - self.old_vocab_len, dim)
            ngram_shape = (len(wv.hash2index) - self.old_hash2index_len, dim)

            # The two draws must stay in this order: both consume the same
            # seeded random stream.
            extra_vocab = rng.uniform(-bound, bound, vocab_shape).astype(REAL)
            extra_ngrams = rng.uniform(-bound, bound, ngram_shape).astype(REAL)

            wv.syn0_vocab = vstack([wv.syn0_vocab, extra_vocab])
            self.syn0_vocab_lockf = vstack(
                [self.syn0_vocab_lockf, ones(vocab_shape, dtype=REAL)])
            wv.syn0_ngrams = vstack([wv.syn0_ngrams, extra_ngrams])
            self.syn0_ngrams_lockf = vstack(
                [self.syn0_ngrams_lockf, ones(ngram_shape, dtype=REAL)])
            return

        # Full (re)initialisation.
        wv.ngrams = {}
        vocab_shape = (len(wv.vocab), dim)
        bucket_shape = (self.bucket, dim)
        wv.syn0_vocab = empty(vocab_shape, dtype=REAL)
        self.syn0_vocab_lockf = ones(vocab_shape, dtype=REAL)
        wv.syn0_ngrams = empty(bucket_shape, dtype=REAL)
        self.syn0_ngrams_lockf = ones(bucket_shape, dtype=REAL)

        collected = []
        for word in wv.vocab:
            wv.ngrams_word[word] = compute_ngrams(word, self.min_n, self.max_n)
            collected.extend(wv.ngrams_word[word])
        distinct = list(set(collected))
        self.num_ngram_vectors = len(distinct)
        logger.info("Total number of ngrams is %d", len(distinct))

        wv.hash2index = {}
        used_buckets = []
        for ngram in distinct:
            bucket_id = ft_hash(ngram) % self.bucket
            if bucket_id not in wv.hash2index:
                # The row number equals the count of buckets registered so far.
                wv.hash2index[bucket_id] = len(used_buckets)
                used_buckets.append(bucket_id)
            wv.ngrams[ngram] = wv.hash2index[bucket_id]

        # Shrink both matrices to the rows of the buckets actually in use.
        wv.syn0_ngrams = wv.syn0_ngrams.take(used_buckets, axis=0)
        self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(
            used_buckets, axis=0)
        self.reset_ngram_weights()
예제 #5
0
 def testHash(self):
     """Check `fasttext.ft_hash` against fixed reference values.

     According to the original author's note, the reference values were
     obtained from the original C implementation.
     """
     reference_hashes = (
         ('test', 2949673445),
         ('word', 1788406269),
     )
     for text, expected in reference_hashes:
         self.assertEqual(fasttext.ft_hash(text), expected)