def init_ngrams(self, update=False):
    if not update:
        self.wv.ngrams = {}
        self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL)
        self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL)

        self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL)
        self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL)

        all_ngrams = []
        for w, v in self.wv.vocab.items():
            self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
            all_ngrams += self.wv.ngrams_word[w]

        all_ngrams = list(set(all_ngrams))
        self.num_ngram_vectors = len(all_ngrams)
        logger.info("Total number of ngrams is %d", len(all_ngrams))

        self.wv.hash2index = {}
        ngram_indices = []
        new_hash_count = 0
        for i, ngram in enumerate(all_ngrams):
            ngram_hash = ft_hash(ngram)
            if ngram_hash in self.wv.hash2index:
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
            else:
                ngram_indices.append(ngram_hash % self.bucket)
                self.wv.hash2index[ngram_hash] = new_hash_count
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                new_hash_count = new_hash_count + 1

        self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
        self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0)
        self.reset_ngram_weights()
    else:
        new_ngrams = []
        for w, v in self.wv.vocab.items():
            self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
            new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams]

        new_ngrams = list(set(new_ngrams))
        logger.info("Number of new ngrams is %d", len(new_ngrams))

        new_hash_count = 0
        for i, ngram in enumerate(new_ngrams):
            ngram_hash = ft_hash(ngram)
            if ngram_hash not in self.wv.hash2index:
                self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                new_hash_count = new_hash_count + 1
            else:
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

        rand_obj = np.random
        rand_obj.seed(self.seed)
        new_vocab_rows = rand_obj.uniform(
            -1.0 / self.vector_size, 1.0 / self.vector_size,
            (len(self.wv.vocab) - self.old_vocab_len, self.vector_size))
        new_vocab_lockf_rows = ones(
            (len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL)
        new_ngram_rows = rand_obj.uniform(
            -1.0 / self.vector_size, 1.0 / self.vector_size,
            (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size))
        new_ngram_lockf_rows = ones(
            (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL)

        self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
        self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows])
        self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
        self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows])
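
# For reference, compute_ngrams above is assumed to behave roughly like the
# following sketch: fastText wraps each word in '<' and '>' boundary markers
# and emits every character ngram of length min_n..max_n. The helper name and
# the clipping of max_n against the extended word length are assumptions, not
# gensim's exact implementation.
def compute_ngrams_sketch(word, min_n, max_n):
    extended_word = '<' + word + '>'  # boundary markers distinguish affixes from word-internal ngrams
    ngrams = []
    for n in range(min_n, min(len(extended_word), max_n) + 1):
        for i in range(len(extended_word) - n + 1):
            ngrams.append(extended_word[i:i + n])
    return ngrams

# compute_ngrams_sketch('word', 3, 4)
# -> ['<wo', 'wor', 'ord', 'rd>', '<wor', 'word', 'ord>']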

def testHash(self):
    # Tests that FastText.ft_hash returns the same values as the original C implementation.
    ft_hash = fasttext.ft_hash('test')
    self.assertEqual(ft_hash, 2949673445)
    ft_hash = fasttext.ft_hash('word')
    self.assertEqual(ft_hash, 1788406269)
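
# The reference values in testHash come from fastText's C hashing, which is
# 32-bit FNV-1a over the bytes of the string. A minimal sketch of that scheme
# follows; the constants are the standard 32-bit FNV offset basis and prime.
# fastText's C code casts each byte through int8_t before widening (sign
# extension), which is irrelevant for the ASCII inputs above but reproduced
# here for completeness.
def ft_hash_sketch(string):
    h = 2166136261  # FNV-1a offset basis
    for byte in string.encode('utf-8'):
        if byte >= 128:
            byte -= 256  # mirror the C int8_t cast for non-ASCII bytes
        h = (h ^ (byte & 0xFFFFFFFF)) & 0xFFFFFFFF
        h = (h * 16777619) & 0xFFFFFFFF  # FNV prime, truncated to 32 bits
    return h

assert ft_hash_sketch('test') == 2949673445
assert ft_hash_sketch('word') == 1788406269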

def init_ngrams(self, update=False):
    """Compute ngrams of all words present in the vocabulary and store vectors only for those ngrams.

    Vectors for other ngrams are initialized with a random uniform distribution in FastText.

    Parameters
    ----------
    update : bool
        If True, vectors for the new vocab words and their new ngrams are initialized
        from a random uniform distribution and appended to the existing vocab and ngram vectors.

    """
    if not update:
        self.wv.ngrams = {}
        self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL)
        self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL)

        self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL)
        self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL)

        all_ngrams = []
        for w, v in self.wv.vocab.items():
            self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
            all_ngrams += self.wv.ngrams_word[w]

        all_ngrams = list(set(all_ngrams))
        self.num_ngram_vectors = len(all_ngrams)
        logger.info("Total number of ngrams is %d", len(all_ngrams))

        self.wv.hash2index = {}
        ngram_indices = []
        new_hash_count = 0
        for i, ngram in enumerate(all_ngrams):
            # Bucket the hash immediately, matching the C implementation.
            ngram_hash = ft_hash(ngram) % self.bucket
            if ngram_hash in self.wv.hash2index:
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
            else:
                ngram_indices.append(ngram_hash % self.bucket)
                self.wv.hash2index[ngram_hash] = new_hash_count
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                new_hash_count = new_hash_count + 1

        self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
        self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0)
        self.reset_ngram_weights()
    else:
        new_ngrams = []
        for w, v in self.wv.vocab.items():
            self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
            new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams]

        new_ngrams = list(set(new_ngrams))
        logger.info("Number of new ngrams is %d", len(new_ngrams))

        new_hash_count = 0
        for i, ngram in enumerate(new_ngrams):
            ngram_hash = ft_hash(ngram) % self.bucket
            if ngram_hash not in self.wv.hash2index:
                self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                new_hash_count = new_hash_count + 1
            else:
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

        rand_obj = np.random
        rand_obj.seed(self.seed)
        # Cast the freshly drawn rows to REAL so vstack doesn't upcast the
        # existing float32 matrices to float64.
        new_vocab_rows = rand_obj.uniform(
            -1.0 / self.vector_size, 1.0 / self.vector_size,
            (len(self.wv.vocab) - self.old_vocab_len, self.vector_size)
        ).astype(REAL)
        new_vocab_lockf_rows = ones(
            (len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL)
        new_ngram_rows = rand_obj.uniform(
            -1.0 / self.vector_size, 1.0 / self.vector_size,
            (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size)
        ).astype(REAL)
        new_ngram_lockf_rows = ones(
            (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL)

        self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
        self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows])
        self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
        self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows])
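
# Once init_ngrams has run, a vocabulary word's full fastText vector is the
# average of its own row in syn0_vocab and the rows of syn0_ngrams addressed
# through the ngrams map built above. A sketch of that composition follows;
# the helper name is hypothetical, while the attribute names mirror those
# used in init_ngrams.
def compose_word_vector(model, word):
    word_vec = np.copy(model.wv.syn0_vocab[model.wv.vocab[word].index])
    ngrams = model.wv.ngrams_word[word]
    for ngram in ngrams:
        word_vec += model.wv.syn0_ngrams[model.wv.ngrams[ngram]]
    return word_vec / (len(ngrams) + 1)  # the word itself counts as one component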