def build_ngrams(self, wv, update=False):
    if not update:
        # Fresh build: start from an empty ngram map.
        wv.ngrams_word = {}
        for w, v in iteritems(wv.vocab):
            wv.ngrams_word[w] = _compute_ngrams(w, wv.min_n, wv.max_n)
    else:
        # Vocabulary update: recompute ngrams for every word currently in the vocab,
        # keeping any existing entries in wv.ngrams_word.
        for w, v in iteritems(wv.vocab):
            wv.ngrams_word[w] = _compute_ngrams(w, wv.min_n, wv.max_n)
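# For reference, a minimal standalone sketch of what the `_compute_ngrams` helper used throughout
# these snippets is expected to return: the word is wrapped in the FastText boundary markers '<'
# and '>', and all character ngrams with lengths between min_n and max_n are collected. This is an
# illustrative re-implementation, not the gensim-internal function itself.
def compute_ngrams_sketch(word, min_n, max_n):
    extended_word = '<' + word + '>'
    ngrams = []
    for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
        for i in range(0, len(extended_word) - ngram_length + 1):
            ngrams.append(extended_word[i:i + ngram_length])
    return ngrams

# compute_ngrams_sketch('cat', 3, 4) -> ['<ca', 'cat', 'at>', '<cat', 'cat>']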
def estimate_memory(self, vocab_size=None, report=None):
    vocab_size = vocab_size or len(self.wv.vocab)
    vec_size = self.vector_size * np.dtype(np.float32).itemsize
    l1_size = self.layer1_size * np.dtype(np.float32).itemsize
    report = report or {}
    report['vocab'] = len(self.wv.vocab) * (700 if self.hs else 500)
    report['syn0_vocab'] = len(self.wv.vocab) * vec_size
    num_buckets = self.bucket
    if self.hs:
        report['syn1'] = len(self.wv.vocab) * l1_size
    if self.negative:
        report['syn1neg'] = len(self.wv.vocab) * l1_size
    if self.word_ngrams > 0 and self.wv.vocab:
        buckets = set()
        num_ngrams = 0
        for word in self.wv.vocab:
            ngrams = _compute_ngrams(word, self.min_n, self.max_n)
            num_ngrams += len(ngrams)
            buckets.update(_ft_hash(ng) % self.bucket for ng in ngrams)
        num_buckets = len(buckets)
        report['syn0_ngrams'] = len(buckets) * vec_size
        # A tuple (48 bytes) with num_ngrams_word ints (8 bytes) for each word
        # Only used during training, not stored with the model
        report['buckets_word'] = 48 * len(self.wv.vocab) + 8 * num_ngrams
    elif self.word_ngrams > 0:
        logger.warning(
            'subword information is enabled, but no vocabulary could be found, '
            'estimated required memory might be inaccurate!'
        )
    report['total'] = sum(report.values())
    logger.info(
        "estimated required memory for %i words, %i buckets and %i dimensions: %i bytes",
        len(self.wv.vocab), num_buckets, self.vector_size, report['total']
    )
    return report
def get_vocab_word_vecs(self, wv):
    """Calculate vectors for words in the vocabulary and store them in `wv.vectors`."""
    for w, v in wv.vocab.items():
        word_vec = np.copy(wv.vectors_vocab[v.index])
        ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
        ngram_weights = wv.vectors_ngrams
        for ngram in ngrams:
            word_vec += ngram_weights[wv.hash2index[_ft_hash(ngram) % self.bucket]]
        word_vec /= (len(ngrams) + 1)
        wv.vectors[v.index] = word_vec
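# A minimal numeric sketch of the averaging done in get_vocab_word_vecs above: the final word
# vector is the mean of the word's own vocab vector and all of its ngram vectors, hence the
# division by len(ngrams) + 1. The arrays below are made-up toy data, not model weights.
import numpy as np

vocab_vec = np.array([1.0, 0.0], dtype=np.float32)                  # hypothetical vectors_vocab row
ngram_vecs = np.array([[0.5, 0.5], [0.0, 1.0]], dtype=np.float32)   # hypothetical ngram rows
word_vec = (vocab_vec + ngram_vecs.sum(axis=0)) / (len(ngram_vecs) + 1)
# word_vec -> array([0.5, 0.5], dtype=float32)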
def init_ngrams_post_load(self, file_name, wv):
    """Compute ngrams of all words present in vocabulary, and store vectors for only those ngrams.

    Vectors for other ngrams are initialized with a random uniform distribution in FastText.
    These vectors are discarded here to save space.

    """
    wv.vectors = np.zeros((len(wv.vocab), wv.vector_size), dtype=REAL)

    for w, vocab in wv.vocab.items():
        wv.vectors[vocab.index] += np.array(wv.vectors_ngrams[vocab.index])

    ngram_indices = []
    wv.num_ngram_vectors = 0
    for word in wv.vocab.keys():
        for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
            ngram_hash = _ft_hash(ngram) % self.bucket
            if ngram_hash in wv.hash2index:
                continue
            wv.hash2index[ngram_hash] = len(ngram_indices)
            ngram_indices.append(len(wv.vocab) + ngram_hash)
    wv.num_ngram_vectors = len(ngram_indices)

    wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)
    ngram_weights = wv.vectors_ngrams

    logger.info(
        "loading weights for %s words for fastText model from %s",
        len(wv.vocab), file_name
    )

    for w, vocab in wv.vocab.items():
        word_ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
        for word_ngram in word_ngrams:
            vec_idx = wv.hash2index[_ft_hash(word_ngram) % self.bucket]
            wv.vectors[vocab.index] += np.array(ngram_weights[vec_idx])
        wv.vectors[vocab.index] /= (len(word_ngrams) + 1)

    logger.info(
        "loaded %s weight matrix for fastText model from %s",
        wv.vectors.shape, file_name
    )
def word2ngram(sentences, n):
    """Represent each sentence as a flat list of length-n character ngrams."""
    for sentence in sentences:
        ngram = []
        for word in sentence:
            l1 = _compute_ngrams(word, n, n)
            ngram.extend(l1)
        yield ngram
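# A small usage sketch for word2ngram (the corpus below is made-up): each yielded item is the
# original sentence rewritten as a flat list of its words' length-n character ngrams. It assumes
# _compute_ngrams is importable, as in the snippets above.
toy_sentences = [['hello', 'world'], ['fasttext']]
for ngram_sentence in word2ngram(toy_sentences, 3):
    print(ngram_sentence)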
def init_ngrams_post_load(self, file_name, wv):
    """
    Computes ngrams of all words present in vocabulary and stores vectors for only those ngrams.
    Vectors for other ngrams are initialized with a random uniform distribution in FastText.
    These vectors are discarded here to save space.

    """
    all_ngrams = []
    wv.vectors = np.zeros((len(wv.vocab), wv.vector_size), dtype=REAL)

    for w, vocab in wv.vocab.items():
        all_ngrams += _compute_ngrams(w, wv.min_n, wv.max_n)
        wv.vectors[vocab.index] += np.array(wv.vectors_ngrams[vocab.index])

    all_ngrams = set(all_ngrams)
    wv.num_ngram_vectors = len(all_ngrams)
    ngram_indices = []
    for i, ngram in enumerate(all_ngrams):
        ngram_hash = _ft_hash(ngram)
        ngram_indices.append(len(wv.vocab) + ngram_hash % self.bucket)
        wv.ngrams[ngram] = i
    wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)

    ngram_weights = wv.vectors_ngrams

    logger.info(
        "loading weights for %s words for fastText model from %s",
        len(wv.vocab), file_name
    )

    for w, vocab in wv.vocab.items():
        word_ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
        for word_ngram in word_ngrams:
            wv.vectors[vocab.index] += np.array(ngram_weights[wv.ngrams[word_ngram]])
        wv.vectors[vocab.index] /= (len(word_ngrams) + 1)

    logger.info(
        "loaded %s weight matrix for fastText model from %s",
        wv.vectors.shape, file_name
    )
def get_vector_ngram(word):
    """Build a vector for an out-of-vocabulary word from the ngram vectors it shares with the model.

    Returns None if none of the word's ngrams were seen during training.
    """
    word_vec = np.zeros(model.wv.vectors_ngrams.shape[1], dtype=np.float32)
    ngrams = _compute_ngrams(word, model.wv.min_n, model.wv.max_n)
    ngrams_found = 0
    for ngram in ngrams:
        ngram_hash = _ft_hash(ngram) % model.wv.bucket
        if ngram_hash in model.wv.hash2index:
            word_vec += model.wv.vectors_ngrams_norm[model.wv.hash2index[ngram_hash]]
            ngrams_found += 1
    if word_vec.any():
        return word_vec / max(1, ngrams_found)
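# A hedged usage sketch for get_vector_ngram: it assumes `model` is a trained gensim FastText
# model bound to the module-level name the function reads, and that the normalized ngram matrix
# vectors_ngrams_norm has been populated (init_sims() does this in the gensim versions these
# snippets target). The model path below is hypothetical.
from gensim.models import FastText

model = FastText.load('my_fasttext.model')  # hypothetical model file
model.wv.init_sims()  # assumption: also fills model.wv.vectors_ngrams_norm
oov_vec = get_vector_ngram('unseenword')
if oov_vec is None:
    print("none of the word's ngrams were seen during training")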
def ftext_extract_ngrams(model, compressed=True):
    """Create a file containing all ngrams and their vectors."""
    ngram_freq = defaultdict(int)
    for w, v in model.wv.vocab.items():
        for ng in _compute_ngrams(w, model.wv.min_n, model.wv.max_n):
            ngram_freq[ng] += v.count

    if compressed:
        np.savez_compressed(
            file='model_ngrams.npz',
            vectors=model.wv.vectors_ngrams,
            ngrams=np.array(list(ngram_freq.keys()))
        )
    else:
        # For each ngram bucket, keep the most frequent ngram that hashes to it as its label.
        ng_indexes = ['' for _ in range(len(model.wv.vectors_ngrams))]
        for ng, v in ngram_freq.items():
            ng_hash = model.wv.hash2index.get(_ft_hash(ng) % model.wv.bucket)
            if ng_hash is not None and \
                    (ng_indexes[ng_hash] == '' or ngram_freq[ng_indexes[ng_hash]] < ngram_freq[ng]):
                ng_indexes[ng_hash] = ng
        d = pd.DataFrame(model.wv.vectors_ngrams, index=ng_indexes)
        d.to_csv('model_ngrams.csv.gz', compression='gzip', index_label="ngram")
def init_ngrams_weights(self, wv, update=False, vocabulary=None):
    """Compute ngrams of all words present in vocabulary and store vectors for only those ngrams.

    Vectors for other ngrams are initialized with a random uniform distribution in FastText.

    Parameters
    ----------
    update : bool
        If True, the new vocab words and their new ngram word vectors are initialized
        with a random uniform distribution and added to the existing vocab word and ngram vectors.

    """
    if not update:
        wv.vectors_vocab = empty((len(wv.vocab), wv.vector_size), dtype=REAL)
        self.vectors_vocab_lockf = ones((len(wv.vocab), wv.vector_size), dtype=REAL)

        wv.vectors_ngrams = empty((self.bucket, wv.vector_size), dtype=REAL)
        self.vectors_ngrams_lockf = ones((self.bucket, wv.vector_size), dtype=REAL)

        wv.hash2index = {}
        wv.buckets_word = {}
        ngram_indices = []
        for word, vocab in wv.vocab.items():
            buckets = []
            for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
                ngram_hash = _ft_hash(ngram) % self.bucket
                if ngram_hash not in wv.hash2index:
                    wv.hash2index[ngram_hash] = len(ngram_indices)
                    ngram_indices.append(ngram_hash)
                buckets.append(wv.hash2index[ngram_hash])
            wv.buckets_word[vocab.index] = tuple(buckets)

        wv.num_ngram_vectors = len(ngram_indices)
        logger.info("Total number of ngrams is %d", wv.num_ngram_vectors)

        wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)
        self.vectors_ngrams_lockf = self.vectors_ngrams_lockf.take(ngram_indices, axis=0)
        self.reset_ngrams_weights(wv)
    else:
        wv.buckets_word = {}
        num_new_ngrams = 0
        for word, vocab in wv.vocab.items():
            buckets = []
            for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
                ngram_hash = _ft_hash(ngram) % self.bucket
                if ngram_hash not in wv.hash2index:
                    wv.hash2index[ngram_hash] = num_new_ngrams + self.old_hash2index_len
                    num_new_ngrams += 1
                buckets.append(wv.hash2index[ngram_hash])
            wv.buckets_word[vocab.index] = tuple(buckets)

        wv.num_ngram_vectors += num_new_ngrams
        logger.info("Number of new ngrams is %d", num_new_ngrams)

        rand_obj = np.random
        rand_obj.seed(self.seed)

        new_vocab_rows = rand_obj.uniform(
            -1.0 / wv.vector_size, 1.0 / wv.vector_size,
            (len(wv.vocab) - vocabulary.old_vocab_len, wv.vector_size)
        ).astype(REAL)
        new_vocab_lockf_rows = ones(
            (len(wv.vocab) - vocabulary.old_vocab_len, wv.vector_size),
            dtype=REAL
        )
        new_ngram_rows = rand_obj.uniform(
            -1.0 / wv.vector_size, 1.0 / wv.vector_size,
            (len(wv.hash2index) - self.old_hash2index_len, wv.vector_size)
        ).astype(REAL)
        new_ngram_lockf_rows = ones(
            (len(wv.hash2index) - self.old_hash2index_len, wv.vector_size),
            dtype=REAL
        )

        wv.vectors_vocab = vstack([wv.vectors_vocab, new_vocab_rows])
        self.vectors_vocab_lockf = vstack([self.vectors_vocab_lockf, new_vocab_lockf_rows])
        wv.vectors_ngrams = vstack([wv.vectors_ngrams, new_ngram_rows])
        self.vectors_ngrams_lockf = vstack([self.vectors_ngrams_lockf, new_ngram_lockf_rows])
def loadModelandPCA(modelName=modelSelect.value):
    """
    Load the selected model from the selection box and apply PCA
    on the first 'number_of_elements' word vectors.
    """
    global model
    global vectors
    global words
    global ngrams
    global vectors_ngrams
    global words_ngrams
    global source
    global sourceTSNE
    global sourceNetwork

    LoadingDiv.css_classes = ["loading"]

    model = FastText.load(modelName)
    # model = FastText.load_fasttext_format('PRe_git/model/'+modelName+'.bin', encoding='ISO-8859-15')
    # model = fasttext.load_model('PRe/model/model.bin')
    print("Data loaded ...")

    # vectors = [list(line) for line in data.values()]
    # words = list(data)
    vectors = model.wv.syn0
    words = model.wv.index2word

    # Collect every ngram of every vocabulary word, then keep only those whose bucket
    # actually has a trained vector in the model.
    ngrams = []
    vectors_ngrams = []
    words_ngrams = []
    for word in words:
        ngrams += _compute_ngrams(word, model.min_n, model.max_n)
    ngrams = set(ngrams)
    print('Ngrams done ...')

    i = 0
    for ngram in ngrams:
        ngram_hash = _ft_hash(ngram) % model.bucket
        if ngram_hash in model.wv.hash2index:
            i += 1
            words_ngrams.append(ngram)
            vectors_ngrams.append(model.wv.vectors_ngrams[model.wv.hash2index[ngram_hash]])

    # Debug output: similarity between a hard-coded word and each of its ngrams.
    gr = _compute_ngrams(words[466], model.min_n, model.max_n)
    for g in gr:
        print(words[466], g, model.wv.similarity(words[466], g))

    NumberElementsDiv.text = "Nombre total de mots : " + str(len(words))
    d3.text = (
        "<h2>Visualisation globale des repr\u00E9sentations</h2><br>"
        "<h3>Vecteurs de dimension " + str(len(vectors[0])) + " projet\u00E9s dans le plan selon :</h3>"
    )

    # PCA
    pcaProcess(modelName)

    sourceTSNE.data['x'] = [0 for i in range(0, number_of_elements)]
    sourceTSNE.data['y'] = [0 for i in range(0, number_of_elements)]
    sourceTSNE.data['mots'] = words[0:number_of_elements]
    sourceTSNE.data['color'] = ['#053061' for i in range(0, number_of_elements)]
    sourceTSNE.selected.indices = []

    sourceNetwork.data['label'] = []
    sourceNetwork.data['edges'] = []
    sourceNetwork.data['values'] = []
    sourceNetwork.data['index'] = []
    sourceNetwork.data['color'] = []
    print("Source done ...")

    source.trigger('data', None, source)
    sourceTSNE.trigger('data', None, sourceTSNE)
    sourceNetwork.trigger('data', None, sourceNetwork)

    LoadingDiv.css_classes = []