def build_ngrams(self, wv, update=False):
    if not update:
        # Fresh build: start from an empty ngram mapping.
        wv.ngrams_word = {}
        for w, v in iteritems(wv.vocab):
            wv.ngrams_word[w] = _compute_ngrams(w, wv.min_n, wv.max_n)
    else:
        # Incremental update: recompute ngrams over the (possibly grown) vocab
        # without discarding the existing mapping.
        for w, v in iteritems(wv.vocab):
            wv.ngrams_word[w] = _compute_ngrams(w, wv.min_n, wv.max_n)
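Every snippet on this page goes through gensim's private helper _compute_ngrams, which brackets the word with '<' and '>' before slicing out character ngrams. As a rough illustration of that behaviour (a minimal stand-in, not gensim's actual implementation):

def compute_ngrams_sketch(word, min_n, max_n):
    # Bracket the word so prefix and suffix ngrams are distinguishable:
    # "cat" with min_n=3, max_n=6 becomes "<cat>", giving
    # "<ca", "cat", "at>", "<cat", "cat>", "<cat>".
    extended = '<' + word + '>'
    ngrams = []
    for n in range(min_n, min(max_n, len(extended)) + 1):
        for i in range(len(extended) - n + 1):
            ngrams.append(extended[i:i + n])
    return ngrams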
Example #3
def estimate_memory(self, vocab_size=None, report=None):
    vocab_size = vocab_size or len(self.wv.vocab)
    vec_size = self.vector_size * np.dtype(np.float32).itemsize
    l1_size = self.layer1_size * np.dtype(np.float32).itemsize
    report = report or {}
    report['vocab'] = len(self.wv.vocab) * (700 if self.hs else 500)
    report['syn0_vocab'] = len(self.wv.vocab) * vec_size
    num_buckets = self.bucket
    if self.hs:
        report['syn1'] = len(self.wv.vocab) * l1_size
    if self.negative:
        report['syn1neg'] = len(self.wv.vocab) * l1_size
    if self.word_ngrams > 0 and self.wv.vocab:
        buckets = set()
        num_ngrams = 0
        for word in self.wv.vocab:
            ngrams = _compute_ngrams(word, self.min_n, self.max_n)
            num_ngrams += len(ngrams)
            buckets.update(_ft_hash(ng) % self.bucket for ng in ngrams)
        num_buckets = len(buckets)
        report['syn0_ngrams'] = len(buckets) * vec_size
        # A tuple (48 bytes) with num_ngrams_word ints (8 bytes) for each word
        # Only used during training, not stored with the model
        report['buckets_word'] = 48 * len(self.wv.vocab) + 8 * num_ngrams
    elif self.word_ngrams > 0:
        logger.warning(
            'subword information is enabled, but no vocabulary could be found; the estimated required memory '
            'might be inaccurate!'
        )
    report['total'] = sum(report.values())
    logger.info(
        "estimated required memory for %i words, %i buckets and %i dimensions: %i bytes",
        len(self.wv.vocab), num_buckets, self.vector_size, report['total']
    )
    return report
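A quick way to eyeball the resulting report (hypothetical usage; assumes model is a trained gensim FastText instance):

report = model.estimate_memory()
for name, nbytes in sorted(report.items()):
    print('%-12s %10.2f MB' % (name, nbytes / 1024.0 ** 2))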
Example #4
def get_vocab_word_vecs(self, wv):
    """Calculate vectors for words in the vocabulary and store them in `wv.vectors`."""
    for w, v in wv.vocab.items():
        word_vec = np.copy(wv.vectors_vocab[v.index])
        ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
        ngram_weights = wv.vectors_ngrams
        for ngram in ngrams:
            word_vec += ngram_weights[wv.hash2index[_ft_hash(ngram) % self.bucket]]
        # Mean of the word's own vector and all of its ngram vectors
        # (the `+ 1` accounts for the word vector itself).
        word_vec /= (len(ngrams) + 1)
        wv.vectors[v.index] = word_vec
Example #5
    def init_ngrams_post_load(self, file_name, wv):
        """Compute ngrams of all words present in vocabulary, and store vectors for only those ngrams.

        Vectors for other ngrams are initialized with a random uniform distribution in FastText. These
        vectors are discarded here to save space.

        """
        wv.vectors = np.zeros((len(wv.vocab), wv.vector_size), dtype=REAL)

        for w, vocab in wv.vocab.items():
            wv.vectors[vocab.index] += np.array(wv.vectors_ngrams[vocab.index])

        ngram_indices = []
        wv.num_ngram_vectors = 0
        for word in wv.vocab.keys():
            for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
                ngram_hash = _ft_hash(ngram) % self.bucket
                if ngram_hash in wv.hash2index:
                    continue
                wv.hash2index[ngram_hash] = len(ngram_indices)
                ngram_indices.append(len(wv.vocab) + ngram_hash)
        wv.num_ngram_vectors = len(ngram_indices)
        wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)

        ngram_weights = wv.vectors_ngrams

        logger.info(
            "loading weights for %s words for fastText model from %s",
            len(wv.vocab), file_name
        )

        for w, vocab in wv.vocab.items():
            word_ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
            for word_ngram in word_ngrams:
                vec_idx = wv.hash2index[_ft_hash(word_ngram) % self.bucket]
                wv.vectors[vocab.index] += np.array(ngram_weights[vec_idx])

            wv.vectors[vocab.index] /= (len(word_ngrams) + 1)
        logger.info(
            "loaded %s weight matrix for fastText model from %s",
            wv.vectors.shape, file_name
        )
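The take call above compacts the full bucket table down to just the rows the vocabulary's ngrams actually hit, with hash2index mapping each bucket hash to its compact row position. Schematically (illustrative numbers only, ignoring the len(wv.vocab) offset that skips the vocab rows stored at the top of the native fastText matrix):

# Suppose the vocab's ngrams only hit buckets 7, 42 and 99.
full = np.random.rand(100, 4)       # stand-in for the raw bucket table
hash2index = {7: 0, 42: 1, 99: 2}   # bucket hash -> row in the compact table
compact = full.take(sorted(hash2index), axis=0)
assert compact.shape == (3, 4)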
Example #6
def word2ngram(sentences, n):
    """
    Represent each sentence as a flat list of character ngrams.

    """
    for sentence in sentences:
        ngram = []
        for word in sentence:
            l1 = _compute_ngrams(word, n, n)
            ngram.extend(l1)
        yield ngram
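For example, feeding tokenised sentences through the generator with n=3 (illustrative input):

sentences = [['hello', 'world']]
for grams in word2ngram(sentences, 3):
    print(grams)
    # ['<he', 'hel', 'ell', 'llo', 'lo>', '<wo', 'wor', 'orl', 'rld', 'ld>']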
Example #7
    def init_ngrams_post_load(self, file_name, wv):
        """
        Compute ngrams of all words present in the vocabulary, and store vectors only for those ngrams.
        Vectors for other ngrams are initialized with a random uniform distribution in FastText. These
        vectors are discarded here to save space.

        """
        all_ngrams = []
        wv.vectors = np.zeros((len(wv.vocab), wv.vector_size), dtype=REAL)

        for w, vocab in wv.vocab.items():
            all_ngrams += _compute_ngrams(w, wv.min_n, wv.max_n)
            wv.vectors[vocab.index] += np.array(wv.vectors_ngrams[vocab.index])

        all_ngrams = set(all_ngrams)
        wv.num_ngram_vectors = len(all_ngrams)
        ngram_indices = []
        for i, ngram in enumerate(all_ngrams):
            ngram_hash = _ft_hash(ngram)
            ngram_indices.append(len(wv.vocab) + ngram_hash % self.bucket)
            wv.ngrams[ngram] = i
        wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)

        ngram_weights = wv.vectors_ngrams

        logger.info(
            "loading weights for %s words for fastText model from %s",
            len(wv.vocab), file_name
        )

        for w, vocab in wv.vocab.items():
            word_ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
            for word_ngram in word_ngrams:
                wv.vectors[vocab.index] += np.array(ngram_weights[wv.ngrams[word_ngram]])

            wv.vectors[vocab.index] /= (len(word_ngrams) + 1)
        logger.info(
            "loaded %s weight matrix for fastText model from %s",
            wv.vectors.shape, file_name
        )
Example #8
def get_vector_ngram(word):
    word_vec = np.zeros(model.wv.vectors_ngrams.shape[1], dtype=np.float32)
    ngrams = _compute_ngrams(word, model.wv.min_n, model.wv.max_n)
    ngrams_found = 0
    for ngram in ngrams:
        ngram_hash = _ft_hash(ngram) % model.wv.bucket
        if ngram_hash in model.wv.hash2index:
            word_vec += model.wv.vectors_ngrams_norm[model.wv.hash2index[ngram_hash]]
            ngrams_found += 1
    if word_vec.any():
        return word_vec / max(1, ngrams_found)
    # Falls through to an implicit None when no known ngrams were found.
Example #9
    def init_ngrams_post_load(self, file_name, wv):
        """
        Compute ngrams of all words present in the vocabulary, and store vectors only for those ngrams.
        Vectors for other ngrams are initialized with a random uniform distribution in FastText. These
        vectors are discarded here to save space.

        """
        all_ngrams = []
        wv.vectors = np.zeros((len(wv.vocab), wv.vector_size), dtype=REAL)

        for w, vocab in wv.vocab.items():
            all_ngrams += _compute_ngrams(w, wv.min_n, wv.max_n)
            wv.vectors[vocab.index] += np.array(wv.vectors_ngrams[vocab.index])

        all_ngrams = set(all_ngrams)
        wv.num_ngram_vectors = len(all_ngrams)
        ngram_indices = []
        for i, ngram in enumerate(all_ngrams):
            ngram_hash = _ft_hash(ngram)
            ngram_indices.append(len(wv.vocab) + ngram_hash % self.bucket)
            wv.ngrams[ngram] = i
        wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)

        ngram_weights = wv.vectors_ngrams

        logger.info("loading weights for %s words for fastText model from %s",
                    len(wv.vocab), file_name)

        for w, vocab in wv.vocab.items():
            word_ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
            for word_ngram in word_ngrams:
                wv.vectors[vocab.index] += np.array(
                    ngram_weights[wv.ngrams[word_ngram]])

            wv.vectors[vocab.index] /= (len(word_ngrams) + 1)
        logger.info("loaded %s weight matrix for fastText model from %s",
                    wv.vectors.shape, file_name)
Example #10
def ftext_extract_ngrams(model, compressed=True):
    """
    Create a file containing all ngrams and their vectors.
    """
    ngram_freq = defaultdict(int)
    for w, v in model.wv.vocab.items():
        for ng in _compute_ngrams(w, model.wv.min_n, model.wv.max_n):
            ngram_freq[ng] += v.count
    if compressed:
        np.savez_compressed(
            file='model_ngrams.npz',
            vectors=model.wv.vectors_ngrams,
            ngrams=np.array(list(ngram_freq.keys()))
        )
    else:
        # For each ngram vector row, keep the most frequent ngram hashing to it.
        ng_indexes = ['' for _ in range(len(model.wv.vectors_ngrams))]
        for ng, v in ngram_freq.items():
            ng_index = model.wv.hash2index.get(_ft_hash(ng) % model.wv.bucket)
            if ng_index is not None and \
                    (ng_indexes[ng_index] == '' or ngram_freq[ng_indexes[ng_index]] < ngram_freq[ng]):
                ng_indexes[ng_index] = ng
        d = pd.DataFrame(model.wv.vectors_ngrams, index=ng_indexes)
        d.to_csv('model_ngrams.csv.gz', compression='gzip', index_label="ngram")
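Reading the compressed dump back is symmetric (a sketch, assuming the 'model_ngrams.npz' file written above):

data = np.load('model_ngrams.npz')
vectors, ngrams = data['vectors'], data['ngrams']
print(len(ngrams), 'ngrams,', 'vector matrix shape:', vectors.shape)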
Example #11
    def init_ngrams_weights(self, wv, update=False, vocabulary=None):
        """Compute ngrams of all words present in vocabulary and stores vectors for only those ngrams.
        Vectors for other ngrams are initialized with a random uniform distribution in FastText.

        Parameters
        ----------
        update : bool
            If True, the new vocab words and their new ngrams word vectors are initialized
            with random uniform distribution and updated/added to the existing vocab word and ngram vectors.

        """
        if not update:
            wv.vectors_vocab = empty((len(wv.vocab), wv.vector_size), dtype=REAL)
            self.vectors_vocab_lockf = ones((len(wv.vocab), wv.vector_size), dtype=REAL)

            wv.vectors_ngrams = empty((self.bucket, wv.vector_size), dtype=REAL)
            self.vectors_ngrams_lockf = ones((self.bucket, wv.vector_size), dtype=REAL)

            wv.hash2index = {}
            wv.buckets_word = {}
            ngram_indices = []
            for word, vocab in wv.vocab.items():
                buckets = []
                for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
                    ngram_hash = _ft_hash(ngram) % self.bucket
                    if ngram_hash not in wv.hash2index:
                        wv.hash2index[ngram_hash] = len(ngram_indices)
                        ngram_indices.append(ngram_hash)
                    buckets.append(wv.hash2index[ngram_hash])
                wv.buckets_word[vocab.index] = tuple(buckets)
            wv.num_ngram_vectors = len(ngram_indices)

            logger.info("Total number of ngrams is %d", wv.num_ngram_vectors)

            wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)
            self.vectors_ngrams_lockf = self.vectors_ngrams_lockf.take(ngram_indices, axis=0)
            self.reset_ngrams_weights(wv)
        else:
            wv.buckets_word = {}
            num_new_ngrams = 0
            for word, vocab in wv.vocab.items():
                buckets = []
                for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
                    ngram_hash = _ft_hash(ngram) % self.bucket
                    if ngram_hash not in wv.hash2index:
                        wv.hash2index[ngram_hash] = num_new_ngrams + self.old_hash2index_len
                        num_new_ngrams += 1
                    buckets.append(wv.hash2index[ngram_hash])
                wv.buckets_word[vocab.index] = tuple(buckets)

            wv.num_ngram_vectors += num_new_ngrams
            logger.info("Number of new ngrams is %d", num_new_ngrams)

            rand_obj = np.random
            rand_obj.seed(self.seed)
            new_vocab_rows = rand_obj.uniform(
                -1.0 / wv.vector_size, 1.0 / wv.vector_size,
                (len(wv.vocab) - vocabulary.old_vocab_len, wv.vector_size)
            ).astype(REAL)
            new_vocab_lockf_rows = ones(
                (len(wv.vocab) - vocabulary.old_vocab_len, wv.vector_size), dtype=REAL)
            new_ngram_rows = rand_obj.uniform(
                -1.0 / wv.vector_size, 1.0 / wv.vector_size,
                (len(wv.hash2index) - self.old_hash2index_len, wv.vector_size)
            ).astype(REAL)
            new_ngram_lockf_rows = ones(
                (len(wv.hash2index) - self.old_hash2index_len, wv.vector_size), dtype=REAL)

            wv.vectors_vocab = vstack([wv.vectors_vocab, new_vocab_rows])
            self.vectors_vocab_lockf = vstack([self.vectors_vocab_lockf, new_vocab_lockf_rows])
            wv.vectors_ngrams = vstack([wv.vectors_ngrams, new_ngram_rows])
            self.vectors_ngrams_lockf = vstack([self.vectors_ngrams_lockf, new_ngram_lockf_rows])
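Because rows are assigned with the hashing trick, distinct ngrams can collide into the same bucket, which is why hash2index can stay smaller than bucket. A rough, illustrative way to gauge the collision rate (assumes an iterable vocab of words; 2000000 is fastText's default bucket count):

seen, collisions = {}, 0
for word in vocab:
    for ng in _compute_ngrams(word, 3, 6):
        h = _ft_hash(ng) % 2000000
        # Count an occurrence whenever a bucket is already claimed by a
        # different ngram; repeats of the same ngram are not collisions.
        if seen.setdefault(h, ng) != ng:
            collisions += 1
print('colliding ngram occurrences:', collisions)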
Example #12
def loadModelandPCA(modelName=modelSelect.value):
    """
        Load the model chosen in the selection box and apply PCA to the first 'number_of_elements' word vectors
    """
    global model
    global vectors
    global words
    global ngrams
    global vectors_ngrams
    global words_ngrams
    global source
    global sourceTSNE
    global sourceNetwork

    LoadingDiv.css_classes = ["loading"]
    model = FastText.load(modelName)
    # model = FastText.load_fasttext_format('PRe_git/model/'+modelName+'.bin', encoding='ISO-8859-15')
    ## model = fasttext.load_model('PRe/model/model.bin')
    print("Data loaded ...")
    # vectors = [list(line) for line in data.values()]
    # words = list(data)
    vectors = model.wv.syn0
    words = model.wv.index2word
    ngrams = []
    vectors_ngrams = []
    words_ngrams = []
    for word in words:
        ngrams += _compute_ngrams(word, model.min_n, model.max_n)
    ngrams = set(ngrams)
    print('Ngrams done ...')
    # Keep only ngrams whose hashed bucket exists in the trained model.
    i = 0
    for ngram in ngrams:
        ngram_hash = _ft_hash(ngram) % model.bucket
        if ngram_hash in model.wv.hash2index:
            i += 1
            words_ngrams.append(ngram)
            vectors_ngrams.append(model.wv.vectors_ngrams[model.wv.hash2index[ngram_hash]])
    gr = _compute_ngrams(words[466], model.min_n, model.max_n)
    for g in gr:
        print(words[466], g, model.wv.similarity(words[466],g))
    
    NumberElementsDiv.text = "Nombre total de mots : "+str(len(words))
    d3.text = "<h2>Visualisation globale des repr\u00E9sentations</h2><br><h3>Vecteurs de dimension "+str(len(vectors[0]))+" projet\u00E9s dans le plan selon :</h3>"

    # PCA
    pcaProcess(modelName)

    sourceTSNE.data['x'] = [0 for i in range(number_of_elements)]
    sourceTSNE.data['y'] = [0 for i in range(number_of_elements)]
    sourceTSNE.data['mots'] = words[0:number_of_elements]
    sourceTSNE.data['color'] = ['#053061' for i in range(number_of_elements)]
    sourceTSNE.selected.indices = []

    sourceNetwork.data['label'] = []
    sourceNetwork.data['edges'] = []
    sourceNetwork.data['values'] = []
    sourceNetwork.data['index'] = []
    sourceNetwork.data['color'] = []

    print("Source done ...")
    source.trigger('data', None, source)
    sourceTSNE.trigger('data', None, sourceTSNE)
    sourceNetwork.trigger('data', None, sourceNetwork)

    LoadingDiv.css_classes = []