Example #1
def prepare(self):
    self.word2idx = defaultdict(int)
    # seed the special tokens with min_word_freq so that the start symbol,
    # end symbol, pad and unk always survive the frequency cutoff below
    self.word2idx[self.START_SYMBOL] = self.word2idx[self.END_SYMBOL] = \
        self.word2idx[self.UNK] = self.word2idx[self.PAD] = self.min_word_freq
    # count token frequencies over the COCO train and val captions
    for dataset_type in ["train", "val"]:
        caps = dset.CocoCaptions(
            root=FilePathManager.resolve(f'data/{dataset_type}'),
            annFile=FilePathManager.resolve(
                f"data/annotations/captions_{dataset_type}2017.json"),
            transform=transforms.ToTensor())
        for _, captions in caps:
            for capt in captions:
                for token in self.tokenize(capt):
                    self.word2idx[token] += 1
    # keep only words at or above the frequency threshold and fetch their
    # fastText vectors, falling back to the UNK vector for unknown words
    temp = {}
    embeddings = {}
    fast_text = FastText.load(
        FilePathManager.resolve("data/fasttext.model"), mmap="r")
    for word, count in self.word2idx.items():
        if count >= self.min_word_freq:
            temp[word] = len(temp)
            embeddings[word] = (fast_text[word]
                                if word in fast_text else fast_text[self.UNK])
    self.word2idx = temp
    # swap keys and values to build the reverse index
    self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys()))
    self.fast_text = embeddings
Example #2
from gensim.models.wrappers.fasttext import FastText as FT_wrapper


def reduce_fasttext_embedding(fasttext_path, words):
    """Cache the fastText vector for the lower-cased form of each word."""
    model = FT_wrapper.load(fasttext_path)
    print(model)
    word_to_embedding = {}
    coverage = 0
    for word in words:
        key = word.lower()
        # check the same lower-cased key that is used for storage and lookup
        if key in model:
            coverage += 1
            word_to_embedding[key] = model[key]
        else:
            word_to_embedding[key] = None
    print('fastText cache: {}/{} words'.format(coverage, len(words)))
    return word_to_embedding
Example #3
# **Note:** As in the case of Word2Vec, you can continue to train your model while using Gensim's native implementation of fastText. However, continuation of training with fastText models while using the wrapper is not supported.
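#
# As a hedged illustration (not part of the original notebook), continued training with
# Gensim's native implementation might look like this; `new_sentences` is an assumed
# list of tokenized sentences and the epoch count is arbitrary.

# In[*]

# update the existing vocabulary with the new corpus, then keep training on it
model_gensim.build_vocab(new_sentences, update=True)
model_gensim.train(new_sentences, total_examples=len(new_sentences), epochs=5)
print(model_gensim)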

# ## Saving/loading models

# Models can be saved and loaded via the `load` and `save` methods.

# In[*]

# saving a model trained via Gensim's fastText implementation
model_gensim.save('saved_model_gensim')
loaded_model = FT_gensim.load('saved_model_gensim')
print(loaded_model)

# saving a model trained via fastText wrapper
model_wrapper.save('saved_model_wrapper')
loaded_model = FT_wrapper.load('saved_model_wrapper')
print(loaded_model)

# The `save_word2vec_format` method causes the vectors for ngrams to be lost. As a result, a model loaded in this way will behave as a regular word2vec model.
#
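# A hedged sketch (the file name 'saved_model_w2v_format.txt' is arbitrary): exporting in
# the word2vec text format keeps only the full-word vectors, so the reloaded vectors can
# no longer handle out-of-vocabulary words.

# In[*]

from gensim.models import KeyedVectors

loaded_model.wv.save_word2vec_format('saved_model_w2v_format.txt')
word_vectors = KeyedVectors.load_word2vec_format('saved_model_w2v_format.txt')
print('night' in word_vectors.vocab)  # an in-vocabulary word is still available
# word_vectors['nights'] would now raise KeyError, since the ngram vectors were dropped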

# ## Word vector lookup
# **Note:** Operations like word vector lookups and similarity queries can be performed in exactly the same manner for both implementations of fastText, so they are demonstrated here using only the fastText wrapper.
#
# FastText models support vector lookups for out-of-vocabulary words by summing up character ngrams belonging to the word.

# In[*]

print('night' in model_wrapper.wv.vocab)
print('nights' in model_wrapper.wv.vocab)
print(model_wrapper['night'])
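
# A small addition (not in the original notebook): even when 'nights' is out of
# vocabulary, the wrapper can still return a vector for it by summing its character
# ngrams, as Example #5 below also shows.
print(model_wrapper['nights'])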
Example #4
def load(self):
    self.logger.info("Loading model: {}".format(self.model_path))
    self.model = FT_wrapper.load(self.model_path)
    self.logger.info("Model loaded")
Example #5
tstart = time.time()
model_wrapper = FT_wrapper.train(ft_home, data_dir + 'questions_file.txt')
tend = time.time()
print('Time elapsed for training wrapper model %.2f' % (tend - tstart))
print(model_wrapper)

# # saving a model trained via Gensim's fastText implementation
# print('Loading fasttext gensim model...')
# model_gensim.save(output_dir + 'saved_model_gensim')
# loaded_model = FT_gensim.load(output_dir + 'saved_model_gensim')
# print(loaded_model)

# saving a model trained via fastText wrapper
print('Loading fasttext wrapper model...')
model_wrapper.save(output_dir + 'saved_model_wrapper')
loaded_model = FT_wrapper.load(output_dir + 'saved_model_wrapper')
print(loaded_model)

print('night and nights?')
print('night' in model_wrapper.wv.vocab)
print('nights' in model_wrapper.wv.vocab)
print('night vec:')
print(model_wrapper['night'])
print('nights vec:')
print(model_wrapper['nights'])

print('night and nights similarity: %.4f' %
      model_wrapper.similarity("night", "nights"))
print('most similar to nights: %r' % model_wrapper.most_similar("nights"))

print(
Example #6
import time

import numpy as np
from tqdm import tqdm

# logger, get_pdTable and text_cleaner are assumed to come from the surrounding project


def embdReader(embd_path,
               embd_dim,
               word_index,
               max_nb_words,
               fasttext_source='',
               ft_dim=0,
               ft_home='/data2/tonyq/fastText/fasttext',
               output_dir='/data2/tonyq/quora-output/',
               skip_header=False,
               initializer='glorot'):
    ########################################
    ## index word vectors
    ########################################
    if not embd_path == '':
        logger.info('Indexing word vectors...')
        embeddings_index = {}
        with open(embd_path, 'r', encoding='utf8') as f:
            if skip_header or embd_path.endswith('.vec'):
                next(f)
            for line in tqdm(f):
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        logger.info('Found %d word vectors in embedding file.' %
                    len(embeddings_index))

    ########################################
    ## prepare fasttext
    ########################################
    if not fasttext_source == '':
        from gensim.models.wrappers.fasttext import FastText as FT_wrapper
        if fasttext_source.endswith('.bin'):
            # load a previously saved wrapper model; bind it to model_wrapper
            # so the embedding loop below can query it
            model_wrapper = FT_wrapper.load(fasttext_source)
            print(model_wrapper)
        else:
            _, train_question1, train_question2 = get_pdTable(fasttext_source,
                                                              notag=True)
            train_question1, train_maxLen1 = text_cleaner(train_question1)
            train_question2, train_maxLen2 = text_cleaner(train_question2)
            train_data = train_question1 + train_question2
            print('Train data lines %d' % len(train_data))

            with open(output_dir + 'questions_file.txt', 'w') as fw:
                for line in train_data:
                    fw.write(line + '\n')
            print('Text saved to %s' % (output_dir + 'questions_file.txt'))

            # train the model
            print('Training wrapper fasttext model...')
            tstart = time.time()
            model_wrapper = FT_wrapper.train(ft_home,
                                             output_dir + 'questions_file.txt',
                                             size=ft_dim)
            tend = time.time()
            print('Time elapsed for training wrapper model %.2f' %
                  (tend - tstart))
            print(model_wrapper)

            # saving a model trained via fastText wrapper
            print('Saving fasttext wrapper model...')
            model_wrapper.save(output_dir + 'saved_model_wrapper.bin')

    ########################################
    ## prepare embeddings
    ########################################
    logger.info('Preparing embedding matrix based on given word list...')
    nb_words = min(max_nb_words, len(word_index)) + 1

    w2v_oov = 0
    ft_oov = []
    if initializer == 'zero':
        # zero initialization of embedding matrix
        embedding_matrix = np.zeros((nb_words, embd_dim + ft_dim))
    elif initializer == 'glorot':
        # glorot uniform initialization of embedding matrix
        scale = 1 / nb_words  # fan_in
        # scale = 1 / (embd_dim + ft_dim)   # fan_out
        limit = np.sqrt(3. * scale)
        embedding_matrix = np.random.uniform(low=-limit,
                                             high=limit,
                                             size=(nb_words,
                                                   embd_dim + ft_dim))
    else:
        raise NotImplementedError

    reverseDict = [''] * nb_words
    for word, i in tqdm(word_index.items()):
        if not embd_path == '':
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i][:embd_dim] = embedding_vector
                reverseDict[i] = word
            else:
                reverseDict[i] = '<' + word + '>'
                w2v_oov += 1
        if not fasttext_source == '':
            try:
                embedding_matrix[i][embd_dim:] = model_wrapper[word]
                reverseDict[i] = word
            except KeyError:
                reverseDict[i] = '<' + word + '>'
                ft_oov.append(word)

    logger.info('Word embeddings shape: %r (%d+%d)' %
                (embedding_matrix.shape, embd_dim, ft_dim))
    if not embd_path == '':
        logger.info('Word2Vec null embeddings: %d' % w2v_oov)
    if not fasttext_source == '':
        logger.info('FastText null embeddings: %d' % len(ft_oov))
        logger.info('FastText OOV: %r' % ft_oov)
    return embedding_matrix, reverseDict