def fit(self, X, y=None):
    """Train a fastText model on ``X`` through the external fastText binary.

    ``X`` is first written to ``self.inputFile`` with ``to_csv(index=False)``;
    that file is then passed to ``FT_wrapper.train`` as the training corpus.

    Parameters
    ----------
    X : object exposing ``to_csv(path, index=False)``
        Training data (presumably a pandas DataFrame/Series -- confirm
        against the callers).
    y : ignored
        Not read by this method.

    Returns
    -------
    self
        The trained wrapper model is stored on ``self.model_wrapper``.
    """
    X.to_csv(self.inputFile, index=False)
    # The previous version also computed ``datapath(self.inputFile)`` into a
    # local that was never used; training has always read ``self.inputFile``.
    self.model_wrapper = FT_wrapper.train(self.ft_home,
                                          self.inputFile,
                                          model=self.model,
                                          size=self.size,
                                          word_ngrams=self.word_ngrams)
    return self
示例#2
0
    def test_sg_hs_against_wrapper(self):
        """Compare skip-gram + hierarchical-softmax training with the wrapper.

        Trains one model through the fastText executable (``FT_wrapper``) and
        one with ``FT_gensim`` on the same corpus and hyperparameters, checks
        that training actually changed the first vector, then delegates the
        comparison to ``compare_with_wrapper``.  Returns early (after logging)
        when ``self.ft_path`` is None.
        """
        if self.ft_path is None:
            logger.info("FT_HOME env variable not set, skipping test")
            return

        output_path = get_tmpfile('gensim_fasttext.tst')

        # Settings for the run driven by the external executable.
        wrapper_params = {
            'ft_path': self.ft_path,
            'corpus_file': datapath('lee_background.cor'),
            'output_file': output_path,
            'model': 'skipgram',
            'size': 50,
            'alpha': 0.025,
            'window': 5,
            'min_count': 5,
            'word_ngrams': 1,
            'loss': 'hs',
            'sample': 1e-3,
            'negative': 0,
            'iter': 5,
            'min_n': 3,
            'max_n': 6,
            'sorted_vocab': 1,
            'threads': 12,
        }
        wrapper_model = FT_wrapper.train(**wrapper_params)

        # Matching settings for the FT_gensim model.
        native_params = {
            'size': 50,
            'sg': 1,
            'cbow_mean': 1,
            'alpha': 0.025,
            'window': 5,
            'hs': 1,
            'negative': 0,
            'min_count': 5,
            'iter': 5,
            'batch_words': 1000,
            'word_ngrams': 1,
            'sample': 1e-3,
            'min_n': 3,
            'max_n': 6,
            'sorted_vocab': 1,
            'workers': 1,
            'min_alpha': 0.0,
        }
        native_model = FT_gensim(**native_params)

        sentences = LineSentence(datapath('lee_background.cor'))
        native_model.build_vocab(sentences)

        # Snapshot the first vector so we can tell whether training moved it.
        first_vector_before = np.copy(native_model.wv.syn0[0])
        native_model.train(
            sentences,
            total_examples=native_model.corpus_count,
            epochs=native_model.iter,
        )
        unchanged = (first_vector_before == native_model.wv.syn0[0]).all()
        self.assertFalse(unchanged)  # vector should vary after training

        self.compare_with_wrapper(native_model, wrapper_model)
示例#3
0
def load_pretrained_fasttext():
    """Train a FastText model on ``config.pos_path`` and return it.

    Despite the name, no pretrained vectors are loaded: the model is trained
    by calling ``FastText.train`` with a hard-coded executable path.  The
    model and the neighbours of one probe word are printed along the way.
    """
    # Location of the fastText executable.
    fasttext_binary = '/home/dev/fastText/fasttext'

    # Training corpus.
    corpus_path = config.pos_path

    # Alternative (disabled): load the Facebook-provided vectors instead, e.g.
    # FastText.load_word2vec_format('/home/dev/wiki.ko.vec')
    model = FastText.train(fasttext_binary, corpus_path, min_count=1)
    print(model)

    # Quick sanity probe: nearest neighbours of a single word.
    print(model.most_similar(positive=['김승우']))

    return model
示例#4
0
    def test_sg_hs_against_wrapper(self):
        """Check FT_gensim against the fastText wrapper (skip-gram, hs loss).

        Both models are trained on the same corpus with the same shared
        hyperparameters; the test asserts that training modified the first
        vector and then hands both models to ``compare_with_wrapper``.
        Returns early (after logging) when ``self.ft_path`` is None.
        """
        if self.ft_path is None:
            logger.info("FT_HOME env variable not set, skipping test")
            return

        # Hyperparameters that both implementations take under the same name.
        shared = dict(
            size=50,
            alpha=0.025,
            window=5,
            min_count=5,
            word_ngrams=1,
            sample=1e-3,
            negative=0,
            iter=5,
            min_n=3,
            max_n=6,
            sorted_vocab=1,
        )

        wrapper_model = FT_wrapper.train(
            ft_path=self.ft_path,
            corpus_file=datapath('lee_background.cor'),
            output_file=testfile(),
            model='skipgram',
            loss='hs',
            threads=12,
            **shared
        )

        native_model = FT_gensim(
            sg=1,
            cbow_mean=1,
            hs=1,
            batch_words=1000,
            workers=1,
            min_alpha=0.0,
            **shared
        )

        sentences = LineSentence(datapath('lee_background.cor'))
        native_model.build_vocab(sentences)

        # Keep a copy of the first vector to detect that training changed it.
        initial_vector = np.copy(native_model.wv.syn0[0])
        native_model.train(
            sentences,
            total_examples=native_model.corpus_count,
            epochs=native_model.iter,
        )
        still_identical = (initial_vector == native_model.wv.syn0[0]).all()
        self.assertFalse(still_identical)  # vector should vary after training

        self.compare_with_wrapper(native_model, wrapper_model)
def train_wikipedia(ft_home, input_path, output_path, iterations=5, min_n=3, max_n=3):
    """Train a fastText wrapper model on ``input_path`` and save it.

    ``ft_home`` and ``input_path`` are forwarded positionally to
    ``FT_wrapper.train``; ``iterations`` is passed as ``iter`` and
    ``min_n`` / ``max_n`` under their own names.  The trained model is
    written to ``output_path`` via its ``save`` method; nothing is returned.
    """
    ngram_bounds = {'min_n': min_n, 'max_n': max_n}
    trained = FT_wrapper.train(ft_home, input_path, iter=iterations, **ngram_bounds)
    trained.save(output_path)
示例#6
0
                   total_examples=model_gensim.corpus_count,
                   epochs=model_gensim.epochs)

# Show the model trained in the preceding cell.
print(model_gensim)

# ### Using wrapper for fastText's C++ code

# In[*]

# The wrapper class is aliased to FT_wrapper for the rest of this script.
from gensim.models.wrappers.fasttext import FastText as FT_wrapper

# Set FastText home to the path to the FastText executable
# (hard-coded; adjust to where the binary lives on this machine).
ft_home = '/usr/local/bin/fasttext'

# Train the model on lee_train_file (defined earlier in the script).
# Only the executable path and the corpus are passed here.
model_wrapper = FT_wrapper.train(ft_home, lee_train_file)

print(model_wrapper)

# ### Training hyperparameters

# Hyperparameters for training the model follow the same pattern as Word2Vec. FastText supports the following parameters from the original word2vec:
#      - model: Training architecture. Allowed values: `cbow`, `skipgram` (Default `cbow`)
#      - size: Size of embeddings to be learnt (Default 100)
#      - alpha: Initial learning rate (Default 0.025)
#      - window: Context window size (Default 5)
#      - min_count: Ignore words with number of occurrences below this (Default 5)
#      - loss: Training objective. Allowed values: `ns`, `hs`, `softmax` (Default `ns`)
#      - sample: Threshold for downsampling higher-frequency words (Default 0.001)
#      - negative: Number of negative words to sample, for `ns` (Default 5)
#      - iter: Number of epochs (Default 5)
示例#7
0
    v = np.array(v)

    print(np.shape(v))
    text_len = np.array([len(s) for s in text]).reshape(len(text), 1)
    X = np.concatenate((text_len, v), axis=1)

    print(np.shape(X))
elif mode == 'ft':
    # --------------------------------------------------------------------
    # FastText
    print('generating fasttext')
    text = [clean_text(s).split() for s in text]
    dim = 200
    model = FastText(size=dim, iter=1)
    #model.build_vocab(text)
    model.train(text)
    # then calculate word vector per paragraph
    print('generating paragraph vectors')
    v = []
    for s in text:
        ww = np.zeros((dim))
        n = 0
        for k, w in enumerate(s):
            if w in model.wv:
                ww += model.wv[w]
                n += 1
        if n > 0:
            v.append(ww / n)
        else:
            v.append(ww)
示例#8
0
def train_fasttext(corpus_file,
                   fasttext_path=None,
                   save="../data/embeddings/",
                   dim=300):
    """
    Train skip-gram FastText embeddings and optionally write them to disk.

    Input:
        corpus_file:
            the path to the file that has the embedding training dataset.
        fasttext_path:
            path to the FastText executable. If not given, we use the gensim
            reimplementation instead.
        save:
            the directory where the embeddings will be saved, or None to
            skip saving.
        dim:
            number of dimensions for the embeddings.

    Output:
        The trained model. When ``save`` is not None, three files named
        ``ft_embeddings.<dim>`` are written into it: ``.model`` (the model's
        own ``save`` format), ``.vec`` (word2vec format) and ``.txt`` (one
        line per vocabulary token: the token followed by its vector values,
        separated by spaces).

    """
    import os

    print("Generating embeddings...")

    if fasttext_path is not None:
        # Run this if FastText is installed
        print("FastText wrapper loaded")

        print("\nCreating embeddings model...")

        # The wrapper selects the architecture through ``model`` (it has no
        # ``sg`` keyword); 'skipgram' matches ``sg=1`` in the gensim branch.
        model = FT_wrapper.train(fasttext_path,
                                 corpus_file,
                                 model='skipgram',
                                 size=dim)

        print("Model created and trained")

    else:
        # Run this if using windows or if FastText is not installed
        print("Gensim implementation loaded")

        print("\nCreating embeddings model...")
        model = FT_gensim(size=dim, sg=1)
        print("Model created")

        # build the vocabulary
        print("\nGenerating vocabulary...")
        model.build_vocab(corpus_file=corpus_file)
        print("Vocabulary generated")

        # train the model
        print("\nTraining embeddings model")
        model.train(corpus_file=corpus_file,
                    epochs=model.epochs,
                    total_examples=model.corpus_count,
                    total_words=model.corpus_total_words)
        print("Model trained:")

    print(model, "\n")

    # saving a model
    if save is not None:
        # os.path.join tolerates a directory given with or without a
        # trailing separator; the default value yields the same path as before.
        path = os.path.join(save, "ft_embeddings." + str(dim))
        model.save(path + ".model")

        model.wv.save_word2vec_format(path + ".vec")

        # ``with`` guarantees the file is closed even if a lookup fails.
        with open(path + ".txt", 'w', encoding="utf8") as txt_file:
            for token in model.wv.vocab:
                fields = [token]
                fields.extend(str(value) for value in model.wv[token])
                txt_file.write(" ".join(fields) + '\n')

        print("Embeddings saved\n")

    print("")

    return model
示例#9
0
# --- Disabled: timing of the gensim-native FastText training ---------------
# print('Training gensim fasttext model...')
# tstart = time.time()
# model_gensim.train(train_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
# tend = time.time()
# print('Time elapsed for training wrapper model %.2f' % (tend - tstart))
# print(model_gensim)

# Dump the training sentences to a plain-text file, one per line, so the
# file path can be handed to FT_wrapper.train below.
with open(data_dir + 'questions_file.txt', 'w') as fw:
    for line in train_data:
        fw.write(line + '\n')
print('Text saved to %s' % (data_dir + 'questions_file.txt'))

# Train the wrapper model on the file written above and report the
# wall-clock time taken.
print('Training wrapper fasttext model...')
tstart = time.time()
model_wrapper = FT_wrapper.train(ft_home, data_dir + 'questions_file.txt')
tend = time.time()
print('Time elapsed for training wrapper model %.2f' % (tend - tstart))
print(model_wrapper)

# --- Disabled: save/load round trip for the gensim-native model ------------
# # saving a model trained via Gensim's fastText implementation
# print('Loading fasttext gensim model...')
# model_gensim.save(output_dir + 'saved_model_gensim')
# loaded_model = FT_gensim.load(output_dir + 'saved_model_gensim')
# print(loaded_model)

# Save the wrapper-trained model, then load it back from the same path.
# NOTE(review): the message below says 'Loading' although the next statement
# saves first; the load happens one line later.
print('Loading fasttext wrapper model...')
model_wrapper.save(output_dir + 'saved_model_wrapper')
loaded_model = FT_wrapper.load(output_dir + 'saved_model_wrapper')
print(loaded_model)
示例#10
0
def embdReader(embd_path,
               embd_dim,
               word_index,
               max_nb_words,
               fasttext_source='',
               ft_dim=0,
               ft_home='/data2/tonyq/fastText/fasttext',
               output_dir='/data2/tonyq/quora-output/',
               skip_header=False,
               initializer='glorot'):
    """Build the embedding matrix and index-to-word list for ``word_index``.

    Every row has ``embd_dim + ft_dim`` columns.  The first ``embd_dim``
    columns are filled from the text vector file ``embd_path``; the remaining
    columns are filled from a FastText wrapper model.  Row parts for which no
    vector is found keep their initial values.

    Args:
        embd_path: Text file with one ``word v1 v2 ...`` entry per line.
            ``''`` disables this source.
        embd_dim: Number of leading columns reserved for ``embd_path`` vectors.
        word_index: Mapping from word to integer row index.
        max_nb_words: Vocabulary cap; the matrix has
            ``min(max_nb_words, len(word_index)) + 1`` rows.  Words whose
            index does not fit in the matrix are skipped.
        fasttext_source: ``''`` disables FastText.  A path ending in ``.bin``
            is loaded with ``FT_wrapper.load``.  Any other value is passed to
            ``get_pdTable`` to obtain the questions used to train a new model,
            which is then saved under ``output_dir``.
        ft_dim: Number of trailing columns reserved for FastText vectors; also
            used as ``size`` when a model is trained here.
        ft_home: Path of the fastText executable, used only when training.
        output_dir: Prefix for the files written while training.  It is
            string-concatenated with the file names, so it should end with a
            path separator.
        skip_header: Skip the first line of ``embd_path``.  The first line is
            always skipped when ``embd_path`` ends with ``.vec``.
        initializer: ``'zero'`` or ``'glorot'`` (uniform in ``[-limit, limit]``
            with ``limit = sqrt(3 / nb_words)``).

    Returns:
        Tuple ``(embedding_matrix, reverseDict)``.  ``reverseDict[i]`` is the
        word at row ``i`` when its vector was found and ``'<word>'`` when it
        was not; with both sources enabled the FastText lookup, which runs
        last, decides the entry.  Unvisited rows hold ``''``.

    Raises:
        NotImplementedError: If ``initializer`` is not a supported value.
    """
    use_embd = embd_path != ''
    use_ft = fasttext_source != ''

    ########################################
    ## index word vectors
    ########################################
    if use_embd:
        logger.info('Indexing word vectors...')
        embeddings_index = {}
        with open(embd_path, 'r', encoding='utf8') as f:
            if skip_header or embd_path.endswith('.vec'):
                next(f)
            for line in tqdm(f):
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        logger.info('Found %d word vectors in embedding file.' %
                    len(embeddings_index))

    ########################################
    ## prepare fasttext
    ########################################
    if use_ft:
        from gensim.models.wrappers.fasttext import FastText as FT_wrapper
        if fasttext_source.endswith('.bin'):
            # Bind the loaded model to the same name the lookup loop reads;
            # it used to be stored under a different name, which made the
            # loop fail on an unbound ``model_wrapper``.
            model_wrapper = FT_wrapper.load(fasttext_source)
            print(model_wrapper)
        else:
            _, train_question1, train_question2 = get_pdTable(fasttext_source,
                                                              notag=True)
            train_question1, _ = text_cleaner(train_question1)
            train_question2, _ = text_cleaner(train_question2)
            train_data = train_question1 + train_question2
            print('Train data lines %d' % len(train_data))

            questions_path = output_dir + 'questions_file.txt'
            # Written as UTF-8, matching how the vector file is read above.
            with open(questions_path, 'w', encoding='utf8') as fw:
                fw.writelines(line + '\n' for line in train_data)
            print('Text saved to %s' % questions_path)

            # train the model
            print('Training wrapper fasttext model...')
            tstart = time.time()
            model_wrapper = FT_wrapper.train(ft_home,
                                             questions_path,
                                             size=ft_dim)
            tend = time.time()
            print('Time elapsed for training wrapper model %.2f' %
                  (tend - tstart))
            print(model_wrapper)

            # saving a model trained via fastText wrapper
            print('Saving fasttext wrapper model...')
            model_wrapper.save(output_dir + 'saved_model_wrapper.bin')

    ########################################
    ## prepare embeddings
    ########################################
    logger.info('Preparing embedding matrix based on given word list...')
    nb_words = min(max_nb_words, len(word_index)) + 1
    matrix_shape = (nb_words, embd_dim + ft_dim)

    w2v_oov = 0
    ft_oov = []
    if initializer == 'zero':
        # zero initialization of embedding matrix
        embedding_matrix = np.zeros(matrix_shape)
    elif initializer == 'glorot':
        # glorot uniform initialization of embedding matrix
        scale = 1 / nb_words  # fan_in
        # scale = 1 / (embd_dim + ft_dim)   # fan_out
        limit = np.sqrt(3. * scale)
        embedding_matrix = np.random.uniform(low=-limit,
                                             high=limit,
                                             size=matrix_shape)
    else:
        raise NotImplementedError(
            'Unsupported initializer: %r' % (initializer,))

    reverseDict = [''] * nb_words
    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            # Index beyond the max_nb_words cap: there is no row for it, and
            # writing to it would raise IndexError.
            continue
        if use_embd:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i][:embd_dim] = embedding_vector
                reverseDict[i] = word
            else:
                reverseDict[i] = '<' + word + '>'
                w2v_oov += 1
        if use_ft:
            try:
                embedding_matrix[i][embd_dim:] = model_wrapper[word]
                reverseDict[i] = word
            except KeyError:
                reverseDict[i] = '<' + word + '>'
                ft_oov.append(word)

    logger.info('Word embeddings shape: %r (%d+%d)' %
                (embedding_matrix.shape, embd_dim, ft_dim))
    if use_embd:
        logger.info('Word2Vec null embeddings: %d' % w2v_oov)
    if use_ft:
        logger.info('FastText null embeddings: %d' % len(ft_oov))
        logger.info('FastText OOV: %r' % ft_oov)
    return embedding_matrix, reverseDict