def fit(self, X, y=None):
    X.to_csv(self.inputFile, index=False)
    corpus_file = datapath(self.inputFile)
    self.model_wrapper = FT_wrapper.train(self.ft_home, self.inputFile,
                                          model=self.model, size=self.size,
                                          word_ngrams=self.word_ngrams)
    return self

def test_sg_hs_against_wrapper(self):
    if self.ft_path is None:
        logger.info("FT_HOME env variable not set, skipping test")
        return

    tmpf = get_tmpfile('gensim_fasttext.tst')
    model_wrapper = FT_wrapper.train(
        ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'),
        output_file=tmpf, model='skipgram', size=50, alpha=0.025, window=5,
        min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0,
        iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12)

    model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=5,
                             hs=1, negative=0, min_count=5, iter=5, batch_words=1000,
                             word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
                             sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    orig0 = np.copy(model_gensim.wv.syn0[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
    self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all())  # vector should vary after training
    self.compare_with_wrapper(model_gensim, model_wrapper)

def load_pretrained_fasttext():
    # Set FastText home to the path to the FastText executable
    ft_home = '/home/dev/fastText/fasttext'
    # Set the file name for the training data (Facebook corpus)
    train_file = config.pos_path
    # model = FastText.load_word2vec_format('/home/dev/wiki.ko.vec')
    model = FastText.train(ft_home, train_file, min_count=1)
    print(model)
    result = model.most_similar(positive=['김승우'])
    print(result)
    return model

def test_sg_hs_against_wrapper(self):
    if self.ft_path is None:
        logger.info("FT_HOME env variable not set, skipping test")
        return

    model_wrapper = FT_wrapper.train(
        ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'),
        output_file=testfile(), model='skipgram', size=50, alpha=0.025, window=5,
        min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0,
        iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12)

    model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=5,
                             hs=1, negative=0, min_count=5, iter=5, batch_words=1000,
                             word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
                             sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    orig0 = np.copy(model_gensim.wv.syn0[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
    self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all())  # vector should vary after training
    self.compare_with_wrapper(model_gensim, model_wrapper)

def train_wikipedia(ft_home, input_path, output_path, iterations=5, min_n=3, max_n=3):
    model = FT_wrapper.train(ft_home, input_path, min_n=min_n, max_n=max_n, iter=iterations)
    model.save(output_path)

                   total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs)
print(model_gensim)

# ### Using wrapper for fastText's C++ code

# In[*]

from gensim.models.wrappers.fasttext import FastText as FT_wrapper

# Set FastText home to the path to the FastText executable
ft_home = '/usr/local/bin/fasttext'

# train the model
model_wrapper = FT_wrapper.train(ft_home, lee_train_file)

print(model_wrapper)

# ### Training hyperparameters

# Hyperparameters for training the model follow the same pattern as Word2Vec.
# FastText supports the following parameters from the original word2vec
# (see the sketch after this list):
# - model: Training architecture. Allowed values: `cbow`, `skipgram` (Default `cbow`)
# - size: Size of embeddings to be learnt (Default 100)
# - alpha: Initial learning rate (Default 0.025)
# - window: Context window size (Default 5)
# - min_count: Ignore words with number of occurrences below this (Default 5)
# - loss: Training objective. Allowed values: `ns`, `hs`, `softmax` (Default `ns`)
# - sample: Threshold for downsampling higher-frequency words (Default 0.001)
# - negative: Number of negative words to sample, for `ns` (Default 5)
# - iter: Number of epochs (Default 5)

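# A minimal sketch passing the hyperparameters listed above explicitly to the
# wrapper. It assumes the `ft_home` and `lee_train_file` variables defined
# earlier in this notebook; the values shown are simply the documented defaults
# and the variable name is illustrative.
model_wrapper_defaults = FT_wrapper.train(
    ft_home, lee_train_file,
    model='cbow',     # training architecture: 'cbow' or 'skipgram'
    size=100,         # dimensionality of the learnt embeddings
    alpha=0.025,      # initial learning rate
    window=5,         # context window size
    min_count=5,      # ignore words occurring fewer than 5 times
    loss='ns',        # training objective: 'ns', 'hs' or 'softmax'
    sample=1e-3,      # downsampling threshold for high-frequency words
    negative=5,       # number of negative samples (used with 'ns' loss)
    iter=5)           # number of training epochs
print(model_wrapper_defaults)
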
        v = np.array(v)
        print(np.shape(v))
        text_len = np.array([len(s) for s in text]).reshape(len(text), 1)
        X = np.concatenate((text_len, v), axis=1)
        print(np.shape(X))
    elif mode == 'ft':
        # --------------------------------------------------------------------
        # FastText
        print('generating fasttext')
        text = [clean_text(s).split() for s in text]
        dim = 200
        model = FastText(size=dim, iter=1)
        # the vocabulary must be built before training on the tokenized sentences
        model.build_vocab(text)
        model.train(text, total_examples=len(text), epochs=model.iter)

        # then calculate word vector per paragraph as the mean of its word vectors
        print('generating paragraph vectors')
        v = []
        for s in text:
            ww = np.zeros((dim))
            n = 0
            for k, w in enumerate(s):
                if w in model.wv:
                    ww += model.wv[w]
                    n += 1
            if n > 0:
                v.append(ww / n)
            else:
                v.append(ww)

def train_fasttext(corpus_file, fasttext_path=None, save="../data/embeddings/", dim=300):
    """
    Input:
        corpus_file: the path to the file that has the embedding training dataset.
        fasttext_path: path to the FastText executable. If not given, we use the
            gensim reimplementation instead.
        save: the directory where the embeddings will be saved.
        dim: number of dimensions for the embeddings.
    Output:
        A file with the embeddings both in gensim format and in word2vec format.
        It also returns the model itself.
    """
    print("Generating embeddings...")

    if fasttext_path is not None:
        # Run this if FastText is installed
        print("FastText wrapper loaded")
        # Set FastText home to the path to the FastText executable
        ft_home = fasttext_path
        print("\nCreating embeddings model...")
        # train the model (the wrapper selects skip-gram via model='skipgram')
        model = FT_wrapper.train(ft_home, corpus_file, model='skipgram', size=dim)
        print("Model created and trained")
    else:
        # Run this if using windows or if FastText is not installed
        print("Gensim implementation loaded")
        print("\nCreating embeddings model...")
        model = FT_gensim(size=dim, sg=1)
        print("Model created")

        # build the vocabulary
        print("\nGenerating vocabulary...")
        model.build_vocab(corpus_file=corpus_file)
        print("Vocabulary generated")

        # train the model
        print("\nTraining embeddings model")
        model.train(corpus_file=corpus_file, epochs=model.epochs,
                    total_examples=model.corpus_count,
                    total_words=model.corpus_total_words)
        print("Model trained:")
        print(model, "\n")

    # saving a model
    if save is not None:
        path = save + "ft_embeddings." + str(dim)
        model.save(path + ".model")
        model.wv.save_word2vec_format(path + ".vec")
        gg = open(path + ".txt", 'w', encoding="utf8")
        for token in model.wv.vocab.keys():
            string = token
            for value in model.wv[token]:
                string += " " + str(value)
            gg.write(string + '\n')
        gg.close()
        print("Embeddings saved\n")

    print("")
    return model

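# Minimal usage sketch for train_fasttext: the corpus path below is a
# placeholder, and fasttext_path=None deliberately exercises the pure-gensim
# branch so no fastText executable is required.
if __name__ == "__main__":
    ft_model = train_fasttext("../data/corpus.txt",   # placeholder corpus path
                              fasttext_path=None,
                              save="../data/embeddings/",
                              dim=300)
    print("Vocabulary size:", len(ft_model.wv.vocab))
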
# print('Training gensim fasttext model...')
# tstart = time.time()
# model_gensim.train(train_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
# tend = time.time()
# print('Time elapsed for training wrapper model %.2f' % (tend - tstart))
# print(model_gensim)

with open(data_dir + 'questions_file.txt', 'w') as fw:
    for line in train_data:
        fw.write(line + '\n')
print('Text saved to %s' % (data_dir + 'questions_file.txt'))

# train the model
print('Training wrapper fasttext model...')
tstart = time.time()
model_wrapper = FT_wrapper.train(ft_home, data_dir + 'questions_file.txt')
tend = time.time()
print('Time elapsed for training wrapper model %.2f' % (tend - tstart))
print(model_wrapper)

# # saving a model trained via Gensim's fastText implementation
# print('Loading fasttext gensim model...')
# model_gensim.save(output_dir + 'saved_model_gensim')
# loaded_model = FT_gensim.load(output_dir + 'saved_model_gensim')
# print(loaded_model)

# saving a model trained via fastText wrapper
print('Loading fasttext wrapper model...')
model_wrapper.save(output_dir + 'saved_model_wrapper')
loaded_model = FT_wrapper.load(output_dir + 'saved_model_wrapper')
print(loaded_model)

def embdReader(embd_path, embd_dim, word_index, max_nb_words,
               fasttext_source='', ft_dim=0,
               ft_home='/data2/tonyq/fastText/fasttext',
               output_dir='/data2/tonyq/quora-output/',
               skip_header=False, initializer='glorot'):
    ########################################
    ## index word vectors
    ########################################
    if not embd_path == '':
        logger.info('Indexing word vectors...')
        embeddings_index = {}
        with open(embd_path, 'r', encoding='utf8') as f:
            if skip_header or embd_path.endswith('.vec'):
                next(f)
            for line in tqdm(f):
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        logger.info('Found %d word vectors in embedding file.' % len(embeddings_index))

    ########################################
    ## prepare fasttext
    ########################################
    if not fasttext_source == '':
        from gensim.models.wrappers.fasttext import FastText as FT_wrapper
        if fasttext_source.endswith('.bin'):
            # load a previously trained wrapper model; assign it to model_wrapper
            # so the embedding lookup below works for this branch as well
            model_wrapper = FT_wrapper.load(fasttext_source)
            print(model_wrapper)
        else:
            _, train_question1, train_question2 = get_pdTable(fasttext_source, notag=True)
            train_question1, train_maxLen1 = text_cleaner(train_question1)
            train_question2, train_maxLen2 = text_cleaner(train_question2)
            train_data = train_question1 + train_question2
            print('Train data lines %d' % len(train_data))

            with open(output_dir + 'questions_file.txt', 'w') as fw:
                for line in train_data:
                    fw.write(line + '\n')
            print('Text saved to %s' % (output_dir + 'questions_file.txt'))

            # train the model
            print('Training wrapper fasttext model...')
            tstart = time.time()
            model_wrapper = FT_wrapper.train(ft_home, output_dir + 'questions_file.txt', size=ft_dim)
            tend = time.time()
            print('Time elapsed for training wrapper model %.2f' % (tend - tstart))
            print(model_wrapper)

            # saving a model trained via fastText wrapper
            print('Loading fasttext wrapper model...')
            model_wrapper.save(output_dir + 'saved_model_wrapper.bin')

    ########################################
    ## prepare embeddings
    ########################################
    logger.info('Preparing embedding matrix based on given word list...')
    nb_words = min(max_nb_words, len(word_index)) + 1
    w2v_oov = 0
    ft_oov = []
    if initializer == 'zero':
        # zero initialization of embedding matrix
        embedding_matrix = np.zeros((nb_words, embd_dim + ft_dim))
    elif initializer == 'glorot':
        # glorot uniform initialization of embedding matrix
        scale = 1 / nb_words  # fan_in
        # scale = 1 / (embd_dim + ft_dim)   # fan_out
        limit = np.sqrt(3. * scale)
        embedding_matrix = np.random.uniform(low=-limit, high=limit, size=(nb_words, embd_dim + ft_dim))
    else:
        raise NotImplementedError

    reverseDict = [''] * nb_words
    for word, i in tqdm(word_index.items()):
        if not embd_path == '':
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i][:embd_dim] = embedding_vector
                reverseDict[i] = word
            else:
                reverseDict[i] = '<' + word + '>'
                w2v_oov += 1
        if not fasttext_source == '':
            try:
                embedding_matrix[i][embd_dim:] = model_wrapper[word]
                reverseDict[i] = word
            except KeyError:
                reverseDict[i] = '<' + word + '>'
                ft_oov.append(word)

    logger.info('Word embeddings shape: %r (%d+%d)' % (embedding_matrix.shape, embd_dim, ft_dim))
    if not embd_path == '':
        logger.info('Word2Vec null embeddings: %d' % w2v_oov)
    if not fasttext_source == '':
        logger.info('FastText null embeddings: %d' % len(ft_oov))
        logger.info('FastText OOV: %r' % ft_oov)
    return embedding_matrix, reverseDict
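
# Hypothetical usage sketch for embdReader: the embedding file path, the tiny
# word_index and max_nb_words below are illustrative placeholders rather than
# values from the original pipeline; fasttext_source is left empty so only the
# plain word-vector branch runs.
word_index = {'the': 1, 'quick': 2, 'brown': 3, 'fox': 4}
embedding_matrix, reverse_dict = embdReader(
    embd_path='/data2/tonyq/glove.840B.300d.txt',  # placeholder embedding file
    embd_dim=300,
    word_index=word_index,
    max_nb_words=200000,
    fasttext_source='',   # skip the fastText branch entirely
    ft_dim=0,
    initializer='glorot')
print(embedding_matrix.shape)   # (len(word_index) + 1, embd_dim + ft_dim)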