def generate_fasttext(corpus, text_filepath, emb_path, cbow=False, min_count=2,
                      minn=3, maxn=5, dim=200, epochs=5, lr=.1, neg=5, ws=5):
    os.makedirs(emb_path, exist_ok=True)
    os.makedirs(text_filepath, exist_ok=True)

    # Join token lists back into whitespace-separated strings
    if isinstance(corpus[0], list):
        corpus = [" ".join(i) for i in corpus]

    df = pd.DataFrame()
    df['text'] = corpus
    df.to_csv(os.path.join(text_filepath, 'file.txt'), header=False, index=False)

    if cbow:
        model = fasttext.train_unsupervised(os.path.join(text_filepath, 'file.txt'),
                                            model="cbow", minCount=min_count,
                                            minn=minn, maxn=maxn, dim=dim,
                                            epoch=epochs, lr=lr, ws=ws, neg=neg)
    else:
        model = fasttext.train_unsupervised(os.path.join(text_filepath, 'file.txt'),
                                            minCount=min_count, minn=minn, maxn=maxn,
                                            dim=dim, epoch=epochs, lr=lr, ws=ws, neg=neg)
    # Persist the trained vectors under emb_path
    model.save_model(os.path.join(emb_path, 'ft.bin'))
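# Hypothetical usage sketch for generate_fasttext above; the toy corpus and the
# output directories are illustrative assumptions, not part of the original code.
toy_corpus = [["the", "cat", "sat"], ["the", "dog", "ran"]]
generate_fasttext(toy_corpus, text_filepath="tmp_text", emb_path="tmp_emb",
                  cbow=True, min_count=1, dim=50, epochs=1)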
def learn_embeddings(mode, sentences, dimensions, window_size, workers, iter, ind):
    """
    Jointly learn word-level and fact-level embeddings by optimizing the language model.

    :param ind: the index for each fact
    :param mode: the chosen language model ('skipgram' or 'cbow')
    :param sentences: the sequences sampled by node2vec
    :param dimensions: the number of dimensions
    :param window_size: the window size of the language model
    :param workers: the number of parallel threads
    :param iter: the number of epochs in SGD
    :return: the word-level model (model_W); the fact-level model (model_S) is saved to disk
    """
    np.savetxt(sen_file_path, np.array(sentences), fmt="%s", newline="\n")

    if mode in ("skipgram", "cbow"):
        print(" +++Learning Word-level Embeddings++++")
        wm = ft.train_unsupervised(sen_file_path, model=mode, dim=dimensions)
        wm.save_model(word_model_path + "_" + mode + ".bin")

        print(" +++Learning Fact-level Embeddings++++")
        sent = list(list(map(str, s)) for s in sentences)
        fm = Word2Vec(sent, size=dimensions, window=window_size, min_count=0,
                      sg=1 if mode == "skipgram" else 0, workers=workers, iter=iter)
        fm.wv.save_word2vec_format(fact_embedding_path, binary=False)
        fm.save(fact_model_path + "_" + mode + ".bin")

        # Turn facts into their corresponding nodes
        semantic_to_fact(ind, dimensions)
        return wm
def train_fasttext_model():
    model = ft.train_unsupervised('./twitter_corpora/corpora.txt', model='skipgram', dim=45)
    model.save_model('./fasttext/sk_fasttext.bin')

    model = ft.train_unsupervised('./twitter_corpora/corpora.txt', model='cbow', dim=45)
    model.save_model('./fasttext/cbow_fasttext.bin')
def finetune_model(self, model_type, overwrite=False):
    """
    Trains an unsupervised fastText model on our dataset for the given video metadata
    type and stores it so that it can be used during the training of the Pseudoscience
    Classifier for extracting the embeddings from the input features.

    :param model_type: 'video_snippet', 'video_tags', 'video_transcript', or 'video_comments'
    :param overwrite: whether to retrain and overwrite an existing saved fastText model (if it exists)
    :return:
    """
    # Create the fastText model filename
    fasttext_model_filename = '{0}/fasttext_model_{1}.bin'.format(self.FEATURE_ENGINEERING_MODELS_DIR, model_type)

    if not os.path.isfile(fasttext_model_filename) or overwrite:
        # Train unsupervised fastText model
        model = fasttext.train_unsupervised(
            input='{0}/{1}_train_data.txt'.format(self.DATA_DIR, model_type),
            pretrainedVectors='wiki-news-300d-1M.vec',
            dim=300,
            minn=2,
            maxn=5,
            thread=multiprocessing.cpu_count() - 1,  # run on multiple cores
            verbose=2
        )
        # Save trained model
        model.save_model(fasttext_model_filename)
    return
def embeddings_from_docs(
    in_path,
    out_path,
    fasttext_path=None,
    word_vec_dim=300,
    min_count=5,
    n_epoch=20,
    minn=3,
    maxn=5,
    lr=0.05,
):
    # Read in docs
    with open(in_path, "rb") as f:
        docs = pickle.load(f)

    # Write docs to a temporary *.txt file for fastText to train on
    with open("tmp.txt", "w", encoding="utf-8") as f:
        for doc in docs:
            f.write("\n".join(
                [" ".join([word for word in sen]) for sen in doc.sentences]))

    # Train word embeddings
    model = fasttext.train_unsupervised(
        "tmp.txt",
        dim=word_vec_dim,
        minCount=min_count,
        epoch=n_epoch,
        minn=minn,
        maxn=maxn,
        lr=lr,
    )
    model.save_model(out_path)
def train(self):
    """Update the language model."""
    self.model = None  # drop the old model to free memory
    current_time = datetime.datetime.now()
    file_name = "fasttext_{hash_code}_{year}_{month}_{day}".format(hash_code=abs(hash(current_time)),
                                                                   year=current_time.year,
                                                                   month=current_time.month,
                                                                   day=current_time.day)
    tmp_path = os.path.join(self.tmp_dir, file_name)

    # Build the training corpus
    logger.info("Starting to build corpus for training, tmp file: {}".format(tmp_path))
    with open(tmp_path, "w", encoding="utf-8") as f:
        for doc in self.db[self.collection].find({self.abstract_entry: {"$exists": True, "$ne": None}}):
            tokens = PreTokenize.tokenize(doc.get(self.abstract_entry, ""), True)
            if tokens:
                f.write(" ".join(tokens) + "\n")

    logger.info("Training the model -- Arguments: {}".format(self.training_args))
    model = fasttext.train_unsupervised(input=tmp_path, **self.training_args)
    model.save_model(self.model_path)
    self.model = model  # load the new model

    # Delete the tmp file
    os.remove(tmp_path)
    logger.info("Successfully saved the new model and removed the tmp file")
    self.db.metadata.update_one(
        {"data": "last_word_embedding_trained"},
        {"$set": {"datetime": datetime.datetime.now()}}
    )
def train_word_vectors(input: WordVectorTrainingInput) -> WordVectorTrainingOutput:
    """Trains word vectors via [FastText](https://fasttext.cc) based on a provided text."""
    with NamedTemporaryFile(suffix=".txt", mode="w", encoding="utf-8") as f:
        f.write(input.text)
        f.seek(0)

        model = fasttext.train_unsupervised(
            f.name,
            model=input.model.value,
            lr=input.learning_rate,
            dim=input.dimension,
            epoch=input.epoch,
            minCount=input.min_count,
            loss=input.loss_function,
            thread=1,  # only train with one thread to not block other demos
        )

    with NamedTemporaryFile(suffix=".vec", mode="w+b") as vec_file:
        words = model.get_words()
        for word in words:
            vec_file.write(
                str.encode(
                    word
                    + "".join(" " + str(vi) for vi in model.get_word_vector(word))
                    + "\n"
                )
            )
        vec_file.seek(0)
        return WordVectorTrainingOutput(vector_file=vec_file.read())
def create_model(texts):
    temp_file = "temp.txt"
    with open(temp_file, "w") as f:
        f.write(texts.str.cat(sep='\n'))

    model = fasttext.train_unsupervised(temp_file, minn=2, maxn=5, dim=100)
    os.remove(temp_file)
    return model
def create_fasttext_embedding_matrix(
        file_path: str,
        vocab: typing.Dict[str, int],
        embedding_dim: int) -> np.ndarray:
    """Train a fastText model and return the embedding matrix."""
    model_path = os.path.join(SHARED_PATH, 'embedding_models',
                              f'fasttext_model_dim_{embedding_dim}.bin')
    if os.path.exists(model_path):
        logger.info('Loading fasttext embeddings...')
        model = fasttext.load_model(model_path)
    else:
        logger.info('Training fasttext embeddings...')
        model = fasttext.train_unsupervised(file_path, model='skipgram', dim=embedding_dim)
        model.save_model(model_path)

    embedding_matrix = np.zeros((len(vocab), model.get_dimension()))
    for word, idx in vocab.items():
        if word in model.words:
            embedding_matrix[idx] = model[word]
        # Words without a trained embedding keep a vector of zeros

    return embedding_matrix
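# Hypothetical usage sketch for create_fasttext_embedding_matrix above; it assumes the
# module-level SHARED_PATH points at an existing directory and that corpus.txt and the
# toy vocabulary exist. None of these values come from the original code.
vocab = {'<pad>': 0, 'cat': 1, 'dog': 2}
embedding_matrix = create_fasttext_embedding_matrix('corpus.txt', vocab, embedding_dim=100)
print(embedding_matrix.shape)  # (len(vocab), 100)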
def train_model():
    model = fasttext.train_unsupervised(input="wiki_cut_word.txt", model="skipgram",
                                        ws=6, minn=2, thread=12)
    model.save_model("fasttext.wiki.model.bin")
def train_model(input_filename):
    model = fasttext.train_unsupervised(input_filename, model='skipgram', maxn=0, dim=100, ws=5)
    return model
def train_facebook_fasttext_embedding(data, emb_nm, minn=3, maxn=6, dim=100, epoch=5,
                                      lr=0.05, thread=4, max_vocab_size=200000):
    # Unsupervised training with custom parameters
    emb = fasttext.train_unsupervised(data, minn=minn, maxn=maxn, dim=dim,
                                      epoch=epoch, lr=lr, thread=thread)

    # We only select the max_vocab_size most frequent terms
    # TODO: this should probably be emb.words = emb.words[:max_vocab_size].
    #       Use gensim to change the format and reduce the size.
    # TODO ref: https://medium.com/@vasnetsov93/shrinking-fasttext-embeddings-so-that-it-fits-google-colab-cd59ab75959e
    # del emb.words[max_vocab_size:]

    # Save the trained model
    emb.save_model(emb_nm)
def create_fasttext_model(labels):
    """Runs unsupervised fastText training to build a model from the training set."""
    create_text_file(labels)
    model = fasttext.train_unsupervised('data_raw.txt', model='skipgram', dim=15)
    model.save_model("model_text_raw.bin")
def get_model(model_path: str, train_data_path: str):
    try:
        model = fasttext.load_model(model_path)
    except ValueError:
        model = fasttext.train_unsupervised(train_data_path, model='skipgram')
        model.save_model(model_path)
    return model
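# Hypothetical usage of the load-or-train helper above; the file names are assumptions.
# The first call trains on train.txt and caches the model as model.bin; subsequent calls
# load the cached binary instead of retraining.
model = get_model("model.bin", "train.txt")
print(model.get_nearest_neighbors("fasttext")[:3])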
def fasttext_train_unsupervised(bpe_file, nwords, outf, dim=300, minCount=1, minn=10, maxn=10):
    # Collect the set of tokens that appear in the BPE file
    words = set()
    with open(bpe_file, 'r') as f:
        for line in f:
            for w in line.split():
                if w not in words:
                    words.add(w)

    print('training word vecs by fasttext')
    model = fasttext.train_unsupervised(bpe_file, dim=dim, minCount=minCount, minn=minn, maxn=maxn)
    print('OK! training finished')

    # Look up a vector for each word id; ids missing from the corpus stay zero
    words_vec = np.zeros((nwords, dim))
    for i in range(nwords):
        if str(i) in words:
            words_vec[i, :] = model.get_word_vector(str(i))

    np.savetxt(outf, words_vec, delimiter=',')
    print('OK! word vectors saved to %s' % outf)
def fit(self, config):
    if self.pretrained:
        path = hydra.utils.to_absolute_path(config.word.embedding)
        self.model = fasttext.load_model(path)
    else:
        path = hydra.utils.to_absolute_path(config.data.train_path)
        self.model = fasttext.train_unsupervised(path, dim=self.dimensions)
def train_fasttext(hf_dataset, output_dir):
    """
    Run with:
    $ ./data_cli.py train_fasttext paperswithcode_aspects ./output

    :return:
    """
    tokens_fp = os.path.join(output_dir, 'tokens.txt')
    fasttext_bin_fp = os.path.join(output_dir, 'fasttext.bin')
    fasttext_w2v_fp = os.path.join(output_dir, 'fasttext.w2v.txt')

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Write tokenized text to disk
    doc_delimiter = '\n'
    token_delimiter = ' '
    tokens_count = 0

    with open(tokens_fp, 'w') as f:
        for doc in docs_ds:
            # Extract plain text
            text = doc['title'] + ': ' + doc['abstract']

            for token in gensim.utils.simple_preprocess(text, min_len=2, max_len=15):
                f.write(token + token_delimiter)
                tokens_count += 1
            f.write(doc_delimiter)

    logger.info(f'Total tokens: {tokens_count:,}')

    # Train the actual fastText model
    logger.info('Train fastText model...')
    model = fasttext.train_unsupervised(
        tokens_fp,
        model='skipgram',
        lr=0.05,   # learning rate [0.05]
        dim=300,   # size of word vectors [100]
        ws=5,      # size of the context window [5]
        epoch=5,   # number of epochs [5]
        # thread   # number of threads [number of cpus]
    )
    model.save_model(fasttext_bin_fp)
    del model

    # Convert the binary model to word2vec text format via gensim
    ft_model = FastText.load_fasttext_format(fasttext_bin_fp)
    ft_model.wv.save_word2vec_format(fasttext_w2v_fp)

    logger.info(f'Output saved to: {fasttext_w2v_fp}')
    logger.info('Done')
def train_w2v(sentences, model='skipgram', dim=200, min_count=20, lr=0.015, ws=7,
              minn=3, maxn=6, epoch=20):
    """Train word vectors via ``fasttext.train_unsupervised``.

    Args:
        sentences (list-like): list of raw sentences.
        model (str): model name ('skipgram' or 'cbow').
        dim (int): embedding size. Default is 200.
        min_count (int): filter out words with fewer than ``min_count`` occurrences.
        lr (float): learning rate.
        ws (int): window size.
        minn (int): subword minimum length (default: 3 characters).
        maxn (int): subword maximum length (default: 6 characters).
        epoch (int): number of training epochs.

    Returns:
        ``fasttext.FastText._FastText``
    """
    with tempfile.NamedTemporaryFile(mode='w', prefix='corpus-', suffix='.txt') as f:
        for raw_sentence in sentences:
            f.write(raw_sentence)
            f.write('\n')
        f.flush()  # make sure the buffered corpus is on disk before fastText reads it
        return fasttext.train_unsupervised(input=f.name, model=model, dim=dim,
                                           minCount=min_count, lr=lr, epoch=epoch,
                                           ws=ws, minn=minn, maxn=maxn)
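# Hypothetical usage sketch for train_w2v above; the two example sentences are made up.
sentences = ["the cat sat on the mat", "the dog chased the cat"]
w2v = train_w2v(sentences, model='cbow', dim=50, min_count=1, epoch=5)
print(w2v.get_word_vector("cat").shape)  # (50,)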
def format_data_BRAND(self, blog_file, data_file, data_vec_file):
    data = load_data(data_file)
    n = data['y_h'].shape[0]
    # only_txt = {i: data[i]['txt'] for i in data['data'].keys()}
    # self.dict_to_txt(only_txt, blog_file)

    model = fasttext.train_unsupervised(blog_file, model='skipgram')

    data_vec = {'y': np.zeros(n), 'c': np.zeros(n)}
    x = []
    for tid, y_h in zip(data['data'].keys(), data['y_h']):
        blog = data['data'][tid]['txt'].replace('\n', ' ').decode('utf-8')
        print(blog)
        print('****************************************************')
        x.append(model.get_sentence_vector(blog).flatten())
        data_vec['y'][int(tid)] = np.mean(y_h)
        data_vec['c'][int(tid)] = np.mean((y_h - np.mean(y_h)) ** 2) * 0.01

    plt.plot(data_vec['y'], label='y')
    plt.plot(data_vec['c'], label='c')
    plt.legend()
    plt.show()

    data_vec['x'] = np.array(x)
    save(data_vec, data_vec_file)
def get_parameter_value_with_results(i, param, param_values, params_wordembeddings,
                                     params_training, tune, X_test, y_test):
    print(str(i))
    model_name = "test_" + param + "_" + str(i)
    # bin_path = "word_vectors/fasttext/" + model_name + ".bin"
    vec_path = "word_vectors/fasttext/" + model_name + ".vec"

    if tune == "wordembeddings":  # tuning a parameter of the fastText WORD EMBEDDINGS
        params_wordembeddings[param] = param_values[i]
        embeddings = fasttext.train_unsupervised(input='data.txt', model='skipgram',
                                                 **params_wordembeddings)
        # embeddings.save_model(bin_path)
        # embeddings = load_model(bin_path)
        # Convert the fastText embeddings (would be saved as .bin) to .vec so that the
        # .vec file can be used as pretrainedVectors for fastText text classification.
        from_bin_to_vec(embeddings, vec_path)

    if tune == "training":  # tuning a parameter of the fastText TRAINING
        params_training[param] = param_values[i]

    # The embedding dimension has to match the dimension of the look-up table
    # (embeddings) in the training model.
    params_training["dim"] = embeddings.get_dimension()
    trained_model = fasttext.train_supervised(input=train_file,
                                              pretrainedVectors=vec_path,
                                              **params_training)

    # Find and apply the optimal (threshold) cutoff point:
    # get scores, i.e. the list of probabilities of being labeled positive on X_test
    y_scores = get_prediction_scores(trained_model, X_test)
    # find the optimal probability threshold
    opt_threshold = find_optimal_cutoff(y_test, y_scores)
    # apply the optimal threshold to the prediction probabilities to get label predictions
    y_pred = get_predictions(opt_threshold, y_scores)

    # Evaluation
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_pred)
    auprc = metrics.average_precision_score(y_test, y_pred)
    return [accuracy, precision, recall, auc, auprc]
def train_unsupervised(args):
    # https://fasttext.cc/docs/en/unsupervised-tutorial.html
    model = fasttext.train_unsupervised(args.input,
                                        lr=args.lr,
                                        minCount=args.min_count,
                                        epoch=args.epoch,
                                        minn=args.minn,
                                        maxn=args.maxn,
                                        dim=args.dim,
                                        ws=args.ws)

    if not os.path.isdir(args.output_dir):
        print(f'Creating output directory: {args.output_dir}')
        os.makedirs(args.output_dir)

    model_fname = os.path.join(args.output_dir, 'model.bin')
    print(f'Saving model to: {model_fname}')
    model.save_model(model_fname)

    vec_fname = os.path.join(args.output_dir, f'word-vectors-{args.dim}d.txt')
    print(f'Saving word vectors to: {vec_fname}')
    bin_to_vec(model, vec_fname)

    count_fname = os.path.join(args.output_dir, 'word-counts.txt')
    print(f'Saving word counts to: {count_fname}')
    bin_to_word_count(model, count_fname)
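# Hypothetical argparse wiring for train_unsupervised(args) above; the flag names mirror
# the attributes the function reads (input, output_dir, lr, min_count, epoch, minn, maxn,
# dim, ws), but the defaults shown here are illustrative assumptions.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Train fastText word vectors")
    parser.add_argument('--input', required=True)
    parser.add_argument('--output-dir', dest='output_dir', required=True)
    parser.add_argument('--lr', type=float, default=0.05)
    parser.add_argument('--min-count', dest='min_count', type=int, default=5)
    parser.add_argument('--epoch', type=int, default=5)
    parser.add_argument('--minn', type=int, default=3)
    parser.add_argument('--maxn', type=int, default=6)
    parser.add_argument('--dim', type=int, default=100)
    parser.add_argument('--ws', type=int, default=5)
    return parser.parse_args()

if __name__ == '__main__':
    train_unsupervised(parse_args())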
def train_fasttext(data_dir='./data', dim=300, epoch=5, ft_model='skipgram', ft_lr=0.05, ft_window=5):
    data_dir = Path(data_dir)

    import fasttext

    model = fasttext.train_unsupervised(
        str(data_dir / 'ocb_and_wikisource.w2v_tokens.txt'),
        model=ft_model,
        lr=ft_lr,        # learning rate [0.05]
        dim=dim,         # size of word vectors [100]
        ws=ft_window,    # size of the context window [5]
        epoch=epoch,     # number of epochs [5]
        # thread         # number of threads [number of cpus]
    )
    model.save_model(str(data_dir / 'ocb_and_wikisource.fasttext.bin'))

    from gensim.models.wrappers import FastText

    ft_model = FastText.load_fasttext_format(
        str(data_dir / 'ocb_and_wikisource.fasttext.bin'))
    ft_model.wv.save_word2vec_format(data_dir / 'ocb_and_wikisource.fasttext.w2v.txt')

    logger.info('done')
def w2v_train(self, documents_input, w2v_model_output):
    # Pre-train word vectors and save the model
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
          + ' : create word-segment without label txt')
    documents_cut = 'cache/msg_seg_without_label.txt'
    self.DP.file_cut_words(documents_input, documents_cut, mode='vec')

    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
          + ' : w2v train start')
    # Train a skipgram model and write the word vectors to w2v_model_output:
    # lr is the learning rate, dim the vector size, minCount the minimum word frequency
    model = fasttext.train_unsupervised(documents_cut,
                                        model='skipgram',
                                        lr=0.05,
                                        dim=self.dim,
                                        loss=self.loss,
                                        wordNgrams=self.word_ngrams,
                                        minCount=self.min_count)
    model.save_model(w2v_model_output)
    # os.remove(documents_cut)
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
          + ' : w2v train done')
    return model
def test_as_array_produces_token_array() -> None:
    with tempfile.TemporaryDirectory() as tempdir:
        dataset_filename = f"{tempdir}/dataset.txt"
        pretrained_filename = f"{tempdir}/fasttext.model"

        with open(dataset_filename, "w") as fp:
            fp.write("\n".join(
                ["this is a first sentence", "this is a second sentence"]))

        model = fasttext.train_unsupervised(
            dataset_filename,
            model="skipgram",
            dim=10,
            minCount=1,
        )
        model.save_model(pretrained_filename)

        indexer = FastTextTokenIndexer(pretrained_filename=pretrained_filename)

        tokens = [Token(word) for word in "this is a test sentence".split()]
        field = TextField(tokens, token_indexers={"tokens": indexer})

        vocab = Vocabulary()
        field.index(vocab)

        array_dict = indexer.tokens_to_indices(tokens, vocab)
        assert len(array_dict["tokens"]) == 5
        assert len(array_dict["tokens"][0]) == 10
def generate_embedding(self):
    classifier = fasttext.train_unsupervised(input=self.train_file,
                                             dim=self.vec_dim,
                                             epoch=self.epoch,
                                             minCount=10,
                                             thread=10)
    return self.get_res(classifier)
def train_fasttext(self, data, model_name, epoch):
    if self.is_train:
        model = fasttext.train_unsupervised(data, model='skipgram', minCount=1, epoch=epoch)
        model.save_model(model_name)
def build(data, size, mincount, path):
    """
    Builds fastText vectors from a file.

    Args:
        data: path to input data file
        size: number of vector dimensions
        mincount: minimum number of occurrences required to register a token
        path: path to output file
    """
    # Train on the data file using the requested dimension size
    model = fasttext.train_unsupervised(data, dim=size, minCount=mincount)

    print("Building %d dimension model" % size)

    # Output vectors in vec/txt format
    with open(path + ".txt", "w") as output:
        words = model.get_words()
        output.write("%d %d\n" % (len(words), model.get_dimension()))

        for word in words:
            # Skip end-of-line token
            if word != "</s>":
                vector = model.get_word_vector(word)
                data = ""
                for v in vector:
                    data += " " + str(v)

                output.write(word + data + "\n")

    # Build magnitude vectors database
    print("Converting vectors to magnitude format")
    converter.convert(path + ".txt", path + ".magnitude", subword=True)
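# Hypothetical call to build() above; the corpus path, vector size, and output prefix are
# illustrative assumptions. It writes vectors.txt (word2vec text format) and then converts
# it to vectors.magnitude via pymagnitude's converter.
build("corpus.txt", size=100, mincount=3, path="vectors")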
def generate_fasttext_skipgram(data_file, train_iter, emb_size, output_file):
    model = fasttext.train_unsupervised(input=data_file,
                                        model='skipgram',
                                        dim=emb_size,
                                        minCount=5,
                                        verbose=2,
                                        thread=8)
    model_output = output_file.replace(".txt", ".bin")
    text_output = output_file
    model.save_model(model_output)
    fasttext_to_text.export_to_file(model_output, text_output)
def __generate_embeddings(self, file_path):
    self.printer.print('generating fasttext term embeddings')
    tmp_file = os.path.join(self.args.local_dir, 'tmp')

    with open(tmp_file, 'w', encoding='utf8') as f_out:
        with open(os.path.join(self.args.local_dir, self.args.file_in_qs_train), 'rt', encoding='utf8') as f_in:
            reader = csv.reader(f_in, delimiter='\t')
            for [_, q] in reader:
                f_out.write(q)
                f_out.write('\n')
        with open(os.path.join(self.args.local_dir, self.args.file_in_docs), 'rt', encoding='utf8') as f_in:
            reader = csv.reader(f_in, delimiter='\t')
            for row in reader:
                f_out.write('\n'.join(row[1:]))
                f_out.write('\n')

    self.printer.print('training fasttext term embeddings')
    embeddings = fasttext.train_unsupervised(
        tmp_file,
        model='skipgram',
        dim=self.args.num_hidden_nodes // 2,
        bucket=10000,
        minCount=100,
        minn=1,
        maxn=0,
        ws=10,
        epoch=5)
    embeddings.save_model(file_path)
    os.remove(tmp_file)
def train_fasttext(corpus, cut_func, vocabulary, embedding_dim=300):
    corpus = [' '.join(cut_func(sentence)) for sentence in corpus]
    corpus_file_path = 'fasttext_tmp_corpus.txt'
    with open(corpus_file_path, 'w', encoding='utf8') as writer:
        for sentence in corpus:
            writer.write(sentence + '\n')

    # keep the vector size consistent with the embedding matrix below
    model = train_unsupervised(input=corpus_file_path, model='skipgram', epoch=10,
                               minCount=1, wordNgrams=3, dim=embedding_dim)
    model_vocab = model.get_words()

    emb = np.zeros(shape=(len(vocabulary) + 1, embedding_dim), dtype='float32')
    nb_unk = 0
    for w, i in vocabulary.items():
        if w not in model_vocab:
            nb_unk += 1
            emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
        else:
            emb[i, :] = model.get_word_vector(w)

    print('Logging Info - Fasttext Embedding matrix created: {}, unknown tokens: {}'.format(emb.shape, nb_unk))
    os.remove(corpus_file_path)
    return emb