import os
import sys
import pickle


def train_encoder(name_of_data, sentences, max_epochs=5, save_frequency=1000):
    if not os.path.exists('data/'):
        os.makedirs('data')
    sys.path.insert(0, 'training/')
    import vocab

    # Build and save the vocabulary, and keep a pickled copy of the corpus.
    worddict, wordcount = vocab.build_dictionary(sentences)
    vocab.save_dictionary(worddict, wordcount,
                          'data/' + name_of_data + '_dictionary.pkl')
    with open('data/' + name_of_data + '_sen.p', 'w') as f:
        pickle.dump(sentences, f)

    # Patch the reference training script with our paths and hyperparameters,
    # then import and run the patched copy.
    with open('training/train.py', 'r') as f:
        text = f.read()
    text = text.replace('max_epochs=5', 'max_epochs=' + str(max_epochs))
    text = text.replace("saveto='/u/rkiros/research/semhash/models/toy.npz'",
                        "saveto='data/" + name_of_data + "_encoder.npz'")
    text = text.replace("dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl'",
                        "dictionary='data/" + name_of_data + "_dictionary.pkl'")
    text = text.replace('n_words=20000', 'n_words=' + str(len(wordcount)))
    text = text.replace('saveFreq=1000', 'saveFreq=' + str(save_frequency))
    with open('training/train_temp.py', 'w') as g:
        g.write(text)

    import train_temp
    train_temp.trainer(sentences)
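# A minimal usage sketch for train_encoder (hypothetical corpus path and name;
# the function expects a list of raw sentence strings):
#
#   sentences = [line.strip() for line in open('corpus.txt')]
#   train_encoder('my_corpus', sentences, max_epochs=5, save_frequency=1000)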
import os

import vocab
import train
# get_file_list and load_txt_sent are helper functions defined elsewhere in
# this script.


def main(data_path, dict_path, save_path, batch_size, reload_, reload_path):
    os.environ["THEANO_FLAGS"] = "floatX=float32"
    file_names = get_file_list(data_path, ['txt'])
    train_sent = load_txt_sent(file_names)

    if not os.path.exists(dict_path):
        print "Dictionary not found, recreating"
        worddict, wordcount = vocab.build_dictionary(train_sent)
        print "Built. Saving to: {}".format(dict_path)
        vocab.save_dictionary(worddict, wordcount, dict_path)
    else:
        print "Found dictionary at {}... Loading...".format(dict_path)
        worddict = vocab.load_dictionary(dict_path)

    print "Beginning Training..."
    train.trainer(train_sent, batch_size=batch_size, reload_=reload_,
                  dictionary=dict_path, saveto=save_path,
                  reload_path=reload_path, saveFreq=10000)
import os
import sys


def train_decoder(name_of_data, sentences, model, p, max_epochs=5,
                  save_frequency=1000, n_words=20000, maxlen_w=30,
                  reload_=False):
    if not os.path.exists('data/'):
        os.makedirs('data')
    sys.path.insert(1, 'decoding/')
    import vocab
    reload(vocab)

    # Build and save the vocabulary for the decoder.
    worddict, wordcount = vocab.build_dictionary(sentences, n_words)
    vocab.save_dictionary(worddict, wordcount,
                          'data/' + name_of_data + '_dictionary.pkl')

    # Patch the reference decoding script with our paths and hyperparameters,
    # then import (or reload) and run the patched copy.
    with open('decoding/train.py', 'r') as f:
        text = f.read()
    text = text.replace('max_epochs=5', 'max_epochs=' + str(max_epochs))
    text = text.replace("saveto='/u/rkiros/research/semhash/models/toy.npz'",
                        "saveto='data/" + name_of_data + "_decoder.npz'")
    text = text.replace("dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl'",
                        "dictionary='data/" + name_of_data + "_dictionary.pkl'")
    text = text.replace('n_words=40000', 'n_words=' + str(len(wordcount)))
    text = text.replace('saveFreq=1000', 'saveFreq=' + str(save_frequency))
    with open('decoding/train_temp.py', 'w') as g:
        g.write(text)

    import train_temp
    reload(train_temp)
    return train_temp.trainer(sentences, sentences, model, p,
                              maxlen_w=maxlen_w, reload_=reload_)
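# A minimal usage sketch for train_decoder (hypothetical corpus name; `model`
# is a loaded skip-thoughts encoder, and `p` is passed straight through to the
# decoder trainer, e.g. previously trained parameters):
#
#   import skipthoughts
#   sentences = [line.strip() for line in open('corpus.txt')]
#   skmodel = skipthoughts.load_model()
#   train_decoder('my_corpus', sentences, skmodel, p)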
def main():
    parser = argparse.ArgumentParser(
        description='Pass target style genre to train decoder')
    parser.add_argument('-s', '--style_genre',
                        help='the name of style corpus', required=True)
    flag = parser.parse_args()
    style_corpus_path = "/media/VSlab3/kuanchen_arxiv/artistic_style_corpora/{}".format(
        flag.style_genre)
    style_genre = flag.style_genre.split(".")[0]

    # Source and target sentences are the same corpus: the decoder is trained
    # to reconstruct the style corpus from its skip-thought vectors.
    X = []
    with open(style_corpus_path, 'r') as handle:
        for line in handle:
            X.append(line.strip())
    C = X

    if not os.path.isfile("./vocab_save/{}.pkl".format(style_genre)):
        print "Get vocabulary..."
        worddict, wordcount = vocab.build_dictionary(X)
        vocab.save_dictionary(worddict=worddict, wordcount=wordcount,
                              loc="vocab_save/{}.pkl".format(style_genre))

    savepath = "./logs_{}".format(style_genre)
    if not os.path.exists(savepath):
        os.mkdir(savepath)

    skmodel = skipthoughts.load_model()
    train.trainer(X, C, skmodel,
                  dictionary="vocab_save/{}.pkl".format(style_genre),
                  savepath=savepath, saveto="model.npz")
# args, download_model, load_text, skipthoughts, vocab and nltk come from
# earlier in this script (omitted here).
target_name = args.targe_text.split("/")[-1].split(".")[0]  # e.g. "speeches.txt" -> "speeches"
download_model()
print("Loading Skip-Vector Model...")
skmodel = skipthoughts.load_model()
print("Done!")

"""
Step 1: Generating dictionary for the target text.
"""
print("Generating dictionary for the target text...")
X = load_text(args.targe_text)
worddict, wordcount = vocab.build_dictionary(X)
vocab.save_dictionary(worddict, wordcount,
                      './%s/%s_dict.pkl' % (target_name, target_name))
print("Done! Saved dictionary under ./%s/ as %s_dict.pkl" % (target_name, target_name))

"""
Step 2: Generating style vector for the target text.
"""
print("Generating style vector for the target text...")
nltk.download('punkt')  # Tokenizer models used by the skipthoughts encoder.
print("The length of X is:")
print(len(X))
skip_vector = skipthoughts.encode(skmodel, X)
style_vector = skip_vector.mean(0)  # Mean over axis 0: the average of all sentence vectors.
#np.save('./target_style/%s_style.npy'%target_name, style_vector)
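# A hedged follow-up sketch (assumption, not part of the original script;
# numpy assumed imported as np): the averaged skip-thought vector is the
# target's style embedding and can be saved alongside the Step 1 dictionary:
#
#   np.save('./%s/%s_style.npy' % (target_name, target_name), style_vector)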
""" all_sent = [] for txt_file in flist_txt: print "Reading file: {}".format(txt_file) with open(txt_file, 'r') as f: data = f.read() sent = data.split('\n') all_sent += sent print "File loading complete. Cleaning..." #all_sent = map(clean_string, all_sent) return all_sent if __name__ == "__main__": os.environ["THEANO_FLAGS"] = "floatX=float32" file_names = get_file_list(data_path, ['txt']) train_sent = load_txt_sent(file_names) if not os.path.exists(dict_path): print "Dictionary not found, recreating" worddict, wordcount = vocab.build_dictionary(train_sent) print "Built. Saving to: {}".format(dict_path) vocab.save_dictionary(worddict, wordcount, dict_path) else: print "Found dictionary at {}... Loading...".format(dict_path) worddict = vocab.load_dictionary(dict_path) print "Beginning Training..." train.trainer(train_sent, n_words=20000, dim=2400, batch_size=128, reload_=False, dictionary=dict_path, saveto=save_path)
# coding: utf-8
import vocab
import train
import tools
import numpy as np

with open("../../wikipedia_txt/result_wakati.txt") as f:
    fdata = [line.rstrip() for line in f]
print '# lines: ', len(fdata)

worddict, wordcount = vocab.build_dictionary(fdata)
vocab.save_dictionary(worddict, wordcount, "word_dict")
print '# vocab: ', len(worddict)

train.trainer(fdata, dictionary="word_dict", saveFreq=100,
              saveto="model", reload_=True, n_words=40000)

model = tools.load_model()
vectors = tools.encode(model, fdata, use_norm=False)
np.savez('vecs.npz', vectors)
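# A small follow-up sketch (assumption, not in the original script): np.savez
# stores an unnamed array under the default key 'arr_0', so the encoded
# sentence vectors can be reloaded like this:
#
#   data = np.load('vecs.npz')
#   vectors = data['arr_0']  # shape: (len(fdata), vector_dim)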
# FILES, file_counter, X, base_path_to_directory, path_to_word2vec and
# SENTENCE_EMBEDDING_FOLDER are defined earlier in this script (omitted here).
for f in FILES:
    file_counter += 1
    with open(f) as file_descriptor:
        file_content = file_descriptor.read().decode("utf-8", "ignore")
    file_content = sent_tokenize(file_content)
    for sentence in file_content:
        if sentence:
            X.append(sentence.strip())

sentence_embeddings = np.empty([file_counter, 4800])
loc = base_path_to_directory + "dictionary.pkl"  # Where the dictionary is saved.
saveto = base_path_to_directory + "toy.npz"      # Where the model is saved.
maxlen_w = 70

worddict, wordcount = vocab.build_dictionary(X)
vocab.save_dictionary(worddict, wordcount, loc)
# In train.py set 1) the dictionary path, 2) saveto (where the model is
# saved) and 3) maxlen_w.
train.trainer(X, dictionary=loc, saveto=saveto, maxlen_w=maxlen_w)

# In tools.py set path_to_model=saveto, path_to_dictionary=loc and
# path_to_word2vec.
embed_map = tools.load_googlenews_vectors(path_to_word2vec)
model = tools.load_model(embed_map)

if not os.path.exists(SENTENCE_EMBEDDING_FOLDER):
    os.mkdir(SENTENCE_EMBEDDING_FOLDER)
for f in FILES:
    with open(f) as file_descriptor:
        file_content = sent_tokenize(file_descriptor.read())
    document_embedding = tools.encode(model, file_content, verbose=False)
    document_embedding = np.average(document_embedding, axis=0)
    file_name = f.split('/')[-1]
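# A hedged completion sketch (assumption; the original snippet ends above):
# each averaged document embedding would presumably be written out under
# SENTENCE_EMBEDDING_FOLDER, e.g.:
#
#   np.save(os.path.join(SENTENCE_EMBEDDING_FOLDER, file_name + '.npy'),
#           document_embedding)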
max_w = 10
saveF = 1000
batch = 128
clen = 6

# reload_, N, max_e, dispF and the *_path variables are defined earlier in
# this script (omitted here).
if not reload_:
    # Load the data and put it in a list.
    with open(data_path, 'r') as f:
        X = f.read().splitlines()
    # Preprocess.
    X = preprocess.prepareentitylist(X, stop_path, clen)
    # Store the preprocessed data for future runs.
    with open(proc_data_path, 'w') as f:
        for item in X:
            f.write('%s\n' % item)
else:
    with open(proc_data_path, 'r') as f:
        X = f.read().splitlines()

# Subset.
X = X[:N]

# Build the dictionary.
worddict, wordcount = vocab.build_dictionary(X)
vocab.save_dictionary(worddict, wordcount, dict_path)

# Train.
train.trainer(X, saveto=out_path, dictionary=dict_path, saveFreq=saveF,
              max_epochs=max_e, dispFreq=dispF, maxlen_w=max_w,
              batch_size=batch)
import vocab

X = []
with open("/home/jm7432/big/Romance/romance-final.txt", "r") as f:
    for line in f:
        # Keep non-empty lines that are not chapter or part headings.
        if len(line.strip()) > 0 and "chapter" not in line and "part" not in line:
            X.append(line)

# Drop the first and last 50 lines (e.g. front and back matter).
l = len(X)
Y = X[50:l - 50]

worddict, wordcount = vocab.build_dictionary(Y)
vocab.save_dictionary(
    worddict, wordcount,
    '/home/jm7432/tell-tall-tales/decoding/romance_dict_final.pkl')
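# A hedged follow-up sketch (assumption, mirroring the decoder-training calls
# in the other snippets here): the saved dictionary would then be passed to
# the decoder trainer together with a loaded skip-thoughts model.
#
#   import skipthoughts, train
#   skmodel = skipthoughts.load_model()
#   train.trainer(Y, Y, skmodel,
#                 dictionary='/home/jm7432/tell-tall-tales/decoding/romance_dict_final.pkl',
#                 saveto='model.npz')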