file = expdir + "/" + file
os.remove(file)
os.removedirs(expdir)
print('ERROR: expdir already exists')
# exit(-1)

# tf.set_random_seed(int(time.time() * 1000))
tf.compat.v1.set_random_seed(int(time.time() * 1000))

params = helper.GetParams(args.params, 'train', args.expdir)

logging.basicConfig(filename=os.path.join(expdir, 'logfile.txt'),
                    level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())

df = LoadData(args.data)
char_vocab = Vocab.MakeFromData(df.query_, min_count=10)
char_vocab.Save(os.path.join(args.expdir, 'char_vocab.pickle'))
params.vocab_size = len(char_vocab)

user_vocab = Vocab.MakeFromData([[u] for u in df.user], min_count=15)
user_vocab.Save(os.path.join(args.expdir, 'user_vocab.pickle'))
params.user_vocab_size = len(user_vocab)

dataset = Dataset(df, char_vocab, user_vocab, max_len=params.max_len,
                  batch_size=params.batch_size)

val_df = LoadData(args.valdata)
valdata = Dataset(val_df, char_vocab,
if __name__ == '__main__':
    optimizer = {
        'sgd': tf.train.GradientDescentOptimizer,
        'adam': tf.train.AdamOptimizer,
        'ada': tf.train.AdagradOptimizer,
        'adadelta': tf.train.AdadeltaOptimizer
    }[args.optimizer]

    mLow = DynamicModel(args.expdir, learning_rate=args.learning_rate,
                        threads=args.threads, optimizer=optimizer)

    df = LoadData(args.data)
    users = df.groupby('user')
    avg_time = MovingAvg(0.95)

    stop = '</S>'  # decide if we stop at first space or not
    if args.partial:
        stop = ' '

    counter = 0
    for user, grp in users:
        grp = grp.sort_values('date')
        mLow.session.run(mLow.reset_user_embed)
        for i in range(len(grp)):
            row = grp.iloc[i]
            query = ''.join(row.query_[1:-1])
def main():
    args = get_cmd_args()
    args.data_path = args.directory + 'PKL/{}_train_clean.pickle'.format(
        args.dataset)
    args.target_path = args.directory + 'PKL/{}_clean_data_indexs.pickle'.format(
        args.dataset)
    args.vocab_path = args.directory + 'PKL/{}_clean_vocab.pickle'.format(
        args.dataset)
    args.model_dir = args.directory + 'Model/char/torch/{}.model'.format(
        args.model_name)

    log_file = args.directory + 'log/' + args.model_name + '_debug.log'
    logging.basicConfig(filename=log_file, filemode='w', level=logging.DEBUG)

    # test
    loaddata = LoadData(args.batch, args.data_path, args.target_path,
                        args.vocab_path, args.tokenizer, args.n_features)
    args.inp_dim = args.out_dim = len(loaddata.vocab)
    args.max_len = loaddata.max_length
    args.vocab = loaddata.vocab

    task = Train(args.inp_dim, args.out_dim, args.embedding_dim,
                 args.enc_units, args.dec_units, args.dropout, args.dropout,
                 args.epoch, args.clip, args.sparse_max, args.tf,
                 args.max_len, args.vocab, args.batch, device)

    if args.mode == 'train':
        logging.info('start training...')
        results = task.start_train(loaddata.train, loaddata.valid,
                                   args.model_dir)
        for k, v in results.items():
            print('{0}: {1}'.format(k, v))
            logging.info('{0}: {1}'.format(k, v))

        extension = '_' + str(args.model_name)
        save_path = args.directory + 'results/'
        # plot accuracy
        plot('accuracy' + extension, 'epochs', 'accuracy',
             results['train_acc'], results['val_acc'],
             'train accuracy', 'validation accuracy', save_path=save_path)
        # plot loss
        plot('loss' + extension, 'epochs', 'loss',
             results['train_loss'], results['val_loss'],
             'train loss', 'validation loss', save_path=save_path)
        # plot wer
        plot('wer' + extension, 'epochs', 'wer',
             results['wer_ocr'], results['wer_after'],
             'wer ocr', 'val wer', save_path=save_path)
        # plot cer
        plot('cer' + extension, 'epochs', 'cer',
             results['cer_ocr'], results['cer_after'],
             'cer ocr', 'val cer', save_path=save_path)
    else:
        logging.info('start testing...')
        # sent_clean = 'Mohren plagen uns ohne aufhörlich'
        # sent_res = task.translate_sent(loaddata, sent_clean)
        sent_out = task.test(loaddata, loaddata.valid, args.model_dir)

        output_file = args.directory + 'log/' + args.model_name + '_output.txt'
        print('Saving to {0}\n'.format(output_file))
        with open(output_file, 'w', encoding='utf-8') as f:
            for sent_pair in sent_out:
                f.write(sent_pair[0] + ',' + sent_pair[1] + '\n')
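# The plot() helper called above is defined elsewhere in the repo and is not
# shown here. A minimal sketch that is consistent with how it is called
# (figure name, axis labels, two per-epoch series, two legend labels,
# save_path) might look like the following; the real helper's signature and
# styling may differ.
import os
import matplotlib
matplotlib.use('Agg')  # non-interactive backend; an assumption, not from the repo
import matplotlib.pyplot as plt


def plot(name, xlabel, ylabel, series_a, series_b, label_a, label_b,
         save_path='.'):
    """Draw two per-epoch curves on one figure and save it as <name>.png."""
    epochs = range(1, len(series_a) + 1)
    plt.figure()
    plt.plot(epochs, series_a, label=label_a)
    plt.plot(epochs, series_b, label=label_b)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(name)
    plt.legend()
    plt.savefig(os.path.join(save_path, name + '.png'))
    plt.close()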
import os
import re
import sys

import numpy as np
import pygtrie

from dataset import LoadData
from helper import GetPrefixLen

query_trie = pygtrie.CharTrie()

dirname = '../data'
filenames = ['queries01.train.txt.gz', 'queries02.train.txt.gz',
             'queries03.train.txt.gz', 'queries04.train.txt.gz',
             'queries05.train.txt.gz', 'queries06.train.txt.gz']
df = LoadData([os.path.join(dirname, f) for f in filenames], split=False)

# keep only queries seen more than twice and index them by prefix
z = df.query_.value_counts()
z = z[z > 2]
for q, count in zip(z.index.values, z):
    query_trie[q] = count

cache = {}


def GetTopK(prefix, k=100):
    """Return the k most frequent stored queries that start with prefix."""
    if prefix in cache:
        return cache[prefix]
    results = query_trie.items(prefix)
    queries, counts = zip(*sorted(results, key=lambda x: -x[-1]))
    cache[prefix] = queries[:k]
    return cache[prefix]
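# A small usage sketch for the trie baseline above: look up the most frequent
# completions for a typed prefix. The prefix 'face' is only an illustration;
# the exact key format depends on how LoadData represents queries.
if __name__ == '__main__':
    prefix = 'face'
    if query_trie.has_subtrie(prefix) or prefix in query_trie:
        for rank, completion in enumerate(GetTopK(prefix, k=10), start=1):
            print(rank, completion)
    else:
        print('no stored queries start with', prefix)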
                    dest='data',
                    default=[data_dir + "queries07.test.txt.gz"],
                    help='where to load the data')
parser.add_argument('--threads', type=int, default=12,
                    help='how many threads to use in tensorflow')
args = parser.parse_args()

expdir = args.expdir

# load the saved model
metamodel = MetaModel(expdir)
model = metamodel.model
metamodel.MakeSessionAndRestore(args.threads)

# load the evaluation data
df = LoadData(args.data)
dataset = Dataset(df, metamodel.char_vocab, metamodel.user_vocab,
                  max_len=metamodel.params.max_len)

total_word_count = 0
total_log_prob = 0
print(len(dataset.df), dataset.batch_size)  # 20999 24

for idx in range(0, int(len(dataset.df) / dataset.batch_size)):
    feed_dict = dataset.GetFeedDict(model)
    # this session was restored from the saved model checkpoint
    c, words_in_batch = metamodel.session.run(
        [model.avg_loss, model.words_in_batch], feed_dict)
    # c is the batch loss; words_in_batch is the number of words in the batch
    total_word_count += words_in_batch  # running total of words
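    # (sketch) The rest of this evaluation loop is not shown above. Assuming
    # model.avg_loss is the average per-word log loss for the batch, the
    # missing part presumably accumulates it and reports perplexity:
    total_log_prob += c * words_in_batch

import math  # placed here only because this file's import block is not shown
print('perplexity: {:.3f}'.format(math.exp(total_log_prob / total_word_count)))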
def main():
    args = get_cmd_args()
    Train_groups, Test_groups = load_corpus(args)
    group_num = len(Test_groups)

    for flag in range(group_num):
        print('Experiment: {}'.format(flag))
        args.model_dir = args.directory + 'Model/{}{}.model'.format(
            args.model_name, flag)
        log_file = args.directory + 'log/{}{}.log'.format(
            args.model_name, flag)
        logging.basicConfig(filename=log_file, filemode='w',
                            level=logging.DEBUG)

        Train_data = Train_groups[flag]
        Test_data = Test_groups[flag]
        loaddata = LoadData(args.tokenizer, args.n_features)
        loaddata.prepare_corpus(Train_data, Test_data)
        args.inp_dim = args.out_dim = len(loaddata.vocab)
        args.max_len = loaddata.max_length
        vocab = loaddata.vocab
        # get white space index
        # vocab_reverse = loaddata.vocab_reverse
        print(args.inp_dim, args.max_len, vocab.get(' '))
        # print(type(list(vocab.keys())[10]))

        task = Train(args.inp_dim, args.out_dim, args.embedding_dim,
                     args.enc_units, args.dec_units, args.dropout,
                     args.dropout, args.epoch, args.clip, args.sparse_max,
                     args.tf, loaddata, args.batch, device, args.model_dir)

        if args.mode == 'train':
            logging.info('start training...')
            task.start_train(loaddata.train, loaddata.valid)
            # also test
            logging.info('start testing themselves: ')
            task.test_in_batch(loaddata.test)
            # test other books
            test_others = [i for i in range(group_num) if i != flag]
            for j in test_others:
                logging.info('start testing other books: {}'.format(j))
                Test_other = Test_groups[j]
                test_data = loaddata.prepare_other_corpus(Test_other)
                task.test_in_batch(test_data)
        else:
            logging.info('start testing...')
            task.test_in_batch(loaddata.test)
            test_others = [i for i in range(group_num) if i != flag]
            for j in test_others:
                logging.info('start testing other books: {}'.format(j))
                Test_other = Test_groups[j]
                test_data = loaddata.prepare_other_corpus(Test_other)
                task.test_in_batch(test_data)

            for test in Test_groups:
                test_inp = [t[0] for t in test]
                test_out = [t[1] for t in test]
                translation = task.translate_in_batch(test_inp)
                out = args.directory + 'log/test_text_model{}.txt'.format(flag)
                with open(out, 'a', encoding='utf8') as f:
                    for inp, pred, truth in zip(test_inp, translation,
                                                test_out):
                        f.write(inp.decode(errors='ignore'))
                        f.write('\n')
                        f.write(pred)
                        f.write('\n')
                        f.write(truth.decode(errors='ignore'))
                        f.write('\n\n')
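# load_corpus() is defined elsewhere; from the way its outputs are used above,
# Train_groups and Test_groups appear to be parallel per-book lists of
# (noisy, clean) sentence pairs. A purely hypothetical sketch of that shape,
# with a made-up name so it is not confused with the real function:
def load_corpus_sketch(books, test_fraction=0.1):
    """books: list of books, each a list of (ocr_sentence, clean_sentence)."""
    train_groups, test_groups = [], []
    for pairs in books:
        split = int(len(pairs) * (1.0 - test_fraction))
        train_groups.append(pairs[:split])
        test_groups.append(pairs[split:])
    return train_groups, test_groups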
def loaddata(self, type):
    self.trainset = LoadData(type, transform=None)
    self.load_data = DataLoader(self.trainset,
                                batch_size=self.batch_size,
                                shuffle=True)
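# A consumption sketch for the loader built above. The surrounding class is
# not shown, so this standalone snippet mimics the same pattern with dummy
# tensors; only the DataLoader usage itself mirrors the method above.
import torch
from torch.utils.data import DataLoader, TensorDataset

if __name__ == '__main__':
    dummy = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100,)))
    loader = DataLoader(dummy, batch_size=16, shuffle=True)
    for inputs, targets in loader:
        pass  # one training/eval step per mini-batch would go here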
import os
import sys

import pygtrie

from dataset import LoadData
from helper import GetPrefixLen
import code

query_trie = pygtrie.CharTrie()

dirname = '/g/ssli/data/LowResourceLM/aol'
filenames = ['queries01.train.txt.gz', 'queries02.train.txt.gz',
             'queries03.train.txt.gz', 'queries04.train.txt.gz',
             'queries05.train.txt.gz', 'queries06.train.txt.gz']
df = LoadData([os.path.join(dirname, f) for f in filenames], split=False)

z = df.query_.value_counts()
z = z[z > 2]
for q, count in zip(z.index.values, z):
    query_trie[q] = count

cache = {}


def GetTopK(prefix, k=100):
    """Return the k most frequent stored queries that start with prefix."""
    if prefix in cache:
        return cache[prefix]
    results = query_trie.items(prefix)
    queries, counts = zip(*sorted(results, key=lambda x: -x[-1]))
    cache[prefix] = queries[:k]
    return cache[prefix]
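# Note: pygtrie's items(prefix) raises KeyError when nothing in the trie
# starts with the given prefix. A defensive wrapper, with a name introduced
# here rather than taken from the repo, could look like this:
def GetTopKSafe(prefix, k=100):
    """Like GetTopK, but returns an empty tuple for unseen prefixes."""
    try:
        return GetTopK(prefix, k=k)
    except KeyError:
        return ()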