def main(unused_argv):
    config = importlib.import_module('config.%s' % FLAGS.config)
    for argument in FLAGS.override.split(','):
        if '=' in argument:
            name = argument.split('=')[0]
            value = type(getattr(config, name))(argument.split('=')[1])
            setattr(config, name, value)

    config.input_vocab = data.Vocab(config.input_vocab_file,
                                    config.max_vocab_size)  # Max IDs
    if config.input_vocab.WordToId(data.PAD_TOKEN) <= 0:
        raise ValueError('Invalid PAD_TOKEN id.')
    # id of the UNKNOWN_TOKEN should be "0" for copynet model
    if config.input_vocab.WordToId(data.UNKNOWN_TOKEN) != 0:
        raise ValueError('Invalid UNKNOWN_TOKEN id.')
    if config.input_vocab.WordToId(data.SENTENCE_START) <= 0:
        raise ValueError('Invalid SENTENCE_START id.')
    if config.input_vocab.WordToId(data.SENTENCE_END) <= 0:
        raise ValueError('Invalid SENTENCE_END id.')

    if config.output_vocab_file:
        config.output_vocab = data.Vocab(config.output_vocab_file,
                                         config.max_vocab_size)  # Max IDs
        if config.output_vocab.WordToId(data.PAD_TOKEN) <= 0:
            raise ValueError('Invalid PAD_TOKEN id.')
        # id of the UNKNOWN_TOKEN should be "0" for copynet model
        if config.output_vocab.WordToId(data.UNKNOWN_TOKEN) != 0:
            raise ValueError('Invalid UNKNOWN_TOKEN id.')
        if config.output_vocab.WordToId(data.SENTENCE_START) <= 0:
            raise ValueError('Invalid SENTENCE_START id.')
        if config.output_vocab.WordToId(data.SENTENCE_END) <= 0:
            raise ValueError('Invalid SENTENCE_END id.')
    else:
        config.output_vocab = config.input_vocab

    train_batcher = config.Batcher(config.train_set, config)
    valid_batcher = config.Batcher(config.valid_set, config)
    tf.set_random_seed(config.random_seed)

    if FLAGS.mode == 'train':
        model = config.Model(config, 'train', num_gpus=FLAGS.num_gpus)
        _Train(model, config, train_batcher)
    elif FLAGS.mode == 'eval':
        config.dropout_rnn = 1.0
        config.dropout_emb = 1.0
        model = config.Model(config, 'eval', num_gpus=FLAGS.num_gpus)
        _Eval(model, config, valid_batcher)
    elif FLAGS.mode == 'decode':
        config.dropout_rnn = 1.0
        config.dropout_emb = 1.0
        config.batch_size = config.beam_size
        model = config.Model(config, 'decode', num_gpus=FLAGS.num_gpus)
        decoder = decode.BeamSearch(model, valid_batcher, config)
        decoder.DecodeLoop()
def test_batcher():
    max_enc_steps = 65
    max_dec_steps = 65
    word_count_path = '/home/jiananwang/rl-QG/data/squad-v1/word_counter.json'
    glove_path = '/home/jiananwang/data/glove/glove.840B.300d.txt'
    embed_dim = 300
    max_vocab_size = 50000
    embedding_dict_file = '/home/jiananwang/rl-QG/data/squad-v1/emb_dict_%d.pkl' % max_vocab_size
    vocab = data.Vocab(word_count_path, glove_path, embed_dim,
                       max_vocab_size, embedding_dict_file)

    data_path = '/home/jiananwang/rl-QG/data/squad-v1/train_raw.json'
    batch_size = 5
    dynamic_vocab = False
    batcher = Batcher(data_path, vocab, batch_size, max_enc_steps,
                      max_dec_steps, mode='train', dynamic_vocab=dynamic_vocab)
    batcher.setup()
    while True:
        try:
            start = time.time()
            batch = batcher.next_batch()
            print('time:', time.time() - start)
        except:
            break
def __init__(self, vocab_path, ckpt_path):
    self._num_gpus = 0
    self._vocab_path = vocab_path
    self._ckpt_path = ckpt_path
    self._vocab = data.Vocab(self._vocab_path, 50000)  # 1000000
    # Check for presence of required special tokens.
    assert self._vocab.WordToId(data.PAD_TOKEN) > 0
    assert self._vocab.WordToId(data.UNKNOWN_TOKEN) >= 0
    assert self._vocab.WordToId(data.SENTENCE_START) > 0
    assert self._vocab.WordToId(data.SENTENCE_END) > 0

    self._decode_hps = seq2seq_attention_model.HParams(
        mode='decode',  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=4,
        enc_layers=4,
        enc_timesteps=120,
        dec_timesteps=30,
        min_input_len=2,  # discard articles/summaries < than this
        num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=4096)  # If 0, no sampled softmax.
    self._hps = self._decode_hps._replace(dec_timesteps=1)

    print "=== Initializing... ==="
    self._model = seq2seq_attention_model.Seq2SeqAttentionModel(
        self._hps, self._vocab, num_gpus=self._num_gpus)
    print "=== Finished initializing ==="
    self._decoder = seq2seq_attention_decode.BSDecoder(
        self._model, self._decode_hps, self._vocab, self._ckpt_path)
    print "==== Can start to answer questions now! ===="
def _extract_we_binary(output_file, vocab_file, we_dic):
    vocab = data.Vocab(vocab_file, 1000000)
    vsize = vocab.NumIds()
    output = codecs.open(output_file, "w", "utf-8")
    unknown_ids = [vocab.WordToId(UNKNOWN_TOKEN)]
    with open(we_dic, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        print "layer1_size:", layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            idx = data.GetWordIds(word, vocab)
            if idx is not None and idx != unknown_ids and word == "<s>":
                print idx, ":", word
                output.write(word + ' ' + ' '.join(
                    map(str, np.fromstring(f.read(binary_len), dtype='float32'))) + '\n')
            elif idx == unknown_ids:
                f.read(binary_len)
            else:
                f.read(binary_len)
    f.close()
    output.close()
def main(unused_argv):
    vocab = data.Vocab(FLAGS.vocab_path, 1000000)
    # Check for presence of required special tokens.
    assert vocab.WordToId(data.PAD_TOKEN) > 0
    assert vocab.WordToId(data.UNKNOWN_TOKEN) >= 0
    assert vocab.WordToId(data.SENTENCE_START) > 0
    assert vocab.WordToId(data.SENTENCE_END) > 0

    batch_size = 1
    if FLAGS.mode == 'decode':
        batch_size = FLAGS.beam_size

    hps = seq2seq_attention_model.HParams(
        mode=FLAGS.mode,  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=batch_size,
        # enc_layers=4,
        enc_layers=2,
        enc_timesteps=60,  # enc_timesteps=120,
        # dec_timesteps=30,
        dec_timesteps=15,
        min_input_len=2,  # discard articles/summaries < than this
        num_hidden=128,  # for rnn cell
        # num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=10)  # If 0, no sampled softmax.
        # num_softmax_samples=4096)  # If 0, no sampled softmax.

    batcher = batch_reader.Batcher(
        FLAGS.data_path, vocab, hps, FLAGS.article_key, FLAGS.abstract_key,
        FLAGS.max_article_sentences, FLAGS.max_abstract_sentences,
        bucketing=FLAGS.use_bucketing, truncate_input=FLAGS.truncate_input)
    tf.set_random_seed(FLAGS.random_seed)

    if hps.mode == 'train':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Train(model, batcher)
    elif hps.mode == 'eval':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Eval(model, batcher, vocab=vocab)
    elif hps.mode == 'decode':
        decode_mdl_hps = hps
        # Only need to restore the 1st step and reuse it since
        # we keep and feed in state for each step's output.
        decode_mdl_hps = hps._replace(dec_timesteps=1)
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            decode_mdl_hps, vocab, num_gpus=FLAGS.num_gpus)
        decoder = seq2seq_attention_decode.BSDecoder(model, batcher, hps, vocab)
        decoder.DecodeLoop()
def __init__(self, hp, model_settings, extra_info, mode='decode'):
    vocab_file = hp.vocab_path
    max_size = hp.vocab_size
    # Construct the vocabulary manager
    self.vocab = data.Vocab(vocab_file=vocab_file, max_size=max_size)
    # Construct the model
    self.model = model.SummarizationModel(
        hps=model_settings, vocab=self.vocab, extra_info=extra_info)
    self.decode_wrapper = None
def test_example():
    # batcher = Batcher(train_file, vocab)
    # batch = batcher.next_batch()
    # batch.enc_batch
    # batch.dec_batch
    # batch.target_batch
    max_enc_steps = 65
    max_dec_steps = 65
    word_count_path = '/home/jiananwang/rl-QG/data/squad-v1/word_counter.json'
    glove_path = '/home/jiananwang/data/glove/glove.840B.300d.txt'
    embed_dim = 300
    max_vocab_size = 50000
    embedding_dict_file = '/home/jiananwang/rl-QG/data/squad-v1/emb_dict_50000.pkl'
    vocab = data.Vocab(word_count_path, glove_path, embed_dim,
                       max_vocab_size, embedding_dict_file)

    with open('/home/jiananwang/rl-QG/data/squad-v1/dev_raw.json') as f:
        d = json.load(f)
    for ex in d:
        if ex['ifkeep']:
            para = ex['correct_sentence']
            ques = ex['question']
            ans = ex['valid_answer'][0]
            ans_pos = (ex['ans_start_in_sent'], ex['ans_end_in_sent'])
            case = Example(para, ques, ans, ans_pos, vocab,
                           max_enc_steps, max_dec_steps, dynamic_vocab=True)

            print('enc len:', case.enc_len)
            # if not case.dynamic_vocab:
            print('enc input:', case.enc_input)
            print('decoding:', ' '.join([vocab.id2word(i) for i in case.enc_input]))
            print('dec len:', case.dec_len)
            print('dec input:', case.dec_input)
            print('decoding:', ' '.join([vocab.id2word(i) for i in case.dec_input]))
            if not case.dynamic_vocab:
                print('target:', case.target)
                print('decoding:', ' '.join([vocab.id2word(i) for i in case.target]))
            print('orig para:', case.original_paragraph)
            print('orig ques:', case.original_question)
            print('orig ans:', case.original_answer)
            print('ans start:', case.answer_start_idx)
            print('ans end:', case.answer_end_idx)

            vocab_size = max_vocab_size + 4
            if case.dynamic_vocab:
                print('-' * 10, 'dynamic vocab', '-' * 10)
                print('enc input extend vocab:', case.enc_input_extend_vocab)
                words = decoding(case.enc_input_extend_vocab, vocab, case.enc_oovs)
                print('decoding:', ' '.join(words))
                print('new dec input:', case.dec_input_extend_vocab)
                words = decoding(case.dec_input_extend_vocab, vocab, case.enc_oovs)
                print('decoding:', ' '.join(words))
                print('new target:', case.target)
                words = decoding(case.target, vocab, case.enc_oovs)
                print('decoding:', ' '.join(words))
            break
def editnet_data_to_editnetID(df, output_path):
    """
    Reads from df.columns=['comp_tokens', 'simp_tokens', 'edit_labels',
    'comp_pos_tags', 'comp_pos_ids'] and adds vocab ids for comp_tokens,
    simp_tokens, and edit_labels.
    :param df: df.columns=['comp_tokens', 'simp_tokens', 'edit_labels',
               'comp_pos_tags', 'comp_pos_ids']
    :param output_path: the path to store the df
    :return: a dataframe with df.columns=['comp_tokens', 'comp_ids',
             'simp_tokens', 'simp_ids', 'edit_labels', 'new_edit_ids',
             'comp_pos_tags', 'comp_pos_ids']
    """
    out_list = []
    vocab = data.Vocab()
    vocab.add_vocab_from_file('./vocab_data/vocab.txt', 30000)

    def prepare_example(example, vocab):
        """
        :param example: one row in a pandas dataframe with fields
                        ['comp_tokens', 'simp_tokens', 'edit_labels']
        :param vocab: vocab object for translation
        :return: the vocab ids of the complex tokens, simple tokens, and edit labels
        """
        comp_id = np.array([
            vocab.w2i[i] if i in vocab.w2i.keys() else vocab.w2i[UNK]
            for i in example['comp_tokens']
        ])
        simp_id = np.array([
            vocab.w2i[i] if i in vocab.w2i.keys() else vocab.w2i[UNK]
            for i in example['simp_tokens']
        ])
        edit_id = np.array([
            vocab.w2i[i] if i in vocab.w2i.keys() else vocab.w2i[UNK]
            for i in example['edit_labels']
        ])
        return comp_id, simp_id, edit_id

    # add a dimension for batch, batch_size = 1
    for i, example in df.iterrows():
        print(i)
        comp_id, simp_id, edit_id = prepare_example(example, vocab)
        ex = [
            example['comp_tokens'], comp_id,
            example['simp_tokens'], simp_id,
            example['edit_labels'], edit_id,
            example['comp_pos_tags'], example['comp_pos_ids']
        ]
        out_list.append(ex)

    outdf = pd.DataFrame(out_list, columns=[
        'comp_tokens', 'comp_ids', 'simp_tokens', 'simp_ids',
        'edit_labels', 'new_edit_ids', 'comp_pos_tags', 'comp_pos_ids'
    ])
    outdf.to_pickle(output_path)
    print('saved to %s' % output_path)
    return outdf
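# --- Usage sketch (not from the original repo) ---
# A minimal, hypothetical example of calling editnet_data_to_editnetID above.
# The toy rows, token/label values, and output path are illustrative
# assumptions; only the required column names are taken from the function.
import pandas as pd

toy_df = pd.DataFrame([{
    'comp_tokens': ['the', 'feline', 'sat'],
    'simp_tokens': ['the', 'cat', 'sat'],
    'edit_labels': ['KEEP', 'cat', 'KEEP'],
    'comp_pos_tags': ['DT', 'NN', 'VBD'],
    'comp_pos_ids': [1, 2, 3],
}])

# Writes a pickled DataFrame with the added *_ids columns and returns it.
out_df = editnet_data_to_editnetID(toy_df, './toy_editnet_ids.pkl')
print(out_df.columns.tolist())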
def test_batch():
    max_enc_steps = 65
    max_dec_steps = 65
    word_count_path = '/home/jiananwang/rl-QG/data/squad-v1/word_counter.json'
    glove_path = '/home/jiananwang/data/glove/glove.840B.300d.txt'
    embed_dim = 300
    max_vocab_size = 50000
    embedding_dict_file = '/home/jiananwang/rl-QG/data/squad-v1/emb_dict_50000.pkl'
    vocab = data.Vocab(word_count_path, glove_path, embed_dim,
                       max_vocab_size, embedding_dict_file)
    dynamic_vocab = True

    with open('/home/jiananwang/rl-QG/data/squad-v1/dev_raw.json') as f:
        d = json.load(f)
    example_list = []
    for ex in d:
        if ex['ifkeep']:
            para = ex['correct_sentence']
            ques = ex['question']
            ans = ex['valid_answer'][0]
            ans_pos = (ex['ans_start_in_sent'], ex['ans_end_in_sent'])
            case = Example(para, ques, ans, ans_pos, vocab,
                           max_enc_steps, max_dec_steps, dynamic_vocab)
            example_list.append(case)
            if len(example_list) == 5:
                break

    batch = Batch(example_list, vocab, max_dec_steps, dynamic_vocab)
    print('enc batch:', batch.enc_batch)
    if not dynamic_vocab:
        for i in range(batch.enc_batch.shape[0]):
            enc = batch.enc_batch[i]
            dec = batch.dec_batch[i]
            tgt = batch.target_batch[i]
            words = decoding(enc.tolist(), vocab)
            print('enc:', ' '.join(words))
            words = decoding(dec.tolist(), vocab)
            print('dec:', ' '.join(words))
            words = decoding(tgt.tolist(), vocab)
            print('tgt:', ' '.join(words))
            print('-' * 20)
    else:
        for i in range(batch.enc_batch.shape[0]):
            enc = batch.enc_batch_extend_vocab[i]
            dec = batch.dec_batch[i]
            tgt = batch.target_batch[i]
            oov = batch.para_oovs_batch[i]
            words = decoding(enc.tolist(), vocab, oov)
            print('enc one case:', ' '.join(words))
            words = decoding(dec.tolist(), vocab, oov)
            print('dec one case:', ' '.join(words))
            words = decoding(tgt.tolist(), vocab, oov)
            print('tgt one case:', ' '.join(words))
            print('-' * 20)
def __init__(self, hp, model_settings, extra_info):
    vocab_file = hp.vocab_path
    max_size = hp.vocab_size
    # Construct the vocabulary manager
    self.vocab = data.Vocab(vocab_file=vocab_file, max_size=max_size)
    # model_hp_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag',
    #                  'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim',
    #                  'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps',
    #                  'coverage', 'cov_loss_wt', 'pointer_gen']
    # model_hp_dict = {}
    # for key, value in FLAGS.__flags.iteritems():
    #     if key in model_hp_list:
    #         model_hp_dict[key] = value
    # model_settings = namedtuple('HParams', model_hp_dict.keys())(**model_hp_dict)
    # Construct the model
    self.model = model.SummarizationModel(
        hps=model_settings, vocab=self.vocab, extra_info=extra_info)
    self.decode_wrapper = None
def __init__(self, config):
    self.corpus_files = config["corpus_files"]
    # self.jamo_processor = Han2Jamo()
    # self.char_vocab = CharWordVocab.load_vocab(config["char_vocab_path"])
    self.char_vocab = data.CharWordVocab(config["char_vocab_path"])
    # self.word_vocab = WordVocab.load_vocab(config["word_vocab_path"])
    self.word_vocab = data.Vocab(config["word_vocab_path"])
    self.seq_len = config["word_seq_len"]
    self.char_seq_len = config["char_seq_len"]
    self.corpus_size = self.get_corpus_size()
    print("DataSet Size:", self.corpus_size)

    config["char_vocab_size"] = len(self.char_vocab)
    config["word_vocab_size"] = len(self.word_vocab)
def main():
    from torch.nn import init
    config = util.read_config('../configs/process.yaml')
    # get_temp_vocab(config)
    vocab = data.Vocab(config.vocab_path, max_size=config.max_size)
    # vocab.build_vectors(config.pre_word_embedding_path, 300, unk_init=init.xavier_uniform)
    if config.save:
        torch.save(vocab, config.vocab_path_50)

    val_data = DocDataset(config.val_path, vocab, config)
    test_data = DocDataset(config.test_path, vocab, config)
    if config.save:
        torch.save(val_data, config.val_data_path)
        torch.save(test_data, config.test_data_path)

    train_data = DocDataset(config.train_path, vocab, config)
    if config.save:
        torch.save(train_data, config.train_data_path)
def run(args):
    torch.set_default_dtype(torch.float64)
    vocab = data.Vocab()
    if args.generate_data:
        generate_train_val_test(args.generate_num, vocab, 0.7, 0.2, args.generate_path)
        return

    batch_size = args.batch_size
    if args.load_data == True:
        data_path = args.data_path
        with open(data_path, 'rb') as f:
            (train_questions, train_ans, val_questions, val_ans,
             test_questions, test_ans) = pkl.load(f)
        train_generator = data.BatchGenerator(train_questions, train_ans, batch_size)
        val_generator = data.BatchGenerator(val_questions, val_ans, batch_size)

    lr = float(args.lr) or 0.01
    num_layers = 1
    if args.load_model == True:
        model_path = args.model_path
        checkpoint = torch.load(model_path)
        rnn = RNNCalc(*checkpoint['model_hyper'])
        rnn.load_state_dict(checkpoint['model'])
        optimizer = torch.optim.SGD(rnn.parameters(), lr=lr, momentum=0.9,
                                    nesterov=True)
        # optimizer = optim.Adam(rnn.parameters())
        optim.Adam(rnn.parameters()).load_state_dict(checkpoint['optimzer'])
    else:
        # create new model
        embedding_dim, vocab_size, rnn_units = 32, vocab.size(), 128
        rnn = RNNCalc(num_layers, embedding_dim, vocab_size, rnn_units)
        optimizer = optim.Adam(rnn.parameters(), lr=lr)

    if args.mode == 'train':
        assert optimizer is not None
        trainer = Trainer(rnn, optimizer, args.epoch_num)
        trainer.train(train_generator, val_generator, vocab, lr, 10, 10,
                      args.checkpoint_path)
    elif args.mode == 'test':
        eva_generator = data.BatchGenerator(train_questions, train_ans,
                                            len(train_questions))
        evaluate.evaluate(rnn, eva_generator, vocab)
def train(cls):
    cls.vocab = data.Vocab(FLAGS.vocab_path, 1000000)
    batch_size = FLAGS.beam_size
    hps = seq2seq_attention_model.HParams(
        mode=FLAGS.mode,  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=batch_size,
        enc_layers=4,
        enc_timesteps=120,  # 120
        dec_timesteps=120,  # 30
        min_input_len=0,  # discard articles/summaries < than this
        num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=0)  # 4096, If 0, no sampled softmax.

    cls.batcher = Batcher(
        cls.vocab, hps, FLAGS.article_key, FLAGS.abstract_key,
        FLAGS.max_article_sentences, FLAGS.max_abstract_sentences,
        bucketing=FLAGS.use_bucketing, truncate_input=FLAGS.truncate_input)
    tf.set_random_seed(FLAGS.random_seed)

    # Only need to restore the 1st step and reuse it since
    # we keep and feed in state for each step's output.
    decode_mdl_hps = hps._replace(dec_timesteps=1)
    model = seq2seq_attention_model.Seq2SeqAttentionModel(
        decode_mdl_hps, cls.vocab, num_gpus=FLAGS.num_gpus)
    cls.decoder = seq2seq_attention_decode.BSDecoder(
        model, cls.batcher, hps, cls.vocab)

    # Load the trained model checkpoint.
    cls.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
        print('No model to decode yet at %s' % (FLAGS.log_root))
    tf.logging.info('checkpoint path %s', ckpt_state.model_checkpoint_path)
    ckpt_path = os.path.join(
        FLAGS.log_root, os.path.basename(ckpt_state.model_checkpoint_path))
    tf.logging.info('renamed checkpoint path %s', ckpt_path)
    cls.decoder._saver.restore(cls.sess, ckpt_path)
def run(args):
    vocab = data.Vocab()
    if args.generate_data:
        generate_train_val_test(args.generate_num, vocab, 0.7, 0.2, args.generate_path)
        return

    batch_size = args.batch_size
    if args.load_data == True:
        data_path = args.data_path
        with open(data_path, 'rb') as f:
            (train_questions, train_ans, val_questions, val_ans,
             test_questions, test_ans) = pkl.load(f)
        train_generator = data.BatchGenerator(train_questions, train_ans, batch_size)
        val_generator = data.BatchGenerator(val_questions, val_ans, batch_size)

    lr = 0.01
    if args.load_model == True:
        model_path = args.model_path
        checkpoint = torch.load(model_path)
        seq2seq = Seq2seqCalc(*checkpoint['model_hyper'])
        seq2seq.load_state_dict(checkpoint['model'])
        optimizer = optim.Adam(seq2seq.parameters())
        optim.Adam(seq2seq.parameters()).load_state_dict(checkpoint['optimzer'])
    else:
        # create new model
        embedding_dim, vocab_size, digit_rnn_units, decoder_rnn_units = (
            32, vocab.size(), 256, 128)
        # Build the model before constructing its optimizer.
        seq2seq = Seq2seqCalc(embedding_dim, vocab_size, digit_rnn_units,
                              decoder_rnn_units)
        optimizer = optim.Adam(seq2seq.parameters(), lr=lr)

    if args.mode == 'train':
        assert optimizer is not None
        trainer = Trainer(seq2seq, optimizer, args.epoch_num)
        trainer.train(train_generator, val_generator, vocab, lr, 10, 10,
                      args.checkpoint_path)
    elif args.mode == 'test':
        eva_generator = data.BatchGenerator(train_questions, train_ans,
                                            len(train_questions))
        evaluate.evaluate(seq2seq, eva_generator, vocab)
def _extract_we_text(output_file, vocab_file, we_dic):
    vocab = data.Vocab(vocab_file, 1000000)
    vsize = vocab.NumIds()
    m = copy.deepcopy(vocab._word_to_id)
    unknown_ids = [vocab.WordToId(UNKNOWN_TOKEN)]
    output = codecs.open(output_file, "w", "utf-8")
    with open(we_dic, "rb") as f:
        for line in f:
            string = line.split(" ")
            word = string[0].strip()
            value = " ".join(x for x in string[1:])
            idx = data.GetWordIds(word, vocab)
            if idx is not None and idx != unknown_ids and word in m:
                del m[word]
                output.write(word + ' ' + value)
    print "====:", m
    print "---:", len(m)
    f.close()
    output.close()

    # This operation guarantees that the words in the WE file and the words
    # in the vocab file are the same.
    del m['<s>']
    del m['</s>']
    del m['<d>']
    del m['</d>']
    del m['<p>']
    del m['</p>']
    tt = m.keys()
    vocab_new = vocab_file + "_new"
    with open(vocab_file, 'r') as f:
        with open(vocab_new, 'w') as g:
            for line in f.readlines():
                if all(string not in line for string in tt):
                    g.write(line)
            if '<UNK>' in m:
                g.write('<UNK> 0\n')
            if '<PAD>' in m:
                g.write('<PAD> 0\n')
    shutil.move(vocab_new, vocab_file)
    f.close()
    g.close()
def main(unused_argv):
    vocab = data.Vocab(FLAGS.vocab_path, 1000000)
    # Check for presence of required special tokens.
    assert vocab.CheckVocab(data.PAD_TOKEN) > 0
    assert vocab.CheckVocab(data.UNKNOWN_TOKEN) >= 0
    assert vocab.CheckVocab(data.SENTENCE_START) > 0
    assert vocab.CheckVocab(data.SENTENCE_END) > 0

    batch_size = 64
    if FLAGS.mode == 'decode':
        batch_size = FLAGS.beam_size

    hps = seq2seq_attention_model.HParams(
        mode=FLAGS.mode,  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=batch_size,
        enc_layers=4,
        enc_timesteps=120,
        dec_timesteps=30,
        min_input_len=2,  # discard articles/summaries < than this
        num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=4096)  # If 0, no sampled softmax.
    eval_hps = seq2seq_attention_model.HParams(
        mode='eval',  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=batch_size,
        enc_layers=4,
        enc_timesteps=120,
        dec_timesteps=30,
        min_input_len=2,  # discard articles/summaries < than this
        num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=4096)  # If 0, no sampled softmax.

    batcher = batch_reader.Batcher(
        FLAGS.data_path, vocab, hps, FLAGS.article_key, FLAGS.abstract_key,
        FLAGS.max_article_sentences, FLAGS.max_abstract_sentences,
        bucketing=FLAGS.use_bucketing, truncate_input=FLAGS.truncate_input)
    eval_batcher = batch_reader.Batcher(
        FLAGS.eval_data_path, vocab, eval_hps, FLAGS.article_key,
        FLAGS.abstract_key, FLAGS.max_article_sentences,
        FLAGS.max_abstract_sentences, bucketing=FLAGS.use_bucketing,
        truncate_input=FLAGS.truncate_input)
    tf.set_random_seed(FLAGS.random_seed)

    if hps.mode == 'train':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        eval_model = seq2seq_attention_model.Seq2SeqAttentionModel(
            eval_hps, vocab, num_gpus=FLAGS.num_gpus)
        count = 0
        while count * FLAGS.eval_every_iteration < FLAGS.max_run_steps:
            _Train(model, batcher)
            eval_avg_loss = 0
            # read previous loss from eval_dir (if any)
            try:
                eval_results = tf.contrib.estimator.read_eval_metrics(FLAGS.eval_dir)
                i = 0
                for step, metrics in eval_results.items():
                    eval_avg_loss += metrics['running_avg_loss']
                    i += 1
                prev_avg_loss = eval_avg_loss / i
            except FileNotFoundError:
                print("Haven't run evaluation yet.")
            cur_loss = _Eval(eval_model, eval_batcher, 20, vocab=vocab)
            if eval_avg_loss != 0 and prev_avg_loss < cur_loss:
                print("Early stopping!")
                break
            count += 1
    elif hps.mode == 'eval':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Eval(model, eval_batcher, vocab=vocab)
    elif hps.mode == 'decode':
        decode_mdl_hps = hps
        # Only need to restore the 1st step and reuse it since
        # we keep and feed in state for each step's output.
        decode_mdl_hps = hps._replace(dec_timesteps=1)
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            decode_mdl_hps, vocab, num_gpus=FLAGS.num_gpus)
        decoder = seq2seq_attention_decode.BSDecoder(model, batcher, hps, vocab)
        decoder.DecodeLoop()
def main():
    # torch.manual_seed(233)
    set_seed(233)
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s [INFO] %(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, dest='data_path',
                        default='/home/ml/ydong26/data/EditNTS_data/editnet_data/%s/' % dataset,
                        help='Path to train vocab_data')
    parser.add_argument('--store_dir', action='store', dest='store_dir',
                        default='/home/ml/ydong26/tmp_store/editNTS_%s' % dataset,
                        help='Path to exp storage directory.')
    parser.add_argument('--vocab_path', type=str, dest='vocab_path',
                        default='../vocab_data/',
                        help='Path contains vocab, embedding, postag_set')
    parser.add_argument('--load_model', type=str, dest='load_model', default=None,
                        help='Path for loading pre-trained model for further training')
    parser.add_argument('--vocab_size', dest='vocab_size', default=30000, type=int)
    parser.add_argument('--batch_size', dest='batch_size', default=32, type=int)
    parser.add_argument('--max_seq_len', dest='max_seq_len', default=100)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--hidden', type=int, default=200)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--device', type=int, default=1, help='select GPU')
    parser.add_argument('--test', action='store_true', default=False, dest='test_enabled')
    parser.add_argument('--run_eval', action='store_true', default=False, dest='run_eval')
    parser.add_argument('--run_training', action='store_true', default=False, dest='run_training')
    # train_file = '/media/vocab_data/yue/TS/editnet_data/%s/train.df.filtered.pos' % dataset
    # test = '/media/vocab_data/yue/TS/editnet_data/%s/test.df.pos' % args.dataset
    args = parser.parse_args()
    print(args)
    torch.cuda.set_device(args.device)

    # load vocab-related files and init vocab
    print('*' * 10)
    vocab = data.Vocab()
    vocab.add_vocab_from_file(args.vocab_path + 'vocab.txt', args.vocab_size)
    vocab.add_embedding(gloveFile=args.vocab_path + 'glove.6B.100d.txt')
    pos_vocab = data.POSvocab(args.vocab_path)  # load pos-tag embeddings
    print('*' * 10)

    print(args)
    print("generating config")
    hyperparams = collections.namedtuple('hps', [  # hyperparameters
        'vocab_size', 'embedding_dim', 'word_hidden_units', 'sent_hidden_units',
        'pretrained_embedding', 'word2id', 'id2word',
        'pos_vocab_size', 'pos_embedding_dim'
    ])
    hps = hyperparams(vocab_size=vocab.count,
                      embedding_dim=100,
                      word_hidden_units=args.hidden,
                      sent_hidden_units=args.hidden,
                      pretrained_embedding=vocab.embedding,
                      word2id=vocab.w2i,
                      id2word=vocab.i2w,
                      pos_vocab_size=pos_vocab.count,
                      pos_embedding_dim=30)

    print('init editNTS model')
    edit_net = EditNTS(hps, n_layers=1)
    edit_net.cuda()

    if args.load_model is not None:
        print("load edit_net for further training")
        ckpt_path = args.load_model
        ckpt = Checkpoint.load(ckpt_path)
        print("Epoch: {} | Step: {}".format(ckpt.epoch, ckpt.step))
        edit_net = ckpt.model
        edit_net.cuda()
        edit_net.train()

    if args.run_eval:
        print("Running Evaluation..")
        eval_standalone(edit_net, args, vocab, ckpt)
    elif args.run_training:
        print("Running Training..")
        training(edit_net, args.epochs, args, vocab, test=args.test_enabled)
    else:
        print("ERROR: No running mode selected")
# DATA_PATH = "/home/synerzip/Sasidhar/Learning/Tensorflow/textsum/data/reviews"
# VOCAB_PATH = "/home/synerzip/Sasidhar/Learning/Tensorflow/textsum/data/vocab_1"
# LOG_ROOT = "/home/synerzip/Sasidhar/Learning/Tensorflow/textsum/log_root"

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('data_path', DATA_PATH, 'data path')
tf.app.flags.DEFINE_string('vocab_path', VOCAB_PATH,
                           'Path expression to text vocabulary file.')
tf.app.flags.DEFINE_string('log_root', LOG_ROOT, 'Directory for model root.')
tf.app.flags.DEFINE_integer('beam_size', 4, 'beam size for beam search decoding.')
tf.app.flags.DEFINE_integer('random_seed', 111, 'A seed value for randomness.')
tf.app.flags.DEFINE_integer('num_gpus', 0, 'Number of gpus used.')

vocab = data.Vocab(FLAGS.vocab_path, 10003)
batch_size = 4
hps = seq2seq_attention_model.HParams(
    mode='decode',
    min_lr=0.01,  # min learning rate.
    lr=0.15,  # learning rate
    batch_size=batch_size,
    enc_layers=4,
    enc_timesteps=200,
    dec_timesteps=30,
    min_input_len=2,  # discard articles/summaries < than this
    num_hidden=256,  # for rnn cell
    emb_dim=128,  # If 0, don't use embedding
    max_grad_norm=2,
    else:
        yield (background_text, context_text, response_text, span_text,
               b_start, b_end, r_start, r_end, example_id)


if __name__ == '__main__':
    hps_dict = {
        'mode': 'train',
        'batch_size': 16,
        'max_bac_enc_steps': 300,
        'max_con_enc_steps': 65,
        'max_dec_steps': 95
    }
    hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)
    vocab = data.Vocab('data/mixed_context/finished_files/vocab', 25000)
    batcher = Batcher('data/mixed_context/finished_files/chunked/train_*',
                      vocab, hps, single_pass=False)
    batch = batcher.next_batch()
    # print("batch.target_batch: ", batch.target_batch)
    i = 0
    print()
    print("backgrounds: ", batch.original_backgrounds[i], "\n")
    print("contexts: ", batch.original_contexts[i], "\n")
    print("responses: ", batch.original_responses[i], "\n")
    print("spans: ", batch.original_spans[i], "\n")
    print("b_starts: ", batch.original_b_starts[i], "\n")
    print("b_ends: ", batch.original_b_ends[i], "\n")
if __name__ == '__main__':
    args = get_parser().parse_args()
    os.makedirs(args.prediction_log_dir, exist_ok=True)
    sequentialization_client = AstSequentializationApiClient(
        args.sequentialization_api_host,
        args.sequentialization_api_port,
    )
    if args.format == 'AST':
        source_code_processor = AstSequenceProcessor(sequentialization_client)
    else:
        source_code_processor = TokenizedCodeProcessor(sequentialization_client)
    Evaluator(
        GwtSectionPredictionTransformer,
        parse_sampler_settings(args.sampler_settings),
        args.evaluation_dataset_path,
        data.Vocab(args.vocab_path),
        bpe.BpeProcessor(args.bpe_model_path),
        source_code_processor,
        args.max_prediction_length,
        args.num_workers,
        args.device,
        args.write_results_to_tensorboard,
        args.prediction_log_dir,
        args.log_interval,
        args.evaluation_dataset_ids_path,
    ).evaluate(args.tensorboard_log_dir, args.max_number_of_checkpoints)
def main(mode_type):
    # Load the vocabulary.
    vocab = data.Vocab(
        os.path.join(parameter_config.VOCAB_DIR, parameter_config.VOCAB_FILE_NAME),
        parameter_config.VOCAB_SIZE)
    batch_size = parameter_config.BATCH_SIZE
    if mode_type == 'decode':
        batch_size = 1

    # Set the model hyperparameters.
    hps = seq2seq_model.HParams(
        mode=mode_type,  # train, eval, decode
        batch_size=batch_size,
        enc_timesteps=parameter_config.ENC_TIMESTEPS,
        emb_dim=parameter_config.EMB_DIM,
        min_input_len=parameter_config.MIN_INPUT_LEN,
        num_hidden=parameter_config.NUM_HIDDEN,
        enc_layers=parameter_config.ENC_LAYERS,
        min_lr=parameter_config.MIN_LR,
        lr=parameter_config.LR,
        max_grad_norm=parameter_config.MAX_GRAD_NORM)
    tf.set_random_seed(111)

    if hps.mode == 'train':
        batcher = batch_reader.Batcher(parameter_config.TRAIN_DIR, vocab,
                                       'index', 'target', 'sentence', hps,
                                       bucketing=False, truncate_input=True)
        model = seq2seq_model.Seq2SeqModel(hps, vocab, num_gpus=0)
        _Train(model, batcher, parameter_config.TRAIN_STEP)
    elif hps.mode == 'eval':
        batcher = batch_reader.Batcher(parameter_config.EVALUATION_SET, vocab,
                                       'index', 'target', 'sentence', hps,
                                       bucketing=False, truncate_input=True)
        model = seq2seq_model.Seq2SeqModel(hps, vocab, num_gpus=0)
        _Eval(model, batcher)
    elif hps.mode == 'decode':
        batcher = batch_reader.Batcher(parameter_config.DECODE_DIR, vocab,
                                       'index', 'target', 'sentence', hps,
                                       bucketing=False, truncate_input=True)
        model = seq2seq_model.Seq2SeqModel(hps, vocab, num_gpus=0)
        if not os.path.exists(os.path.join(os.getcwd(), parameter_config.DECODE_STORE_DIR)):
            os.mkdir(os.path.join(os.getcwd(), parameter_config.DECODE_STORE_DIR))
        _Decode(model, batcher,
                os.path.join(parameter_config.DECODE_STORE_DIR,
                             parameter_config.DECODE_STORE_FILE))
    elif hps.mode == 'eval_step':
        model = seq2seq_model.Seq2SeqModel(hps, vocab, num_gpus=0)
        _Eval_Step(model)
    else:
        print('mode_type must be train, eval, decode, or eval_step')
def main(unused_argv):
    tf.logging.set_verbosity(tf.logging.INFO)
    vocab = data.Vocab(FLAGS.vocab_path, 1000000)
    # Check for presence of required special tokens.
    assert vocab.CheckVocab(data.PAD_TOKEN) > 0
    assert vocab.CheckVocab(data.UNKNOWN_TOKEN) >= 0
    assert vocab.CheckVocab(data.START_DECODING) > 0
    assert vocab.CheckVocab(data.STOP_DECODING) > 0

    batch_size = 4
    if FLAGS.mode == 'decode':
        batch_size = FLAGS.beam_size

    hps = seq2seq_attention_model.HParams(
        mode=FLAGS.mode,  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=batch_size,
        enc_layers=1,
        enc_timesteps=800,
        dec_timesteps=200,
        min_input_len=2,  # discard articles/summaries < than this
        num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=4096,  # If 0, no sampled softmax.
        trunc_norm_init_std=0.05)

    batcher = batch_reader.Batcher(
        FLAGS.data_path, vocab, hps, FLAGS.article_id_key, FLAGS.article_key,
        FLAGS.abstract_key, FLAGS.labels_key, FLAGS.section_names_key,
        FLAGS.sections_key, FLAGS.max_article_sentences,
        FLAGS.max_abstract_sentences, bucketing=FLAGS.use_bucketing,
        truncate_input=FLAGS.truncate_input)
    tf.set_random_seed(FLAGS.random_seed)

    if hps.mode == 'train':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Train(model, batcher)
    elif hps.mode == 'eval':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Eval(model, batcher, vocab=vocab)
    elif hps.mode == 'decode':
        decode_mdl_hps = hps
        # Only need to restore the 1st step and reuse it since
        # we keep and feed in state for each step's output.
        decode_mdl_hps = hps._replace(dec_timesteps=1)
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            decode_mdl_hps, vocab, num_gpus=FLAGS.num_gpus)
        decoder = seq2seq_attention_decode.BeamSearchDecoder(
            model, batcher, hps, vocab)
        decoder.decode_loop()
vocab_path = os.path.join(pkg_path, "data/textsum/data/vocab.txt")
data_path = os.path.join(pkg_path, "data/textsum/data/train.txt")

hps = seq2seq_attention_model.HParams(
    mode='train',  # train, eval, decode
    min_lr=0.01,  # min learning rate.
    lr=0.15,  # learning rate
    batch_size=2,
    enc_layers=2,
    enc_timesteps=100,
    dec_timesteps=20,
    min_input_len=2,  # discard articles/summaries < than this
    num_hidden=256,  # for rnn cell
    emb_dim=128,  # If 0, don't use embedding
    max_grad_norm=2,
    num_softmax_samples=0)  # If 0, no sampled softmax.

vocab = data.Vocab(vocab_path, 10000)
dataset = read_data_sets(data_path, vocab, hps)
article_batch, abstract_batch, targets, source_article, source_abstract = \
    dataset.next_batch(hps.batch_size)
print(article_batch)
print(source_article)
print(abstract_batch)
print(source_abstract)
print(targets)
print("\n")
article_batch1, abstract_batch1, targets1, _, _ = dataset.next_batch(hps.batch_size)
def main(unused_argv):
    vocab = data.Vocab(FLAGS.vocab_path, 1000000)
    # Check for presence of required special tokens.
    assert vocab.CheckVocab(data.PAD_TOKEN) > 0
    assert vocab.CheckVocab(data.UNKNOWN_TOKEN) >= 0
    assert vocab.CheckVocab(data.SENTENCE_START) > 0
    assert vocab.CheckVocab(data.SENTENCE_END) > 0

    batch_size = 4
    if FLAGS.mode == 'decode':
        batch_size = FLAGS.beam_size

    hps = seq2seq_attention_model.HParams(
        mode=FLAGS.mode,  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=batch_size,
        enc_layers=1,
        enc_timesteps=120,
        dec_timesteps=30,
        min_input_len=2,  # discard articles/summaries < than this
        num_hidden=128,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=4096)  # If 0, no sampled softmax.

    batcher = batch_reader.Batcher(
        FLAGS.data_path, vocab, hps, FLAGS.article_key, FLAGS.abstract_key,
        FLAGS.max_article_sentences, FLAGS.max_abstract_sentences,
        bucketing=FLAGS.use_bucketing, truncate_input=FLAGS.truncate_input)
    tf.set_random_seed(FLAGS.random_seed)

    if hps.mode == 'train':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Train(model, batcher)
    elif hps.mode == 'eval':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Eval(model, batcher, vocab=vocab)
    elif hps.mode == 'decode':
        decode_mdl_hps = hps
        # Only need to restore the 1st step and reuse it since
        # we keep and feed in state for each step's output.
        decode_mdl_hps = hps._replace(dec_timesteps=1)
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            decode_mdl_hps, vocab, num_gpus=FLAGS.num_gpus)
        to_build_grapth = True
        p = preprocessing(FLAGS.vocab_path)

        # Old decode loop:
        # while True:
        #     kb_input = input('> ')
        #     if kb_input == 'c':
        #         description_str = input('Enter description > ')
        #         context_str = input('Enter context > ')
        #         input_data = p.get_data(description=description_str, context=context_str)
        #         print('Input data:')
        #         pprint(input_data)
        #     elif kb_input == 'q':
        #         break
        #     else:
        #         try:
        #             text_to_binary('yahoo_knowledge_data/decode/ver_5/dataset_ready/data_ready_' + kb_input,
        #                            'yahoo_knowledge_data/decode/decode_data')
        #         except:
        #             print('Error in the default testing data')
        #         decoder = seq2seq_attention_decode.BSDecoder(model, hps, vocab, to_build_grapth)
        #         to_build_grapth = False
        #         decoder.DecodeLoop()

        # Decode loop used for the paper:
        file_num = 1
        while True:
            if file_num % 60 == 0:
                print('Printed 60 examples.')
                break
            try:
                text_to_binary(
                    'yahoo_knowledge_data/decode/ver_5/dataset_ready/data_ready_' + str(file_num),
                    'yahoo_knowledge_data/decode/decode_data')
            except:
                print('Error in the default testing data')
                break
            decoder = seq2seq_attention_decode.BSDecoder(
                model, hps, vocab, to_build_grapth)
            to_build_grapth = False
            decoder.DecodeLoop()
            print('==================', file_num, '==================')
            file_num += 1
import data
from checkpoint import Checkpoint
from editnts import EditNTS
from evaluator import Evaluator
import torch
from torch import nn
from argparse import ArgumentParser
import collections

vocab = data.Vocab()
vocab.add_vocab_from_file('vocab_data/vocab.txt', 30000)
vocab.add_embedding(gloveFile='vocab_data/glove.6B.100d.txt')
pos_vocab = data.POSvocab('vocab_data')

print("generating config")
hyperparams = collections.namedtuple('hps', [  # hyperparameters
    'vocab_size', 'embedding_dim', 'word_hidden_units', 'sent_hidden_units',
    'pretrained_embedding', 'word2id', 'id2word',
    'pos_vocab_size', 'pos_embedding_dim'
])
# hps = hyperparams(
#     vocab_size=vocab.count,
#     embedding_dim=100,
#     word_hidden_units=200,
#     sent_hidden_units=200,
#     pretrained_embedding=vocab.embedding,
#     word2id=vocab.w2i,
#     id2word=vocab.i2w,
def train(hps, device, summary):
    train_corpus = data.Corpus(hps['train_corpus'], hps['tokenization'])
    eval_corpus = data.Corpus(hps['eval_corpus'], hps['tokenization'])
    test_corpus = None
    token_list = train_corpus.export_token_list() + eval_corpus.export_token_list()
    if hps['test_corpus']:
        test_corpus = data.Corpus(hps['test_corpus'], hps['tokenization'])
        token_list += test_corpus.export_token_list()

    vocab = data.Vocab(token_list)
    vocab.save(hps['vocab_file'])
    vocab = data.Vocab.load(hps['vocab_file'])
    train_corpus.tokenize(vocab)
    eval_corpus.tokenize(vocab)
    ntokens = vocab.size()

    m = model.RNNModel(
        ntokens,
        hps['emsize'],
        hps['nhid'],
        hps['nlayers'],
        hps['dropout'],
        hps['tied'],
    ).to(device)
    criterion = torch.nn.CrossEntropyLoss()

    train_data = batchify(train_corpus.ids, hps['batch_size'], device)
    eval_data = batchify(eval_corpus.ids, hps['batch_size'], device)
    if test_corpus is not None:
        test_corpus.tokenize(vocab)
        test_data = batchify(test_corpus.ids, hps['batch_size'], device)
    else:
        test_data = None

    lr = hps['lr']
    best_val_loss = None
    val_loss_not_improved = 0
    n_batches = len(train_data) // hps['bptt']
    test_loss, test_perp = -1, -1

    # At any point you can hit Ctrl + C to break out of training early.
    print('-' * 95)
    try:
        for epoch in range(1, hps['epochs'] + 1):
            # Train for one epoch
            epoch_start_time = time.time()
            train_epoch(m, criterion, train_data, vocab, hps, lr, epoch, device, summary)
            elapsed = time.time() - epoch_start_time
            step = hps['batch_size'] * helpers.get_num_batches_seen(
                epoch, n_batches, n_batches)

            # Evaluate model on validation set
            with torch.no_grad():
                val_loss = evaluate(m, criterion, eval_data, vocab, hps)
            val_perp = math.exp(val_loss)
            summary.add_scalar('ValidationLoss', val_loss, step)
            summary.add_scalar('ValidationPerp', val_perp, step)

            # Evaluate model on test set
            if test_data is not None:
                with torch.no_grad():
                    test_loss = evaluate(m, criterion, test_data, vocab, hps)
                test_perp = math.exp(test_loss)
                test_bpc = test_loss * math.log2(math.e)
                summary.add_scalar('TestLoss', test_loss, step)
                summary.add_scalar('TestPerp', test_perp, step)
                summary.add_scalar('TestBPC', test_bpc, step)

            helpers.log_end_of_epoch(epoch, elapsed, val_loss, val_perp,
                                     test_loss, test_perp)

            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                os.makedirs(os.path.dirname(hps['save']), exist_ok=True)
                with open(hps['save'], 'wb') as f:
                    torch.save(m, f)
                best_val_loss = val_loss
            else:
                val_loss_not_improved += 1
                if val_loss_not_improved == 3:
                    # Anneal the learning rate if no improvement has been seen
                    # in the validation dataset.
                    lr /= 4
                    val_loss_not_improved = 0
            summary.add_scalar('LR', lr, step)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
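# --- Config sketch (not from the original repo) ---
# A minimal sketch of the dict that the train() function above indexes into.
# The key names are read off the function body; every value below is an
# illustrative assumption, not the project's actual configuration.
sketch_hps = {
    'train_corpus': 'data/train.txt',
    'eval_corpus': 'data/valid.txt',
    'test_corpus': None,          # optional; enables the test-set branch
    'tokenization': 'word',
    'vocab_file': 'vocab.json',
    'emsize': 200, 'nhid': 200, 'nlayers': 2,
    'dropout': 0.2, 'tied': False,
    'batch_size': 20, 'bptt': 35,
    'lr': 20.0, 'epochs': 6,
    'save': 'checkpoints/model.pt',
}
# train(sketch_hps, device=torch.device('cpu'), summary=summary_writer)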
                self.example_q_threads[idx] = new_t
                new_t.daemon = True
                new_t.start()
        for idx, t in enumerate(self.batch_q_threads):
            if not t.is_alive():  # if the thread is dead
                tf.logging.error("Found batch queue thread dead. Restarting.")
                new_t = Thread(target=self.fill_batch_queue)
                self.batch_q_threads[idx] = new_t
                new_t.daemon = True
                new_t.start()


###############################################################################

if __name__ == "__main__":
    vocab = data.Vocab("./data/vocab.txt", 0, 50)
    print vocab.tag_to_id
    print vocab.id_to_tag

    hps_dict = {
        "batch_size": 4,
        "max_steps": 50,
        "mode": "train",
        "single_pass": False,
    }
    hps = namedtuple("hps", hps_dict.keys())(**hps_dict)
    example = Example("现代化的战舰上", "BMESMES", vocab, hps)
    print example.sentence
    print example.label
    print example.len
import data

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--content', metavar='content')
    args = parser.parse_args()

    hps = seq2seq_attention_model.HParams(
        mode='eval',  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=1,
        enc_layers=4,
        enc_timesteps=120,
        dec_timesteps=30,
        min_input_len=2,  # discard articles/summaries < than this
        num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=4096)

    vocab = data.Vocab('vocabulary.txt', 1000000)
    model = seq2seq_attention_model.Seq2SeqAttentionModel(hps, vocab, num_gpus=0)
    model.build_graph()
    saver = tf.train.Saver()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    ckpt_state = tf.train.get_checkpoint_state('model_save')
    (summaries, loss, train_step) = model.run_eval_step(
        sess, [args.content], [''], targets, article_lens, abstract_lens,
        loss_weights)
def train(hps):
    train_corpus = data.Corpus(hps['train_corpus'], hps['tokenization'])
    eval_corpus = data.Corpus(hps['eval_corpus'], hps['tokenization'])
    token_list = train_corpus.export_token_list() + eval_corpus.export_token_list()
    if hps['test_corpus']:
        test_corpus = data.Corpus(hps['test_corpus'], hps['tokenization'])
        token_list += test_corpus.export_token_list()
    else:
        test_corpus = None

    vocab = data.Vocab(token_list)
    vocab.save(hps['vocab_file'])
    vocab = data.Vocab.load(hps['vocab_file'])
    train_corpus.tokenize(vocab)
    eval_corpus.tokenize(vocab)
    ntokens = vocab.size()

    m = model.RNNModel(
        ntokens,
        hps['emsize'],
        hps['nhid'],
        hps['nlayers'],
        hps['dropout'],
        hps['tied'],
    )
    if hps['cuda']:
        m.cuda()
    criterion = torch.nn.CrossEntropyLoss()

    train_data = batchify(train_corpus.ids, hps['batch_size'], hps['cuda'])
    eval_data = batchify(eval_corpus.ids, hps['batch_size'], hps['cuda'])
    if test_corpus is not None:
        test_corpus.tokenize(vocab)
        test_data = batchify(test_corpus.ids, hps['batch_size'], hps['cuda'])
    else:
        test_data = None

    lr = hps['lr']
    best_val_loss = None

    # At any point you can hit Ctrl + C to break out of training early.
    print('-' * 95)
    try:
        for epoch in range(1, hps['epochs'] + 1):
            epoch_start_time = time.time()
            train_epoch(m, criterion, train_data, vocab, hps, lr, epoch)
            val_loss = evaluate(m, criterion, eval_data, vocab, hps)
            print('-' * 95)
            print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                  'valid ppl {:14.8f}'.format(
                      epoch, (time.time() - epoch_start_time), val_loss,
                      math.exp(val_loss)))
            if test_data is not None:
                test_loss = evaluate(m, criterion, test_data, vocab, hps)
                print('| | test loss {:5.2f} | '
                      ' test ppl {:14.8f}'.format(test_loss, math.exp(test_loss)))
            print('-' * 95)

            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                os.makedirs(os.path.dirname(hps['save']), exist_ok=True)
                if hps['cuda']:
                    with open(hps['save'] + '.gpu', 'wb') as f:
                        torch.save(m, f)
                    m.cpu()
                    with open(hps['save'] + '.cpu', 'wb') as f:
                        torch.save(m, f)
                    m.cuda()
                else:
                    with open(hps['save'] + '.cpu', 'wb') as f:
                        torch.save(m, f)
                best_val_loss = val_loss
            else:
                # Anneal the learning rate if no improvement has been seen
                # in the validation dataset.
                lr /= 4
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')