def _test(config):
    test_data = read_data(config, 'test', True)
    update_config(config, [test_data])
    _config_debug(config)

    if config.use_glove_for_unk:
        word2vec_dict = test_data.shared['lower_word2vec'] if config.lower_word else test_data.shared['word2vec']
        new_word2idx_dict = test_data.shared['new_word2idx']
        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
        config.new_emb_mat = new_emb_mat

    pprint(config.__flags, indent=2)
    models = get_multi_gpu_models(config)
    model = models[0]
    evaluator = MultiGPUEvaluator(
        config, models, tensor_dict=models[0].tensor_dict if config.vis else None)
    graph_handler = GraphHandler(config, model)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)

    num_steps = math.ceil(test_data.num_examples / (config.batch_size * config.num_gpus))
    if 0 < config.test_num_batches < num_steps:
        num_steps = config.test_num_batches

    e = None
    for multi_batch in tqdm(test_data.get_multi_batches(config.batch_size, config.num_gpus,
                                                        num_steps=num_steps, cluster=config.cluster),
                            total=num_steps):
        ei = evaluator.get_evaluation(sess, multi_batch)
        e = ei if e is None else e + ei
        if config.vis:
            eval_subdir = os.path.join(
                config.eval_dir, "{}-{}".format(ei.data_type, str(ei.global_step).zfill(6)))
            if not os.path.exists(eval_subdir):
                os.mkdir(eval_subdir)
            path = os.path.join(eval_subdir, str(ei.idxs[0]).zfill(8))
            graph_handler.dump_eval(ei, path=path)

    print("test acc: %f, loss: %f" % (e.acc, e.loss))
    if config.dump_answer:
        print("dumping answer ...")
        graph_handler.dump_answer(e)
    if config.dump_eval:
        print("dumping eval ...")
        graph_handler.dump_eval(e)
def train(config):
    train_data = read_data(config, 'train', config.load)
    dev_data = read_data(config, 'dev', True)
    update_config(config, [train_data, dev_data])
    _config_debug(config)

    word2vec_dict = train_data.shared['lower_word2vec']
    word2idx_dict = train_data.shared['word2idx']
    idx2vec_dict = {word2idx_dict[word]: vec
                    for word, vec in word2vec_dict.items() if word in word2idx_dict}
    # Use the GloVe vector when available; otherwise draw a random vector.
    emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
                        else np.random.multivariate_normal(np.zeros(config.word_emb_size),
                                                           np.eye(config.word_emb_size))
                        for idx in range(config.word_vocab_size)])
    config.emb_mat = emb_mat

    bidaf_model = train_bidaf()
def _test(config):
    test_data = read_data(config, 'test', True)
    update_config(config, [test_data])
    _config_debug(config)

    pprint(config.__flags, indent=2)
    models = get_multi_gpu_models(config)
    model = models[0]
    evaluator = AccuracyEvaluator(config.test_num_can, config, model,
                                  tensor_dict=models[0].tensor_dict if config.vis else None)
    graph_handler = GraphHandler(config, model)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)

    num_steps = math.ceil(test_data.num_examples / (config.batch_size * config.num_gpus))

    e = None
    tensor = []
    for i, multi_batch in enumerate(tqdm(
            test_data.get_multi_batches(config.batch_size, config.num_gpus,
                                        num_steps=num_steps, cluster=config.cluster),
            total=num_steps)):
        ei = evaluator.get_evaluation(sess, multi_batch)
        # outfinal = ei.tensor
        # tensor.extend(outfinal)
        e = ei if e is None else e + ei
        # if config.vis:
        #     eval_subdir = os.path.join(config.eval_dir,
        #                                "{}-{}".format(multi_batch[0][1].data_type, str(ei.global_step).zfill(6)))
        #     if not os.path.exists(eval_subdir):
        #         os.mkdir(eval_subdir)
        #     path = os.path.join(eval_subdir, str(ei.idxs[0]).zfill(8))
        #     graph_handler.dump_eval(ei, path=path)

    print(e.acc)
    if config.dump_eval:
        print("dumping eval ...")
        graph_handler.dump_eval(e)
    if config.dump_answer:
        print("dumping answers ...")
        graph_handler.dump_answer(e)
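# Note: the `e = ei if e is None else e + ei` pattern in the test loops above relies on the
# per-batch evaluation objects supporting `+`. The evaluator classes are not shown in this
# file, so the following is only a minimal sketch, assuming a hypothetical `Evaluation` type
# that accumulates correctness counts and summed loss; the real AccuracyEvaluator /
# MultiGPUEvaluator results may carry more fields (idxs, summaries, tensors, global_step).
class Evaluation(object):
    def __init__(self, num_examples, num_correct, loss_sum):
        self.num_examples = num_examples
        self.num_correct = num_correct
        self.loss_sum = loss_sum

    @property
    def acc(self):
        # accuracy over all examples seen so far
        return self.num_correct / self.num_examples

    @property
    def loss(self):
        # average loss over all examples seen so far
        return self.loss_sum / self.num_examples

    def __add__(self, other):
        # Summing two batch evaluations yields the evaluation of their union,
        # which is what makes the running `e + ei` accumulation correct.
        return Evaluation(self.num_examples + other.num_examples,
                          self.num_correct + other.num_correct,
                          self.loss_sum + other.loss_sum)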
def main():
    config = parse_args()
    data_filter = get_squad_data_filter(config)
    train_data = read_data(config, 'train', config.load, data_filter=data_filter)
    dev_data = read_data(config, 'dev', True, data_filter=data_filter)
    update_config(config, [train_data, dev_data])
    _config_debug(config)
    print("Total vocabulary for training is %s" % config.word_vocab_size)

    word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
    word2idx_dict = train_data.shared['word2idx']
    idx2vec_dict = {word2idx_dict[word]: vec
                    for word, vec in word2vec_dict.items() if word in word2idx_dict}
    # If the word has a GloVe vector, use it; otherwise assign a random vector.
    emb_mat = np.array([
        idx2vec_dict[idx] if idx in idx2vec_dict
        else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
        for idx in range(config.word_vocab_size)
    ])
    config.emb_mat = emb_mat

    ## Initialize model
    model = BiDAF(config)
    if config.use_gpu:
        model.cuda()
    optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=0.5)

    ## Begin training
    num_steps = config.num_steps or int(
        math.ceil(train_data.num_examples / (config.batch_size * config.num_gpus))) * config.num_epochs
    global_step = 0
    print(num_steps)
    count = 1
    train_loss = []
    for batches in tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus,
                                                     num_steps=num_steps, shuffle=True,
                                                     cluster=config.cluster),
                        total=num_steps):
        model.train()
        model.zero_grad()
        model(batches)
        loss = model.build_loss()
        loss.backward()
        optimizer.step()
        if count % 100 == 0:
            eval_loss = eval_model(model, dev_data, config)
            print("train loss is: %.3f" % loss.data.cpu().numpy()[0])
            print("eval loss is: %.3f \n" % eval_loss)
            model.train()
        count += 1
    return
def _train(config):
    np.set_printoptions(threshold=np.inf)
    train_data = read_data(config, 'train', config.load)
    dev_data = read_data(config, 'dev', True)
    update_config(config, [train_data, dev_data])
    _config_debug(config)

    word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
    word2idx_dict = train_data.shared['word2idx']
    idx2vec_dict = {word2idx_dict[word]: vec
                    for word, vec in word2vec_dict.items() if word in word2idx_dict}
    emb_mat = np.array([
        idx2vec_dict[idx] if idx in idx2vec_dict
        else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
        for idx in range(config.word_vocab_size)
    ])
    config.emb_mat = emb_mat

    def make_idx2word():
        """Return the index-to-word mapping built from the preprocessed dictionary."""
        idx2word = {}
        d = train_data.shared['word2idx']
        for word, idx in d.items():
            print(word)
            idx2word[idx] = word
        if config.use_glove_for_unk:
            d2 = train_data.shared['new_word2idx']
            for word, idx in d2.items():
                print(word)
                idx2word[idx + len(d)] = word
        return idx2word

    idx2word = make_idx2word()
    # Save the total number of words used in this dictionary:
    # words in GloVe + extra tokens (UNK, POS, ... etc.)
    print("size of config.id2word len:", len(idx2word))
    print("size of config.total_word_vocab_size:", config.total_word_vocab_size)

    # construct model graph and variables (using default graph)
    pprint(config.__flags, indent=2)
    models = get_multi_gpu_models(config)
    model = models[0]
    print("num params: {}".format(get_num_params()))
    trainer = MultiGPUTrainer(config, models)
    evaluator = MultiGPUEvaluator(
        config, models, tensor_dict=model.tensor_dict if config.vis else None)
    graph_handler = GraphHandler(config, model)  # controls all tensors and variables in the graph, including loading/saving

    # Variables
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)

    num_steps = config.num_steps or int(
        math.ceil(train_data.num_examples / (config.batch_size * config.num_gpus))) * config.num_epochs

    min_val = {}
    min_val['loss'] = 100.0
    min_val['acc'] = 0
    min_val['step'] = 0
    min_val['patience'] = 0

    for batches in tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus,
                                                     num_steps=num_steps, shuffle=True,
                                                     cluster=config.cluster),
                        total=num_steps):
        global_step = sess.run(model.global_step) + 1  # +1 because all calculations are done after the step
        get_summary = global_step % config.log_period == 0
        loss, summary, train_op = trainer.step(sess, batches, get_summary=get_summary)
        if get_summary:
            graph_handler.add_summary(summary, global_step)

        # occasional saving
        if global_step % config.save_period == 0:
            graph_handler.save(sess, global_step=global_step)

        if not config.eval:
            continue

        # occasional evaluation
        if global_step % config.eval_period == 0:
            num_steps = math.ceil(dev_data.num_examples / (config.batch_size * config.num_gpus))
            # num_steps: steps needed to cover the whole dev set.
            # val_num_batches: 100
            if 0 < config.val_num_batches < num_steps:
                # If config.val_num_batches is less than the steps required to run the whole dev set,
                # only evaluate up to that many batches.
                num_steps = config.val_num_batches

            # This train loss is calculated by sampling the same number of examples as dev_data.
            e_train = evaluator.get_evaluation_from_batches(
                sess, tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus,
                                                        num_steps=num_steps),
                           total=num_steps))
            graph_handler.add_summaries(e_train.summaries, global_step)

            # This e_dev may differ from the dev set used at test time because some data is filtered out here.
            e_dev = evaluator.get_evaluation_from_batches(
                sess, tqdm(dev_data.get_multi_batches(config.batch_size, config.num_gpus,
                                                      num_steps=num_steps),
                           total=num_steps))
            graph_handler.add_summaries(e_dev.summaries, global_step)

            print("%s e_train: loss=%.4f" % (header, e_train.loss))
            print("%s e_dev: loss=%.4f" % (header, e_dev.loss))
            print()

            if min_val['loss'] > e_dev.loss:
                min_val['loss'] = e_dev.loss
                min_val['step'] = global_step
                min_val['patience'] = 0
            else:
                min_val['patience'] = min_val['patience'] + 1
                if min_val['patience'] >= 1000:
                    slack.notify(text="%s patience reached %d. early stopping." % (header, min_val['patience']))
                    break

            slack.notify(text="%s e_dev: loss=%.4f" % (header, e_dev.loss))
            if config.dump_eval:
                graph_handler.dump_eval(e_dev)
            if config.dump_answer:
                graph_handler.dump_answer(e_dev)

    slack.notify(
        text="%s <@U024BE7LH|insikk> Train is finished. e_dev: loss=%.4f at step=%d\n"
             "Please assign another task to get more research result" % (header, min_val['loss'], min_val['step']))
    if global_step % config.save_period != 0:
        graph_handler.save(sess, global_step=global_step)
def main():
    config = parse_args()
    data_filter = get_squad_data_filter(config)
    train_data = read_data(config, 'train', config.load, data_filter=data_filter)
    dev_data = read_data(config, 'dev', True, data_filter=data_filter)
    #print("Total vocabulary for training is %s" % config.word_vocab_size)
    #print(train_data.shared['x'][0][0])
    #print(train_data.shared['x'][0][1])
    #print(train_data.data['*x'][0])
    update_config(config, [train_data, dev_data])
    #_config_debug(config)
    print("Total vocabulary for training is %s" % config.word_vocab_size)

    # from all
    word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
    # from filter-out set
    word2idx_dict = train_data.shared['word2idx']
    # filter-out set idx-vector
    idx2vec_dict = {word2idx_dict[word]: vec
                    for word, vec in word2vec_dict.items() if word in word2idx_dict}
    print("{}/{} unique words have corresponding glove vectors.".format(
        len(idx2vec_dict), len(word2idx_dict)))

    # <null> and <unk> do not have corresponding vectors, so they are random.
    emb_mat = np.array([
        idx2vec_dict[idx] if idx in idx2vec_dict
        else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
        for idx in range(config.word_vocab_size)
    ])
    config.emb_mat = emb_mat
    config.new_emb_mat = train_data.shared['new_emb_mat']
    print(emb_mat.shape, config.new_emb_mat.shape)

    ## Initialize model
    model = BiDAF(config)
    if config.use_gpu:
        model.cuda()
    #optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=config.init_lr)
    print("learning rate is: %.4f" % config.init_lr)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.init_lr)

    ## Begin training
    num_steps = config.num_steps or int(
        math.ceil(train_data.num_examples / (config.batch_size * config.num_gpus))) * config.num_epochs
    global_step = 0
    train_loss = []
    count = 0
    for batches in tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus,
                                                     num_steps=num_steps, shuffle=True,
                                                     cluster=config.cluster),
                        total=num_steps):
        model.train()
        model.zero_grad()
        model(batches)
        model.loss = model.build_loss()
        model.loss.backward()
        optimizer.step()
        if config.test_run:
            eval_model(model, train_data, dev_data, config)
            break
        else:
            if count % 500 == 0:
                #print("train loss is: %.4f" % model.loss.data.cpu().numpy()[0])
                eval_model(model, train_data, dev_data, config)
                #print("eval loss is: %.4f \n" % eval_loss)
            count += 1
    return
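# eval_model() is called in the training loops above but not defined in this file, and its
# signature differs between callers. The following is only a minimal sketch matching the
# (model, train_data, dev_data, config) call used here, assuming that model(batches) followed
# by model.build_loss() returns the batch loss as in the training loop; the real helper may
# compute additional metrics or use a different batching scheme.
def eval_model(model, train_data, dev_data, config):
    model.eval()  # disable dropout etc. while evaluating
    num_steps = int(math.ceil(dev_data.num_examples / (config.batch_size * config.num_gpus)))
    losses = []
    for batches in dev_data.get_multi_batches(config.batch_size, config.num_gpus,
                                              num_steps=num_steps, cluster=config.cluster):
        model(batches)
        loss = model.build_loss()
        losses.append(loss.data.cpu().numpy()[0])
    eval_loss = sum(losses) / max(len(losses), 1)
    print("eval loss is: %.4f" % eval_loss)
    model.train()  # restore training mode for the caller
    return eval_loss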
def _train(config):
    train_data = read_data(config, 'val_train', config.load)
    dev_data = read_data(config, 'val_val', True)
    # test = read_data(config, 'test', True)
    update_config(config, [train_data, dev_data])
    _config_debug(config)

    word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
    word2idx_dict = train_data.shared['word2idx']
    idx2vec_dict = {word2idx_dict[word]: vec
                    for word, vec in word2vec_dict.items() if word in word2idx_dict}
    emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
                        else np.random.multivariate_normal(np.zeros(config.word_emb_size),
                                                           np.eye(config.word_emb_size))
                        for idx in range(config.word_vocab_size)])
    config.emb_mat = emb_mat

    pprint(config.__flags, indent=2)
    models = get_multi_gpu_models(config)
    model = models[0]
    print("num params: {}".format(get_num_params()))
    trainer = MultiGPUTrainer(config, models)
    evaluator = AccuracyEvaluator(config.train_num_can, config, model,
                                  tensor_dict=model.tensor_dict if config.vis else None)
    graph_handler = GraphHandler(config, model)  # controls all tensors and variables in the graph, including loading/saving

    # Variables
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)

    # Begin training
    num_steps = config.num_steps or int(
        math.ceil(train_data.num_examples / (config.batch_size * config.num_gpus))) * config.num_epochs
    global_step = 0
    best_dev = [0, 0]
    for batches in tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus,
                                                     num_steps=num_steps, shuffle=False,
                                                     cluster=config.cluster),
                        total=num_steps):
        global_step = sess.run(model.global_step) + 1  # +1 because all calculations are done after the step
        get_summary = global_step % config.log_period == 0
        loss, summary, train_op = trainer.step(sess, batches, get_summary=get_summary)
        if get_summary:
            graph_handler.add_summary(summary, global_step)

        if not config.eval:
            continue

        if global_step % config.eval_period == 0:
            num_steps_dev = math.ceil(dev_data.num_examples / (config.batch_size * config.num_gpus))
            num_steps_train = math.ceil(train_data.num_examples / (config.batch_size * config.num_gpus))
            e_train = evaluator.get_evaluation_from_batches(
                sess, tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus,
                                                        num_steps=num_steps_train),
                           total=num_steps_train))
            # graph_handler.add_summaries(e_test.summaries, global_step)
            e_dev = evaluator.get_evaluation_from_batches(
                sess, tqdm(dev_data.get_multi_batches(config.batch_size, config.num_gpus,
                                                      num_steps=num_steps_dev),
                           total=num_steps_dev))
            # graph_handler.dump_eval(e)
            # graph_handler.add_summaries(e_dev.summaries, global_step)
            print('train step:{} loss:{} acc:{}'.format(global_step, e_train.loss, e_train.acc))
            print('val step:{} loss:{} acc:{}'.format(global_step, e_dev.loss, e_dev.acc))
            # print('w_s:{}'.format(w_s))

            if global_step > 700:
                config.save_period = 50
                config.eval_period = 50
            if best_dev[0] < e_dev.acc:
                best_dev = [e_dev.acc, global_step, e_train.acc]
                graph_handler.save(sess, global_step=global_step)
            # if config.dump_eval:
            #     graph_handler.dump_eval(e_dev)

    if global_step % config.save_period != 0:
        graph_handler.save(sess, global_step=global_step)
    print(best_dev)
    print("you can test on the test data set by setting the load step to {}".format(best_dev[1]))
def main(NMT_config):
    ### Load RL (global) configurations ###
    config = parse_args()

    ### Load trained QA model ###
    QA_checkpoint = torch.load(config.data_dir + config.QA_best_model)
    QA_config = QA_checkpoint['config']

    QA_mod = BiDAF(QA_config)
    if QA_config.use_gpu:
        QA_mod.cuda()
    QA_mod.load_state_dict(QA_checkpoint['state_dict'])

    ### Load SQuAD dataset ###
    data_filter = get_squad_data_filter(QA_config)
    train_data = read_data(QA_config, 'train', QA_config.load, data_filter=data_filter)
    dev_data = read_data(QA_config, 'dev', True, data_filter=data_filter)
    update_config(QA_config, [train_data, dev_data])

    print("Total vocabulary for training is %s" % QA_config.word_vocab_size)

    # from all
    word2vec_dict = train_data.shared['lower_word2vec'] if QA_config.lower_word else train_data.shared['word2vec']
    # from filter-out set
    word2idx_dict = train_data.shared['word2idx']
    # filter-out set idx-vector
    idx2vec_dict = {word2idx_dict[word]: vec
                    for word, vec in word2vec_dict.items() if word in word2idx_dict}
    print("{}/{} unique words have corresponding glove vectors.".format(
        len(idx2vec_dict), len(word2idx_dict)))

    # <null> and <unk> do not have corresponding vectors, so they are random.
    emb_mat = np.array([
        idx2vec_dict[idx] if idx in idx2vec_dict
        else np.random.multivariate_normal(np.zeros(QA_config.word_emb_size), np.eye(QA_config.word_emb_size))
        for idx in range(QA_config.word_vocab_size)
    ])
    config.emb_mat = emb_mat
    config.new_emb_mat = train_data.shared['new_emb_mat']

    num_steps = int(
        math.ceil(train_data.num_examples / (QA_config.batch_size * QA_config.num_gpus))) * QA_config.num_epochs

    # offset for question mark
    NMT_config.max_length = QA_config.ques_size_th - 1
    NMT_config.batch_size = QA_config.batch_size

    ### Construct translator ###
    translator = make_translator(NMT_config, report_score=True)

    ### Construct optimizer ###
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, translator.model.parameters()), lr=config.lr)

    ### Start RL training ###
    count = 0
    QA_mod.eval()
    F1_eval = F1Evaluator(QA_config, QA_mod)
    #eval_model(QA_mod, train_data, dev_data, QA_config, NMT_config, config, translator)

    for i in range(config.n_episodes):
        for batches in tqdm(train_data.get_multi_batches(QA_config.batch_size, QA_config.num_gpus,
                                                         num_steps=num_steps, shuffle=True,
                                                         cluster=QA_config.cluster),
                            total=num_steps):
            #for n, p in translator.model.named_parameters():
            #    print(n)
            #    print(p)
            #    print(p.requires_grad)

            start = datetime.now()
            to_input(batches[0][1].data['q'], config.RL_path + config.RL_file)

            # obtain rewrite and log_prob
            q, scores, log_prob = translator.translate(NMT_config.src_dir, NMT_config.src,
                                                       NMT_config.tgt, NMT_config.batch_size,
                                                       NMT_config.attn_debug)
            q, cq = ref_query(q)
            batches[0][1].data['q'] = q
            batches[0][1].data['cq'] = cq

            log_prob = torch.stack(log_prob).squeeze(-1)
            #print(log_prob)

            translator.model.zero_grad()
            QA_mod(batches)

            e = F1_eval.get_evaluation(batches, False, NMT_config, config, translator)
            reward = Variable(torch.FloatTensor(e.f1s), requires_grad=False)
            #print(reward)

            ## Initial loss
            loss = create_loss(log_prob, reward)
            loss.backward()
            optimizer.step()
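# create_loss() is not defined in this file. The loop above multiplies the translator's
# log-probabilities by the QA model's F1 scores, which is the standard REINFORCE /
# policy-gradient estimator, so the following is only a minimal sketch under that assumption.
# The assumed shapes are (seq_len, batch) for log_prob and (batch,) for reward, both on the
# same device; the real implementation may subtract a baseline or normalize rewards.
def create_loss(log_prob, reward):
    # log_prob: per-token log-probabilities of the sampled question rewrite
    # reward:   F1 of the QA model on the rewritten question, treated as a constant
    seq_log_prob = log_prob.sum(dim=0)       # log-probability of each sampled sequence
    return -(seq_log_prob * reward).mean()   # negate so that minimizing maximizes expected reward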