def main(argv):
    config = Config()
    config.load_user_config()
    config.log.info("finish loading user config")

    train_file = config.args["train_file"]
    dev_file = config.args["dev_file"]
    old_glove_file = config.args["glove_file"]
    new_glove_file = config.args["glove_file"] + ".subset"

    # TODO(demi): switch "overwrite" to False
    train_data_raw, dev_data_raw, i2w, w2i, i2c, c2i, new_glove_file, glove_dim, vocab_size, char_vocab_size \
        = squad_read_data(config, train_file, dev_file, old_glove_file, new_glove_file, overwrite=True)
    config.log.info("finish reading squad data in raw formats")

    config.update_batch([("glove_file", new_glove_file),
                         ("glove_dim", glove_dim),
                         ("vocab_size", vocab_size),
                         ("char_vocab_size", char_vocab_size)])

    config.log.warning("reminder: only train/fake modes are currently supported")
    assert config.args["mode"] in ["train", "fake"], "mode (%s) not found" % config.args["mode"]

    train_id_conversion, train_data = make_dataset(config, train_data_raw, w2i, c2i)
    dev_id_conversion, dev_data = make_dataset(config, dev_data_raw, w2i, c2i)
    config.log.info("finish making datasets: reformatting raw data")

    train_data = QnADataset(train_data, config)
    dev_data = QnADataset(dev_data, config)
    config.log.info("finish generating datasets")

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=1, shuffle=True, **config.kwargs)
    dev_loader = torch.utils.data.DataLoader(dev_data, batch_size=1, **config.kwargs)
    config.log.info("finish generating data loaders")

    model = BiDAF(config, i2w)
    config.log.info("finish creating model")
    if config.args["use_cuda"]:
        model.cuda()

    # log config and model
    config.log.info("config = %s" % config.format_string())
    config.log.info("model = %s" % model)

    if config.args["optimizer"] == "Adam":
        optimizer = optim.Adam(model.get_train_parameters(), lr=config.args["lr"],
                               weight_decay=config.args["weight_decay"])
    elif config.args["optimizer"] == "Adamax":
        optimizer = optim.Adamax(model.get_train_parameters(), lr=config.args["lr"],
                                 weight_decay=config.args["weight_decay"])
    elif config.args["optimizer"] == "SGD":
        optimizer = torch.optim.SGD(model.get_train_parameters(), lr=config.args["lr"],
                                    momentum=0.9, weight_decay=config.args["weight_decay"])
    elif config.args["optimizer"] == "Adadelta":
        optimizer = torch.optim.Adadelta(model.get_train_parameters(), lr=config.args["lr"])

    trainer = Trainer(config)
    evaluator = Evaluator(config)

    def save_checkpoint(epoch):
        """Save a model checkpoint for the given epoch."""
        checkpoint = {"model_state_dict": model.state_dict(),
                      "config_args": config.args}
        if config.args["optimizer"] != "YF":  # YF can't save its state dict right now
            checkpoint["optimizer_state_dict"] = optimizer.state_dict()
        checkpoint_file = config.args["model_dir"] + config.args["model_name"] + "-EPOCH%d" % epoch
        torch.save(checkpoint, checkpoint_file)
        config.log.info("saving checkpoint: {}".format(checkpoint_file))

    for epoch in range(1, config.args["max_epoch"] + 1):
        config.log.info("training: epoch %d" % epoch)

        # QS(demi): do I need to return model & optimizer?
        model, optimizer, train_avg_loss, train_answer_dict = trainer.run(
            model, train_id_conversion[0], train_loader, optimizer, mode="train")
        model, optimizer, dev_avg_loss, dev_answer_dict = trainer.run(
            model, dev_id_conversion[0], dev_loader, optimizer, mode="dev")

        # each average loss is a float tensor of size 1
        config.log.info("[EPOCH %d] LOSS = (train)%.5lf | (dev)%.5lf"
                        % (epoch, train_avg_loss[0], dev_avg_loss[0]))

        # note: currently unused; answers below are written directly under model_dir
        answer_filename = "{}/{}-EPOCH{}".format(config.args["model_dir"], config.args["model_name"], epoch)

        config.log.info("[EVALUATION] TRAIN EVAL")
        evaluator.eval("official", train_file, train_answer_dict,
                       "{}/answer.train".format(config.args["model_dir"]))
        config.log.info("[EVALUATION] DEV EVAL")
        evaluator.eval("official", dev_file, dev_answer_dict,
                       "{}/answer.dev".format(config.args["model_dir"]))

        save_checkpoint(epoch)
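# The checkpoints written by save_checkpoint() above contain "model_state_dict",
# "config_args", and, for every optimizer except YF, "optimizer_state_dict".
# A minimal restore sketch, assuming a model and optimizer built the same way as
# in main(); the helper name below is illustrative and not part of this repo.
def load_checkpoint_sketch(checkpoint_file, model, optimizer=None):
    """Restore state saved by save_checkpoint and return the stored config args."""
    checkpoint = torch.load(checkpoint_file)
    model.load_state_dict(checkpoint["model_state_dict"])
    if optimizer is not None and "optimizer_state_dict" in checkpoint:
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    return checkpoint["config_args"]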
        # tail of the evaluation loop (enclosing function and batch loop not shown)
        exact_match += torch.sum(p1_corr * p2_corr).data[0]
        total += batch_size
        if i % 10 == 0:
            print('current acc: {:.3f}%'.format(100. * exact_match / total))

    print('======== Test result ========')
    print('p1 acc: {:.3f}%, p2 acc: {:.3f}%, EM: {:.3f}%'.format(
        100. * p1_acc / total, 100. * p2_acc / total, 100. * exact_match / total))
# }}}

# create model
model = BiDAF(args)
if torch.cuda.is_available():
    print('use cuda')
    model.cuda()

# resume from a checkpoint if one is given
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
if os.path.isfile(args.resume):
    print("=> loading checkpoint '{}'".format(args.resume))
    checkpoint = torch.load(args.resume)
    args.start_epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
else:
    print("=> no checkpoint found at '{}'".format(args.resume))

# exponential moving average of model parameters
ema = EMA(0.999)
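# EMA(0.999) above is referenced but not defined in this fragment. Below is a
# minimal sketch of a parameter exponential-moving-average helper, assuming the
# common shadow-parameter pattern (register each parameter once, update its
# shadow copy after every optimizer step); the repo's actual class may differ.
class EMA(object):
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, value):
        # keep a copy of the initial parameter value
        self.shadow[name] = value.clone()

    def update(self, name, value):
        # shadow <- decay * shadow + (1 - decay) * value
        new_average = self.decay * self.shadow[name] + (1.0 - self.decay) * value
        self.shadow[name] = new_average.clone()
        return new_average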
def main(NMT_config):
    ### Load RL (global) configurations ###
    config = parse_args()

    ### Load trained QA model ###
    QA_checkpoint = torch.load(config.data_dir + config.QA_best_model)
    QA_config = QA_checkpoint['config']

    QA_mod = BiDAF(QA_config)
    if QA_config.use_gpu:
        QA_mod.cuda()
    QA_mod.load_state_dict(QA_checkpoint['state_dict'])

    ### Load SQuAD dataset ###
    data_filter = get_squad_data_filter(QA_config)
    train_data = read_data(QA_config, 'train', QA_config.load, data_filter=data_filter)
    dev_data = read_data(QA_config, 'dev', True, data_filter=data_filter)
    update_config(QA_config, [train_data, dev_data])

    print("Total vocabulary for training is %s" % QA_config.word_vocab_size)

    # word vectors over the full vocabulary
    word2vec_dict = train_data.shared['lower_word2vec'] if QA_config.lower_word \
        else train_data.shared['word2vec']
    # word -> index over the filtered vocabulary
    word2idx_dict = train_data.shared['word2idx']

    # index -> vector for the filtered vocabulary
    idx2vec_dict = {word2idx_dict[word]: vec
                    for word, vec in word2vec_dict.items() if word in word2idx_dict}
    print("{}/{} unique words have corresponding glove vectors.".format(
        len(idx2vec_dict), len(word2idx_dict)))

    # <null> and <unk> have no corresponding vectors, so sample them randomly.
    emb_mat = np.array([
        idx2vec_dict[idx] if idx in idx2vec_dict
        else np.random.multivariate_normal(np.zeros(QA_config.word_emb_size),
                                           np.eye(QA_config.word_emb_size))
        for idx in range(QA_config.word_vocab_size)
    ])

    config.emb_mat = emb_mat
    config.new_emb_mat = train_data.shared['new_emb_mat']

    num_steps = int(math.ceil(train_data.num_examples /
                              (QA_config.batch_size * QA_config.num_gpus))) * QA_config.num_epochs

    # offset for the question mark
    NMT_config.max_length = QA_config.ques_size_th - 1
    NMT_config.batch_size = QA_config.batch_size

    ### Construct translator ###
    translator = make_translator(NMT_config, report_score=True)

    ### Construct optimizer ###
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, translator.model.parameters()),
                          lr=config.lr)

    ### Start RL training ###
    count = 0
    QA_mod.eval()
    F1_eval = F1Evaluator(QA_config, QA_mod)
    # eval_model(QA_mod, train_data, dev_data, QA_config, NMT_config, config, translator)

    for i in range(config.n_episodes):
        for batches in tqdm(train_data.get_multi_batches(QA_config.batch_size, QA_config.num_gpus,
                                                         num_steps=num_steps, shuffle=True,
                                                         cluster=QA_config.cluster),
                            total=num_steps):
            start = datetime.now()
            to_input(batches[0][1].data['q'], config.RL_path + config.RL_file)

            # obtain the rewritten question and its log-probability
            q, scores, log_prob = translator.translate(NMT_config.src_dir, NMT_config.src,
                                                       NMT_config.tgt, NMT_config.batch_size,
                                                       NMT_config.attn_debug)
            q, cq = ref_query(q)
            batches[0][1].data['q'] = q
            batches[0][1].data['cq'] = cq

            log_prob = torch.stack(log_prob).squeeze(-1)

            translator.model.zero_grad()
            QA_mod(batches)

            # use the QA model's F1 scores as rewards
            e = F1_eval.get_evaluation(batches, False, NMT_config, config, translator)
            reward = Variable(torch.FloatTensor(e.f1s), requires_grad=False)

            # policy-gradient loss from log-probabilities and rewards
            loss = create_loss(log_prob, reward)
            loss.backward()
            optimizer.step()
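# create_loss() above is not defined in this fragment. Below is a minimal
# REINFORCE-style sketch, assuming log_prob holds per-example log-probabilities
# of the rewritten questions and reward holds the matching per-example F1
# scores; the repo's actual loss may add a baseline or normalization.
def create_loss_sketch(log_prob, reward):
    # minimize the negative expected reward: -E[reward * log p(rewrite | question)]
    return -(log_prob * reward).mean()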