valid_dataloader = DataLoader(valid_dataset,
                              shuffle=True,
                              batch_size=hyper_params["batch_size"],
                              num_workers=4)

print("Length of training data loader is:", len(train_dataloader))
print("Length of valid data loader is:", len(valid_dataloader))

# load the model
model = BiDAF(word_vectors=word_embedding_matrix,
              char_vectors=char_embedding_matrix,
              hidden_size=hyper_params["hidden_size"],
              drop_prob=hyper_params["drop_prob"])
if hyper_params["pretrained"]:
    model.load_state_dict(torch.load(os.path.join(experiment_path, "model.pkl"))["state_dict"])
model.to(device)

# define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adadelta(model.parameters(),
                                 hyper_params["learning_rate"],
                                 weight_decay=1e-4)

# best loss so far
if hyper_params["pretrained"]:
    best_valid_loss = torch.load(os.path.join(experiment_path, "model.pkl"))["best_valid_loss"]
    epoch_checkpoint = torch.load(os.path.join(experiment_path, "model_last_checkpoint.pkl"))["epoch"]
    print("Best validation loss obtained after {} epochs is: {}".format(epoch_checkpoint, best_valid_loss))
else:
    best_valid_loss = 100
    epoch_checkpoint = 0
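# The setup above defines the data loaders, criterion, optimizer, and the
# best-validation-loss bookkeeping, but not the loop that uses them. Below is
# a minimal sketch of one possible train/validate epoch loop. It assumes a
# hyper_params["num_epochs"] entry and that each batch yields context word
# ids, context char ids, question word ids, question char ids, and answer
# start/end labels; adapt the unpacking to the actual dataset format.
for epoch in range(epoch_checkpoint, hyper_params["num_epochs"]):
    model.train()
    for w_context, c_context, w_question, c_question, label1, label2 in train_dataloader:
        w_context, c_context = w_context.to(device), c_context.to(device)
        w_question, c_question = w_question.to(device), c_question.to(device)
        label1, label2 = label1.to(device), label2.to(device)

        optimizer.zero_grad()
        pred1, pred2 = model(w_context, c_context, w_question, c_question)
        loss = criterion(pred1, label1) + criterion(pred2, label2)
        loss.backward()
        optimizer.step()

    # validation pass
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for w_context, c_context, w_question, c_question, label1, label2 in valid_dataloader:
            w_context, c_context = w_context.to(device), c_context.to(device)
            w_question, c_question = w_question.to(device), c_question.to(device)
            label1, label2 = label1.to(device), label2.to(device)
            pred1, pred2 = model(w_context, c_context, w_question, c_question)
            valid_loss += (criterion(pred1, label1) + criterion(pred2, label2)).item()
    valid_loss /= max(len(valid_dataloader), 1)

    # checkpoint whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save({"state_dict": model.state_dict(),
                    "best_valid_loss": best_valid_loss,
                    "epoch": epoch},
                   os.path.join(experiment_path, "model.pkl"))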
# create model
model = BiDAF(args)
if torch.cuda.is_available():
    print('use cuda')
    model.cuda()

# resume
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
if os.path.isfile(args.resume):
    print("=> loading checkpoint '{}'".format(args.resume))
    checkpoint = torch.load(args.resume)
    args.start_epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
else:
    print("=> no checkpoint found at '{}'".format(args.resume))

ema = EMA(0.999)
for name, param in model.named_parameters():
    if param.requires_grad:
        ema.register(name, param.data)

print('parameters-----')
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data.size())
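# EMA is used above but not defined in this snippet. A minimal sketch of an
# exponential-moving-average helper compatible with the
# ema.register(name, param.data) call is shown below; the update() method is
# an assumption about how the shadow weights would be refreshed after each
# optimizer step.
class EMA:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, value):
        # keep an independent copy of the initial parameter value
        self.shadow[name] = value.clone()

    def update(self, name, value):
        # shadow <- decay * shadow + (1 - decay) * current value
        assert name in self.shadow
        new_average = self.decay * self.shadow[name] + (1.0 - self.decay) * value
        self.shadow[name] = new_average.clone()
        return new_average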
def eval(context, question):
    with open(os.path.join(config.data_dir, "train", "word2idx.pkl"), "rb") as wi, \
         open(os.path.join(config.data_dir, "train", "char2idx.pkl"), "rb") as ci, \
         open(os.path.join(config.data_dir, "train", "word_embeddings.pkl"), "rb") as wb, \
         open(os.path.join(config.data_dir, "train", "char_embeddings.pkl"), "rb") as cb:
        word2idx = pickle.load(wi)
        char2idx = pickle.load(ci)
        word_embedding_matrix = pickle.load(wb)
        char_embedding_matrix = pickle.load(cb)

    # transform them into Tensors
    word_embedding_matrix = torch.from_numpy(np.array(word_embedding_matrix)).type(torch.float32)
    char_embedding_matrix = torch.from_numpy(np.array(char_embedding_matrix)).type(torch.float32)
    idx2word = dict([(y, x) for x, y in word2idx.items()])

    context = clean_text(context)
    context = [w for w in word_tokenize(context) if w]
    question = clean_text(question)
    question = [w for w in word_tokenize(question) if w]

    if len(context) > config.max_len_context:
        print("The context is too long. Maximum accepted length is", config.max_len_context, "words.")
    if max([len(w) for w in context]) > config.max_len_word:
        print("Some words in the context are longer than", config.max_len_word, "characters.")
    if len(question) > config.max_len_question:
        print("The question is too long. Maximum accepted length is", config.max_len_question, "words.")
    if max([len(w) for w in question]) > config.max_len_word:
        print("Some words in the question are longer than", config.max_len_word, "characters.")
    if len(question) < 3:
        print("The question is too short. It needs to be at least a three words question.")

    context_idx = np.zeros([config.max_len_context], dtype=np.int32)
    question_idx = np.zeros([config.max_len_question], dtype=np.int32)
    context_char_idx = np.zeros([config.max_len_context, config.max_len_word], dtype=np.int32)
    question_char_idx = np.zeros([config.max_len_question, config.max_len_word], dtype=np.int32)

    # replace 0 values with word and char IDs
    for j, word in enumerate(context):
        if word in word2idx:
            context_idx[j] = word2idx[word]
        else:
            context_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                context_char_idx[j, k] = char2idx[char]
            else:
                context_char_idx[j, k] = 1

    for j, word in enumerate(question):
        if word in word2idx:
            question_idx[j] = word2idx[word]
        else:
            question_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                question_char_idx[j, k] = char2idx[char]
            else:
                question_char_idx[j, k] = 1

    model = BiDAF(word_vectors=word_embedding_matrix,
                  char_vectors=char_embedding_matrix,
                  hidden_size=config.hidden_size,
                  drop_prob=config.drop_prob)
    try:
        if config.cuda:
            model.load_state_dict(torch.load(os.path.join(config.squad_models, "model_final.pkl"))["state_dict"])
        else:
            model.load_state_dict(torch.load(os.path.join(config.squad_models, "model_final.pkl"),
                                             map_location=lambda storage, loc: storage)["state_dict"])
        print("Model weights successfully loaded.")
    except:
        print("Model weights not found, initialized model with random weights.")
    model.to(device)
    model.eval()

    with torch.no_grad():
        context_idx = torch.tensor(context_idx, dtype=torch.int64).unsqueeze(0).to(device)
        context_char_idx = torch.tensor(context_char_idx, dtype=torch.int64).unsqueeze(0).to(device)
        question_idx = torch.tensor(question_idx, dtype=torch.int64).unsqueeze(0).to(device)
        question_char_idx = torch.tensor(question_char_idx, dtype=torch.int64).unsqueeze(0).to(device)

        pred1, pred2 = model(context_idx, context_char_idx, question_idx, question_char_idx)
        starts, ends = discretize(pred1.exp(), pred2.exp(), 15, False)
        prediction = " ".join(context[starts.item():ends.item() + 1])

    return prediction
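# Example usage of the eval() helper above, assuming the pickled vocabularies
# and (optionally) the trained checkpoint it loads are available on disk. The
# passage and question below are illustrative only.
if __name__ == "__main__":
    passage = ("The Apollo program was the third United States human "
               "spaceflight program carried out by NASA, which accomplished "
               "landing the first humans on the Moon from 1969 to 1972.")
    query = "Which program landed the first humans on the Moon?"
    print(eval(passage, query))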
test_dataloader = DataLoader(test_dataset,
                             shuffle=True,
                             batch_size=hyper_params["batch_size"],
                             num_workers=4)

print("Length of test data loader is:", len(test_dataloader))

# load the model
model = BiDAF(word_vectors=word_embedding_matrix,
              char_vectors=char_embedding_matrix,
              hidden_size=hyper_params["hidden_size"],
              drop_prob=hyper_params["drop_prob"])
try:
    if config.cuda:
        model.load_state_dict(torch.load(os.path.join(config.squad_models, "model_final.pkl"))["state_dict"])
    else:
        model.load_state_dict(torch.load(os.path.join(config.squad_models, "model_final.pkl"),
                                         map_location=lambda storage, loc: storage)["state_dict"])
    print("Model weights successfully loaded.")
except:
    print("Model weights not found, initialized model with random weights.")
model.to(device)

# define loss criterion
criterion = nn.CrossEntropyLoss()

model.eval()
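# A minimal sketch of the evaluation loop this setup would feed into, assuming
# the same batch layout as during training (word/char ids for context and
# question plus answer start/end labels). It only accumulates the test loss
# and does not compute EM/F1 metrics.
test_loss = 0.0
with torch.no_grad():
    for w_context, c_context, w_question, c_question, label1, label2 in test_dataloader:
        w_context, c_context = w_context.to(device), c_context.to(device)
        w_question, c_question = w_question.to(device), c_question.to(device)
        label1, label2 = label1.to(device), label2.to(device)
        pred1, pred2 = model(w_context, c_context, w_question, c_question)
        test_loss += (criterion(pred1, label1) + criterion(pred2, label2)).item()
print("Average test loss:", test_loss / max(len(test_dataloader), 1))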
def main(NMT_config):
    ### Load RL (global) configurations ###
    config = parse_args()

    ### Load trained QA model ###
    QA_checkpoint = torch.load(config.data_dir + config.QA_best_model)
    QA_config = QA_checkpoint['config']

    QA_mod = BiDAF(QA_config)
    if QA_config.use_gpu:
        QA_mod.cuda()
    QA_mod.load_state_dict(QA_checkpoint['state_dict'])

    ### Load SQuAD dataset ###
    data_filter = get_squad_data_filter(QA_config)
    train_data = read_data(QA_config, 'train', QA_config.load, data_filter=data_filter)
    dev_data = read_data(QA_config, 'dev', True, data_filter=data_filter)
    update_config(QA_config, [train_data, dev_data])

    print("Total vocabulary for training is %s" % QA_config.word_vocab_size)

    # from all words
    word2vec_dict = train_data.shared['lower_word2vec'] if QA_config.lower_word else train_data.shared['word2vec']
    # from the filtered set
    word2idx_dict = train_data.shared['word2idx']

    # filtered-set idx -> vector
    idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict}
    print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))

    # <null> and <unk> have no corresponding vectors, so initialize them randomly.
    emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
                        else np.random.multivariate_normal(np.zeros(QA_config.word_emb_size),
                                                           np.eye(QA_config.word_emb_size))
                        for idx in range(QA_config.word_vocab_size)])
    config.emb_mat = emb_mat
    config.new_emb_mat = train_data.shared['new_emb_mat']

    num_steps = int(math.ceil(train_data.num_examples /
                              (QA_config.batch_size * QA_config.num_gpus))) * QA_config.num_epochs

    # offset for question mark
    NMT_config.max_length = QA_config.ques_size_th - 1
    NMT_config.batch_size = QA_config.batch_size

    ### Construct translator ###
    translator = make_translator(NMT_config, report_score=True)

    ### Construct optimizer ###
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, translator.model.parameters()), lr=config.lr)

    ### Start RL training ###
    count = 0
    QA_mod.eval()
    F1_eval = F1Evaluator(QA_config, QA_mod)
    # eval_model(QA_mod, train_data, dev_data, QA_config, NMT_config, config, translator)

    for i in range(config.n_episodes):
        for batches in tqdm(train_data.get_multi_batches(QA_config.batch_size,
                                                         QA_config.num_gpus,
                                                         num_steps=num_steps,
                                                         shuffle=True,
                                                         cluster=QA_config.cluster),
                            total=num_steps):
            # for n, p in translator.model.named_parameters():
            #     print(n)
            #     print(p)
            #     print(p.requires_grad)
            start = datetime.now()
            to_input(batches[0][1].data['q'], config.RL_path + config.RL_file)

            # obtain rewrite and log_prob
            q, scores, log_prob = translator.translate(NMT_config.src_dir, NMT_config.src, NMT_config.tgt,
                                                       NMT_config.batch_size, NMT_config.attn_debug)
            q, cq = ref_query(q)
            batches[0][1].data['q'] = q
            batches[0][1].data['cq'] = cq

            log_prob = torch.stack(log_prob).squeeze(-1)
            # print(log_prob)

            translator.model.zero_grad()

            QA_mod(batches)
            e = F1_eval.get_evaluation(batches, False, NMT_config, config, translator)
            reward = Variable(torch.FloatTensor(e.f1s), requires_grad=False)
            # print(reward)

            ## Initial loss
            loss = create_loss(log_prob, reward)
            loss.backward()
            optimizer.step()
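# create_loss() is referenced above but not defined in this snippet. A minimal
# REINFORCE-style sketch consistent with its usage (per-example
# log-probabilities of the rewritten questions weighted by their F1 rewards)
# might look like the following; the sign, reduction, and the assumption that
# log_prob is already summed per example are all guesses, not the original
# implementation.
def create_loss(log_prob, reward):
    # maximizing the expected reward == minimizing the negative
    # reward-weighted log-likelihood of the sampled rewrites
    return -(log_prob * reward).mean()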