def run_test(test_data, net, rev_emb_dict, end_token, device="cuda"):
    argmax_reward_sum = 0.0
    argmax_reward_count = 0.0
    # p1 is one sentence, p2 is a sentence list.
    for p1, p2 in test_data:
        # Transform the sentence into padded embeddings.
        input_seq = net.pack_input(p1, net.emb, device)
        # Get hidden states from the encoder.
        # enc = net.encode(input_seq)
        context, enc = net.encode_context(input_seq)
        # Decode the sequence by feeding each predicted token back into the net. Act greedily.
        # Returns N*output_vocab logits and N output token indices.
        # beg_token (the '#BEG' token id) is expected to be defined at module level.
        _, tokens = net.decode_chain_argmax(enc, net.emb(beg_token), seq_len=data.MAX_TOKENS,
                                            context=context[0], stop_at_token=end_token)
        # Show what the output action sequence is.
        action_tokens = []
        for temp_idx in tokens:
            if temp_idx in rev_emb_dict and rev_emb_dict.get(temp_idx) != '#END':
                action_tokens.append(str(rev_emb_dict.get(temp_idx)).upper())
        # Use the 0-1 reward to compute accuracy.
        reward = utils.calc_True_Reward(action_tokens, p2, False)
        # reward = random.random()
        argmax_reward_sum += float(reward)
        argmax_reward_count += 1
    return float(argmax_reward_sum) / float(argmax_reward_count)
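# The decode_chain_argmax call above generates the action sequence greedily, feeding each
# predicted token back in as the next decoder input. Below is a minimal sketch of such a loop,
# for illustration only; `emb`, `cell` (an nn.LSTMCell) and `output_proj` are assumed module
# names, not the repository's actual implementation.
import torch

def greedy_decode_sketch(emb, cell, output_proj, hid, beg_emb, seq_len, stop_at_token=None):
    # hid: initial (h, c) decoder state; beg_emb: embedding of the '#BEG' token, shape (1, emb_dim).
    res_logits, res_tokens = [], []
    cur_emb = beg_emb
    for _ in range(seq_len):
        h, c = cell(cur_emb, hid)                  # one decoder step
        out_logits = output_proj(h)                # shape (1, vocab_size)
        out_token = torch.argmax(out_logits, dim=1)
        res_logits.append(out_logits)
        res_tokens.append(out_token.item())
        cur_emb = emb(out_token)                   # feed the predicted token back in
        hid = (h, c)
        if stop_at_token is not None and res_tokens[-1] == stop_at_token:
            break
    return torch.cat(res_logits), res_tokens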
def run_test_true_reward(test_data, net, rev_emb_dict, end_token, device="cuda"):
    argmax_reward_sum = 0.0
    argmax_reward_count = 0.0
    # BEGIN token (emb_dict, tokenizer, max_tokens and args are module-level globals).
    beg_token = torch.LongTensor([emb_dict[data.BEGIN_TOKEN]]).to(device)
    # p1 is one sentence, p2 is a sentence list.
    for p1, p2 in test_data:
        p_list = [(p1, p2)]
        input_ids, attention_masks = tokenizer_encode(tokenizer, p_list, rev_emb_dict, device, max_tokens)
        output, output_hidden_states = net.bert_encode(input_ids, attention_masks)
        # Use the pooled BERT output as both the initial hidden and cell state of the decoder.
        context, enc = output_hidden_states, (output.unsqueeze(0), output.unsqueeze(0))
        input_seq = net.pack_input(p1, net.emb, device)
        # Returns logits (N*output_vocab) and res_tokens (1*N).
        # Always use the first token of the input sequence, which is '#BEG', as the initial decoder input.
        # The maximum output length is defined in libbots.data.
        _, tokens = net.decode_chain_argmax(enc, input_seq.data[0:1], seq_len=data.MAX_TOKENS,
                                            context=context[0], stop_at_token=end_token)
        action_tokens = []
        for temp_idx in tokens:
            if temp_idx in rev_emb_dict and rev_emb_dict.get(temp_idx) != '#END':
                action_tokens.append(str(rev_emb_dict.get(temp_idx)).upper())
        # Use the 0-1 reward to compute accuracy.
        if args.dataset == "csqa":
            argmax_reward_sum += float(utils.calc_True_Reward(action_tokens, p2, False))
        else:
            argmax_reward_sum += float(utils.calc_True_Reward_webqsp_novar(action_tokens, p2, False))
        argmax_reward_count += 1
    if argmax_reward_count == 0:
        return 0.0
    else:
        return float(argmax_reward_sum) / float(argmax_reward_count)
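# run_test_true_reward bridges a BERT encoder to a recurrent decoder: the pooled sentence vector is
# unsqueezed to (num_layers=1, batch, hidden) and reused as both h0 and c0, while the per-token
# hidden states serve as the attention context. A self-contained sketch of this bridging, assuming
# a Hugging Face bert-base-uncased model and a decoder whose hidden size matches BERT's 768:
import torch
from transformers import BertModel, BertTokenizer

bert = BertModel.from_pretrained("bert-base-uncased")
tok = BertTokenizer.from_pretrained("bert-base-uncased")

batch = tok(["which river flows through the capital of germany"], return_tensors="pt", padding=True)
out = bert(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])

token_states = out.last_hidden_state    # (batch, seq_len, 768): attention context for the decoder
pooled = out.pooler_output              # (batch, 768): summary vector of the question

# An LSTM decoder expects states shaped (num_layers, batch, hidden); the same pooled
# vector initialises both the hidden and the cell state, mirroring the code above.
decoder_init_state = (pooled.unsqueeze(0), pooled.unsqueeze(0))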
argmax_reward_sum = 0.0
argmax_reward_count = 0.0
# Test. Each test_task is a (p1, p2) pair: one sentence plus its annotation list.
for test_task in test_data:
    _, action_tokens = metaLearner.maml_retriever_sampleForTest(
        task=test_task, old_param_dict=stage1_old_param_dict,
        docID_dict=docID_dict, rev_docID_dict=rev_docID_dict,
        emb_dict=emb_dict, qtype_docs_range=qtype_docs_range,
        steps=args.steps)
    # Use the 0-1 reward to compute accuracy.
    argmax_reward_sum += float(utils.calc_True_Reward(action_tokens, test_task[1], False))
    # argmax_reward_sum += random.random()
    argmax_reward_count += 1
true_reward_test = float(argmax_reward_sum) / float(argmax_reward_count)

# The parameters are stored whenever the best true reward improves.
if best_true_reward is None or best_true_reward < true_reward_test:
    best_true_reward = true_reward_test
    log.info("Best true reward updated: %.4f", true_reward_test)
    # Save the seq2seq parameters updated by RL.
    torch.save(net.state_dict(),
               os.path.join(saves_path, "net_truereward_%.3f_%02d.dat" % (true_reward_test, epoch)))
writer.add_scalar("skipped_samples", retriever_skipped_samples / retriever_total_samples if retriever_total_samples != 0 else 0, retriever_batch_idx) log.info("Retriever epoch %d, retriever_skipped_samples: %d, retriever_total_samples: %d", epoch, retriever_skipped_samples, retriever_total_samples) writer.add_scalar("epoch", retriever_batch_idx, epoch) log.info("---------------------------") log.info("Retriever epoch %d, Stage 2 training is over...", epoch) log.info("---------------------------") argmax_reward_sum = 0.0 argmax_reward_count = 0.0 # test # p1 is one sentence, p2 is sentence list. for test_task in test_data: _, action_tokens = metaLearner.maml_retriever_sampleForTest(task=test_task, old_param_dict=stage1_old_param_dict, docID_dict=docID_dict, rev_docID_dict=rev_docID_dict, emb_dict=emb_dict, qtype_docs_range=qtype_docs_range, steps=args.steps) # Using 0-1 reward to compute accuracy. argmax_reward_sum += float(utils.calc_True_Reward(action_tokens, test_task[1], False)) # argmax_reward_sum += random.random() argmax_reward_count += 1 true_reward_test = float(argmax_reward_sum) / float(argmax_reward_count) # # The parameters are stored after each epoch. if best_true_reward is None or best_true_reward < true_reward_test: best_true_reward = true_reward_test log.info("Best true reward updated: %.4f", true_reward_test) # Save the updated seq2seq parameters trained by RL. torch.save(net.state_dict(), os.path.join(saves_path, "net_truereward_%.3f_%02d.dat" % (true_reward_test, epoch))) torch.save(retriever_net.state_dict(), os.path.join(saves_path, "retriever_truereward_%.3f_%02d.dat" % (true_reward_test, epoch))) # # The parameters are stored after each epoch. torch.save(net.state_dict(), os.path.join(saves_path, "net_epoch_%03d_%.3f_%.3f.dat" % (epoch, float(maml_true_reward_armax), true_reward_test))) torch.save(retriever_net.state_dict(), os.path.join(saves_path, "retriever_epoch_%03d_%.3f_%.3f.dat" % (epoch, float(retriever_true_reward_argmax), float(true_reward_test)))) log.info("---------------------------")
    item_enc, beg_embedding, data.MAX_TOKENS,
    context[idx], stop_at_token=end_token)
# Show what the output action sequence is.
action_tokens = []
for temp_idx in actions:
    if temp_idx in rev_emb_dict and rev_emb_dict.get(temp_idx) != '#END':
        action_tokens.append(str(rev_emb_dict.get(temp_idx)).upper())
# Get the greedy (argmax) reward as the baseline used in self-critic training.
# If the last argument is False, the 0-1 reward is used to calculate accuracy;
# otherwise the adaptive reward is used.
argmax_reward = utils.calc_True_Reward(action_tokens, qa_info, args.adaptive)
# argmax_reward = random.random()
true_reward_argmax.append(argmax_reward)

if args.NSM and 'pseudo_gold_program_reward' not in qa_info:
    # Compute the pseudo-gold program's reward once and cache it in qa_info.
    pseudo_program_tokens = str(qa_info['pseudo_gold_program']).strip().split()
    pseudo_program_reward = utils.calc_True_Reward(pseudo_program_tokens, qa_info, args.adaptive)
    qa_info['pseudo_gold_program_reward'] = pseudo_program_reward

# # If the greedy reward is already this high, the sample does not need RL training.
# if not args.disable_skip and argmax_reward > 0.99:
#     skipped_samples += 1
#     continue
    beg_embedding, data.MAX_TOKENS,
    context[idx], stop_at_token=end_token)
# Show what the output action sequence is.
action_tokens = []
for temp_idx in actions:
    if temp_idx in rev_emb_dict and rev_emb_dict.get(temp_idx) != '#END':
        action_tokens.append(str(rev_emb_dict.get(temp_idx)).upper())
# Get the greedy (argmax) reward as the baseline used in self-critic training.
# If the last argument is False, the 0-1 reward is used to calculate accuracy;
# otherwise the adaptive reward is used.
if args.dataset == "csqa":
    argmax_reward = utils.calc_True_Reward(action_tokens, qa_info, args.adaptive)
else:
    argmax_reward = utils.calc_True_Reward_webqsp_novar(action_tokens, qa_info, args.adaptive)
true_reward_argmax.append(argmax_reward)

# # If the greedy reward is already this high, the sample does not need RL training.
# if not args.disable_skip and argmax_reward > 0.99:
#     skipped_samples += 1
#     continue

# In one epoch, the optimised result is displayed only the first time the model is optimised;
# the remaining samples of the epoch are not shown.
if not dial_shown:
    # data.decode_words transforms IDs into tokens.
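# The comments above distinguish a 0-1 reward from an adaptive reward, but calc_True_Reward itself
# is not shown in this excerpt. The sketch below only illustrates that distinction, under the
# assumption that the reward compares the answers obtained by executing the action sequence against
# the gold answers; it is not the repository's implementation.
def reward_sketch(predicted_answers, gold_answers, adaptive):
    predicted, gold = set(predicted_answers), set(gold_answers)
    if not adaptive:
        # 0-1 reward: full credit only for an exact match of the answer sets.
        return 1.0 if predicted == gold else 0.0
    # Adaptive reward: partial credit via the F1 score between the two answer sets.
    if not predicted or not gold:
        return 0.0
    overlap = len(predicted & gold)
    if overlap == 0:
        return 0.0
    precision = overlap / len(predicted)
    recall = overlap / len(gold)
    return 2 * precision * recall / (precision + recall)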
# 'r_argmax' is the list of output logits and 'actions' is the list of output tokens.
# The output tokens are generated greedily by chain_argmax, using the last step's output
# token as the current input token.
r_argmax, actions = net.decode_chain_argmax(item_enc, beg_embedding, data.MAX_TOKENS,
                                            stop_at_token=end_token)
# Show what the output action sequence is.
action_tokens = []
for temp_idx in actions:
    if temp_idx in rev_emb_dict and rev_emb_dict.get(temp_idx) != '#END':
        action_tokens.append(str(rev_emb_dict.get(temp_idx)).upper())
# Get the greedy (argmax) reward as the baseline used in self-critic training.
argmax_reward = utils.calc_True_Reward(action_tokens, qa_info)
true_reward_argmax.append(argmax_reward)

# # If the greedy reward is already this high, the sample does not need RL training.
# if not args.disable_skip and argmax_reward > 0.99:
#     skipped_samples += 1
#     continue

# In one epoch, the optimised result is displayed only the first time the model is optimised;
# the remaining samples of the epoch are not shown.
if not dial_shown:
    # data.decode_words transforms IDs into tokens.
    log.info("Input: %s", utils.untokenize(data.decode_words(inp_idx, rev_emb_dict)))
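# The argmax reward collected above is the self-critic baseline: a sampled action sequence is
# reinforced in proportion to how much its reward exceeds the reward of the greedy decode.
# Below is a minimal sketch of that policy-gradient step; `sample_logits`, `sample_actions`,
# `sample_reward` and `argmax_reward` are assumed names for the sampled sequence's per-step
# logits, its token ids and the two true rewards, not the repository's variables.
import torch
import torch.nn.functional as F

def self_critic_loss(sample_logits, sample_actions, sample_reward, argmax_reward):
    # Advantage of the sampled sequence over the greedy (argmax) baseline.
    advantage = sample_reward - argmax_reward
    log_probs = F.log_softmax(sample_logits, dim=1)               # (T, vocab_size)
    actions = torch.as_tensor(sample_actions, dtype=torch.long)
    picked = log_probs[torch.arange(actions.size(0)), actions]    # log-prob of each chosen token
    # REINFORCE with baseline: maximise the advantage-weighted log-likelihood.
    return -(advantage * picked).mean()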