Example #1
def run_test(test_data, net, beg_token, rev_emb_dict, end_token, device="cuda"):
    argmax_reward_sum = 0.0
    argmax_reward_count = 0.0
    # p1 is one sentence, p2 is sentence list.
    for p1, p2 in test_data:
        # Transform sentence to padded embeddings.
        input_seq = net.pack_input(p1, net.emb, device)
        # Get hidden states from encoder.
        # enc = net.encode(input_seq)
        context, enc = net.encode_context(input_seq)
        # Decode the sequence greedily, feeding each predicted token back into the decoder.
        # Returns logits of shape N*output_vocab and the N output token indices.
        _, tokens = net.decode_chain_argmax(enc,
                                            net.emb(beg_token),
                                            seq_len=data.MAX_TOKENS,
                                            context=context[0],
                                            stop_at_token=end_token)
        # Show what the output action sequence is.
        action_tokens = []
        for temp_idx in tokens:
            if temp_idx in rev_emb_dict and rev_emb_dict.get(
                    temp_idx) != '#END':
                action_tokens.append(str(rev_emb_dict.get(temp_idx)).upper())
        # Using 0-1 reward to compute accuracy.
        reward = utils.calc_True_Reward(action_tokens, p2, False)
        # reward = random.random()
        argmax_reward_sum += float(reward)
        argmax_reward_count += 1
    if argmax_reward_count == 0:
        return 0.0
    return float(argmax_reward_sum) / float(argmax_reward_count)
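
The comment above describes decode_chain_argmax as greedy decoding that feeds each predicted token back into the decoder. A minimal standalone sketch of such a loop, assuming a hypothetical decoder.step(hidden, token_emb) method that returns per-step logits and the next hidden state (the step API, the emb module and the tensor shapes are assumptions, not this library's interface):

import torch

def greedy_decode(decoder, hidden, beg_emb, emb, seq_len, stop_at_token=None):
    # Greedy (argmax) decoding sketch: feed the predicted token back in at every step.
    logits_list, tokens = [], []
    cur_emb = beg_emb                                         # embedding of the '#BEG' token
    for _ in range(seq_len):
        out_logits, hidden = decoder.step(hidden, cur_emb)    # hypothetical one-step decoder API
        token = int(torch.argmax(out_logits, dim=-1).item())  # act greedily
        logits_list.append(out_logits)
        tokens.append(token)
        if stop_at_token is not None and token == stop_at_token:
            break
        cur_emb = emb(torch.tensor([token]))                  # feed the prediction back in
    return torch.cat(logits_list), tokens
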
Example #2
def run_test_true_reward(test_data, net, rev_emb_dict, end_token, device="cuda"):
    argmax_reward_sum = 0.0
    argmax_reward_count = 0.0

    # BEGIN token, moved to the requested device.
    beg_token = torch.LongTensor([emb_dict[data.BEGIN_TOKEN]]).to(device)

    # p1 is one sentence, p2 is sentence list.
    for p1, p2 in test_data:
        p_list = [(p1, p2)]
        input_ids, attention_masks = tokenizer_encode(tokenizer, p_list, rev_emb_dict, device, max_tokens)
        output, output_hidden_states = net.bert_encode(input_ids, attention_masks)
        context, enc = output_hidden_states, (output.unsqueeze(0), output.unsqueeze(0))
        input_seq = net.pack_input(p1, net.emb, device)
        # Return logits (N*outputvocab), res_tokens (1*N)
        # Always use the first token in input sequence, which is '#BEG' as the initial input of decoder.
        # The maximum length of the output is defined in class libbots.data.
        _, tokens = net.decode_chain_argmax(enc, input_seq.data[0:1],
                                            seq_len=data.MAX_TOKENS,
                                            context=context[0],
                                            stop_at_token=end_token)
        action_tokens = []
        for temp_idx in tokens:
            if temp_idx in rev_emb_dict and rev_emb_dict.get(temp_idx) != '#END':
                action_tokens.append(str(rev_emb_dict.get(temp_idx)).upper())
        # Using 0-1 reward to compute accuracy.
        if args.dataset == "csqa":
            argmax_reward_sum += float(utils.calc_True_Reward(action_tokens, p2, False))
        else:
            argmax_reward_sum += float(utils.calc_True_Reward_webqsp_novar(action_tokens, p2, False))

        argmax_reward_count += 1

    if argmax_reward_count == 0:
        return 0.0
    else:
        return float(argmax_reward_sum) / float(argmax_reward_count)
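
The line that builds enc from output.unsqueeze(0) reuses the BERT pooled output as both the initial hidden and cell state of an LSTM-style decoder, while the per-token hidden states serve as the attention context. A shape-only sketch of that wiring (the hidden size of 768, sequence length of 10 and single decoder layer are assumptions):

import torch

batch, seq_len, hidden_size = 1, 10, 768
pooled = torch.randn(batch, hidden_size)                  # BERT pooled output: [batch, hidden]
token_states = torch.randn(batch, seq_len, hidden_size)   # per-token states used as attention context

# nn.LSTM expects (h_0, c_0) of shape [num_layers * num_directions, batch, hidden],
# so a layer axis is added in front.
h0 = pooled.unsqueeze(0)                                  # [1, batch, hidden]
c0 = pooled.unsqueeze(0)                                  # [1, batch, hidden]
enc = (h0, c0)
context = token_states
print(h0.shape, c0.shape, context.shape)                  # [1, 1, 768], [1, 1, 768], [1, 10, 768]
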
Example #3
            argmax_reward_sum = 0.0
            argmax_reward_count = 0.0
            # Test loop over the held-out tasks; test_task[1] holds the annotation passed to the reward function.
            for test_task in test_data:
                _, action_tokens = metaLearner.maml_retriever_sampleForTest(
                    task=test_task,
                    old_param_dict=stage1_old_param_dict,
                    docID_dict=docID_dict,
                    rev_docID_dict=rev_docID_dict,
                    emb_dict=emb_dict,
                    qtype_docs_range=qtype_docs_range,
                    steps=args.steps)
                # Using 0-1 reward to compute accuracy.
                argmax_reward_sum += float(
                    utils.calc_True_Reward(action_tokens, test_task[1], False))
                # argmax_reward_sum += random.random()
                argmax_reward_count += 1
            true_reward_test = float(argmax_reward_sum) / float(
                argmax_reward_count)

            # Save the parameters whenever the best true reward improves.
            if best_true_reward is None or best_true_reward < true_reward_test:
                best_true_reward = true_reward_test
                log.info("Best true reward updated: %.4f", true_reward_test)
                # Save the updated seq2seq parameters trained by RL.
                torch.save(
                    net.state_dict(),
                    os.path.join(
                        saves_path, "net_truereward_%.3f_%02d.dat" %
                        (true_reward_test, epoch)))
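
The checkpoint filename encodes the test reward and the epoch through printf-style formatting, and the model is saved only when the best true reward improves. A small self-contained sketch of that save-on-improvement pattern (net, saves_path and the reward values are placeholders, not the repository's objects):

import os
import torch
import torch.nn as nn

net = nn.Linear(4, 2)                       # placeholder model
saves_path = "saves"                        # placeholder output directory
os.makedirs(saves_path, exist_ok=True)

best_true_reward = None
for epoch, true_reward_test in enumerate([0.41, 0.40, 0.54]):
    if best_true_reward is None or best_true_reward < true_reward_test:
        best_true_reward = true_reward_test
        # e.g. epoch 2 with reward 0.54 -> "net_truereward_0.540_02.dat"
        name = "net_truereward_%.3f_%02d.dat" % (true_reward_test, epoch)
        torch.save(net.state_dict(), os.path.join(saves_path, name))
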
Example #4
            writer.add_scalar("skipped_samples", retriever_skipped_samples / retriever_total_samples if retriever_total_samples != 0 else 0,
                              retriever_batch_idx)
            log.info("Retriever epoch %d, retriever_skipped_samples: %d, retriever_total_samples: %d", epoch, retriever_skipped_samples, retriever_total_samples)
            writer.add_scalar("epoch", retriever_batch_idx, epoch)
            log.info("---------------------------")
            log.info("Retriever epoch %d, Stage 2 training is over...", epoch)
            log.info("---------------------------")

            argmax_reward_sum = 0.0
            argmax_reward_count = 0.0
            # Test loop over the held-out tasks; test_task[1] holds the annotation passed to the reward function.
            for test_task in test_data:
                _, action_tokens = metaLearner.maml_retriever_sampleForTest(
                    task=test_task,
                    old_param_dict=stage1_old_param_dict,
                    docID_dict=docID_dict,
                    rev_docID_dict=rev_docID_dict,
                    emb_dict=emb_dict,
                    qtype_docs_range=qtype_docs_range,
                    steps=args.steps)
                # Using 0-1 reward to compute accuracy.
                argmax_reward_sum += float(utils.calc_True_Reward(action_tokens, test_task[1], False))
                # argmax_reward_sum += random.random()
                argmax_reward_count += 1
            true_reward_test = float(argmax_reward_sum) / float(argmax_reward_count)

            # Save the parameters whenever the best true reward improves.
            if best_true_reward is None or best_true_reward < true_reward_test:
                best_true_reward = true_reward_test
                log.info("Best true reward updated: %.4f", true_reward_test)
                # Save the updated seq2seq parameters trained by RL.
                torch.save(net.state_dict(),
                           os.path.join(saves_path, "net_truereward_%.3f_%02d.dat" % (true_reward_test, epoch)))
                torch.save(retriever_net.state_dict(),
                           os.path.join(saves_path, "retriever_truereward_%.3f_%02d.dat" % (true_reward_test, epoch)))
            # The parameters are also stored after each epoch.
            torch.save(net.state_dict(),
                       os.path.join(saves_path, "net_epoch_%03d_%.3f_%.3f.dat" % (epoch, float(maml_true_reward_armax), true_reward_test)))
            torch.save(retriever_net.state_dict(),
                       os.path.join(saves_path, "retriever_epoch_%03d_%.3f_%.3f.dat" % (epoch, float(retriever_true_reward_argmax), float(true_reward_test))))
            log.info("---------------------------")
Example #5
                        item_enc,
                        beg_embedding,
                        data.MAX_TOKENS,
                        context[idx],
                        stop_at_token=end_token)
                    # Show what the output action sequence is.
                    action_tokens = []
                    for temp_idx in actions:
                        if temp_idx in rev_emb_dict and rev_emb_dict.get(
                                temp_idx) != '#END':
                            action_tokens.append(
                                str(rev_emb_dict.get(temp_idx)).upper())
                    # The greedy (argmax) reward is used as the self-critic baseline.
                    # If the last argument is False, the 0-1 reward is used to compute accuracy;
                    # otherwise the adaptive reward is used.
                    argmax_reward = utils.calc_True_Reward(
                        action_tokens, qa_info, args.adaptive)
                    # argmax_reward = random.random()
                    true_reward_argmax.append(argmax_reward)

                    if args.NSM and 'pseudo_gold_program_reward' not in qa_info:
                        pseudo_program_tokens = str(qa_info['pseudo_gold_program']).strip().split()
                        pseudo_program_reward = utils.calc_True_Reward(pseudo_program_tokens, qa_info, args.adaptive)
                        qa_info['pseudo_gold_program_reward'] = pseudo_program_reward

                    # # In this case, the BLEU score is so high that it is not needed to train such case with RL.
                    # if not args.disable_skip and argmax_reward > 0.99:
                    #     skipped_samples += 1
                    #     continue
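
The comments above treat the greedy (argmax) reward as a self-critic baseline. A minimal illustration of how such a baseline is usually applied in REINFORCE-style training, shown here as a generic sketch rather than this repository's training loop (all names are assumptions):

def self_critic_advantages(sample_rewards, argmax_reward):
    # Self-critical baseline: each sampled sequence is scored against the greedy decode.
    return [r - argmax_reward for r in sample_rewards]

# Samples that beat the greedy decode get a positive advantage and are reinforced.
print(self_critic_advantages([1.0, 0.0, 1.0], 0.5))  # [0.5, -0.5, 0.5]
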
Example #6
                            beg_embedding,
                            data.MAX_TOKENS,
                            context[idx],
                            stop_at_token=end_token)
                        # Show what the output action sequence is.
                        action_tokens = []
                        for temp_idx in actions:
                            if temp_idx in rev_emb_dict and rev_emb_dict.get(
                                    temp_idx) != '#END':
                                action_tokens.append(
                                    str(rev_emb_dict.get(temp_idx)).upper())
                        # The greedy (argmax) reward is used as the self-critic baseline.
                        # If the last argument is False, the 0-1 reward is used to compute accuracy;
                        # otherwise the adaptive reward is used.
                        if args.dataset == "csqa":
                            argmax_reward = utils.calc_True_Reward(
                                action_tokens, qa_info, args.adaptive)
                        else:
                            argmax_reward = utils.calc_True_Reward_webqsp_novar(
                                action_tokens, qa_info, args.adaptive)

                        true_reward_argmax.append(argmax_reward)

                        # # In this case, the BLEU score is so high that it is not needed to train such case with RL.
                        # if not args.disable_skip and argmax_reward > 0.99:
                        #     skipped_samples += 1
                        #     continue

                        # The decoded result is logged only for the first optimized sample in each epoch;
                        # later samples in the same epoch are not displayed.
                        if not dial_shown:
                            # data.decode_words transforms IDs back into tokens.
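
Every example above converts the decoded token indices back into upper-case action tokens while dropping the '#END' marker. That filtering loop can be captured in a small helper (a sketch, not a function from the library):

def ids_to_action_tokens(token_ids, rev_emb_dict, end_word='#END'):
    # Map decoded token indices to upper-case action tokens, skipping the end marker.
    return [str(rev_emb_dict[idx]).upper()
            for idx in token_ids
            if idx in rev_emb_dict and rev_emb_dict[idx] != end_word]

# e.g. ids_to_action_tokens([1, 2], {0: '#BEG', 1: 'select', 2: '#END'}) -> ['SELECT']
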
Example #7
                    # 'r_argmax' holds the per-step output logits and 'actions' is the list of output token indices.
                    # Tokens are generated greedily by decode_chain_argmax, feeding each step's output token back in as the next input.
                    r_argmax, actions = net.decode_chain_argmax(
                        item_enc,
                        beg_embedding,
                        data.MAX_TOKENS,
                        stop_at_token=end_token)
                    # Show what the output action sequence is.
                    action_tokens = []
                    for temp_idx in actions:
                        if temp_idx in rev_emb_dict and rev_emb_dict.get(
                                temp_idx) != '#END':
                            action_tokens.append(
                                str(rev_emb_dict.get(temp_idx)).upper())
                    # The greedy (argmax) reward is used as the self-critic baseline.
                    argmax_reward = utils.calc_True_Reward(
                        action_tokens, qa_info)
                    true_reward_argmax.append(argmax_reward)

                    # # In this case, the BLEU score is so high that it is not needed to train such case with RL.
                    # if not args.disable_skip and argmax_reward > 0.99:
                    #     skipped_samples += 1
                    #     continue

                    # The decoded result is logged only for the first optimized sample in each epoch;
                    # later samples in the same epoch are not displayed.
                    if not dial_shown:
                        # data.decode_words transforms IDs back into tokens.
                        log.info(
                            "Input: %s",
                            utils.untokenize(
                                data.decode_words(inp_idx, rev_emb_dict)))
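
The dial_shown flag implements the behaviour described in the comments: the decoded dialogue is logged only for the first optimized sample of each epoch. A stripped-down sketch of that pattern (the sample list and log message are placeholders):

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

for epoch in range(2):
    dial_shown = False                      # reset at the start of every epoch
    for sample in ("first", "second", "third"):
        if not dial_shown:
            log.info("epoch %d, decoded sample: %s", epoch, sample)
            dial_shown = True               # later samples in this epoch stay silent
        # ... the actual optimization step would go here ...
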