Example #1
    def expert_generator(self):
        env = self.env_list[0]
        traj_len = 40
        reward_tot, turn_tot, inform_tot, match_tot, success_tot = [], [], [], [], []
        success_dialog = []
        while len(success_dialog) < 10000:
            seed = np.random.randint(2000000)
            s = env.reset(seed)
            print('seed', seed)
            print('goal', env.goal.domain_goals)
            print('usr', s['user_action'])
            turn = traj_len
            reward = []
            value = []
            mask = []
            dialog_turn = []
            for t in range(traj_len):
                s_vec = torch.Tensor(state_vectorize(s, env.cfg, env.db)).to(device=DEVICE)
                # evaluation mode: take the policy's action without exploration
                a = self.policy.select_action(s_vec, False)
                next_s, done = env.step(s, a.cpu())
                next_s_vec = torch.Tensor(state_vectorize(next_s, env.cfg, env.db)).to(device=DEVICE)
                r = self.reward_human(s, done)
                pair = (s_vec, a, next_s_vec, r, done)
                dialog_turn.append(copy.deepcopy(pair))
                s = next_s
                if done:
                    mask.append(0)
                    turn = t + 2  # +1 because t counts from 0, +1 for the final turn
                    break
            # keep only dialogs that ended with a positive (success) reward
            if r > 0:
                success_dialog += dialog_turn
                logging.info("success dialog: {}".format(len(success_dialog)))
        torch.save(success_dialog, './data/expert_dialog_art.pt')
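The generator above flattens every successful dialog into (s_vec, a, next_s_vec, r, done) tuples and serializes the whole list with torch.save. A minimal sketch of reading that file back and batching the tensors, e.g. for behaviour cloning or reward learning; the function name and the stacking step are my own additions, not part of the repo:

import torch

def load_expert_transitions(path='./data/expert_dialog_art.pt'):
    # torch.load returns the flat list of (s_vec, a, next_s_vec, r, done) tuples
    transitions = torch.load(path)
    # assumes all state/action tensors share the same shape across transitions
    s = torch.stack([tr[0] for tr in transitions])
    a = torch.stack([tr[1] for tr in transitions])
    next_s = torch.stack([tr[2] for tr in transitions])
    r = torch.tensor([float(tr[3]) for tr in transitions])
    done = torch.tensor([float(tr[4]) for tr in transitions])
    return s, a, next_s, r, done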
Example #2
    def create_dataset(self, part, file_dir, cfg, db):
        datas = self.data[part]
        goals = self.goal[part]
        s, a, next_s, a_seq = [], [], [], []
        d_m = []
        for idx, turn_data in enumerate(datas):
            # even turns belong to the user side; only system turns yield training samples
            if turn_data['others']['turn'] % 2 == 0:
                continue
            turn_data['user_goal'] = goals[turn_data['others']['session_id']]
            s.append(torch.Tensor(state_vectorize(turn_data, cfg, db, True)))
            a.append(torch.Tensor(action_vectorize(turn_data, cfg)))
            d_m.append(torch.Tensor(domain_vectorize(turn_data, cfg)))
            a_seq.append(torch.Tensor(action_seq(turn_data, cfg)))

            if not int(turn_data['others']['terminal']):
                # the next system state is two turns ahead (skipping the user turn)
                next_s.append(
                    torch.Tensor(state_vectorize(datas[idx + 2], cfg, db,
                                                 True)))
            else:
                # terminal turn: build a synthetic final state from the current turn
                next_turn_data = deepcopy(turn_data)
                next_turn_data['others']['turn'] = -1
                next_turn_data['user_action'] = {}
                next_turn_data['last_sys_action'] = next_turn_data[
                    'sys_action']
                next_turn_data['sys_action'] = {}
                next_turn_data['belief_state'] = next_turn_data[
                    'next_belief_state']
                next_s.append(
                    torch.Tensor(state_vectorize(next_turn_data, cfg, db,
                                                 True)))
        torch.save((s, a, next_s, d_m, a_seq), file_dir)
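create_dataset stores five parallel lists of per-turn tensors. A small sketch of loading such a file into a PyTorch DataLoader for supervised pre-training; the file path, batch size, and the assumption that every tensor in a list has the same shape (variable-length a_seq entries would need padding instead of stacking) are mine:

import torch
from torch.utils.data import DataLoader, TensorDataset

def make_loader(file_dir, batch_size=32):
    s, a, next_s, d_m, a_seq = torch.load(file_dir)
    # torch.stack only works if each list holds equally shaped tensors
    dataset = TensorDataset(torch.stack(s), torch.stack(a),
                            torch.stack(next_s), torch.stack(d_m),
                            torch.stack(a_seq))
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)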
Example #3
    def evaluate_with_agenda(self, env, N):
        """
        Unlike the method above, this one uses a predefined external env (the one
        above uses the built-in env); it is dedicated to evaluating the system agent.
        """
        logging.info('eval: agenda 2 system')
        traj_len = 40
        turn_tot, inform_tot, match_tot, success_tot = [], [], [], []
        for seed in range(N):
            #            s = env.reset(seed)
            s = env.reset()
            print('seed', seed)
            print('goal', env.goal.domain_goals)
            print('usr', s['user_action'])
            for t in range(traj_len):
                s_vec = torch.Tensor(state_vectorize(s, env.cfg,
                                                     env.db)).to(device=DEVICE)
                # evaluation mode: take the policy's action without exploration
                a = self.policy_sys.select_action(s_vec, False)
                # env.step already applies the user's response and the resulting state change
                next_s, done = env.step(s, a.cpu())
                s = next_s
                print('sys', s['sys_action'])
                print('usr', s['user_action'])
                if done:
                    break
            s_vec = torch.Tensor(state_vectorize(s, env.cfg,
                                                 env.db)).to(device=DEVICE)
            # evaluation mode: take the policy's action without exploration
            a = self.policy_sys.select_action(s_vec, False)
            s = env.update_belief_sys(s, a.cpu())
            print('sys', s['sys_action'])

            assert (env.time_step % 2 == 0)
            turn_tot.append(env.time_step // 2)
            match_tot += env.evaluator.match_rate(aggregate=False)
            inform_tot.append(env.evaluator.inform_F1(aggregate=False))
            print('turn', env.time_step // 2)
            match_session = env.evaluator.match_rate()
            print('match', match_session)
            inform_session = env.evaluator.inform_F1()
            print('inform', inform_session)
            if (match_session == 1 and inform_session[1] == 1) \
                    or (match_session == 1 and inform_session[1] is None) \
                    or (match_session is None and inform_session[1] == 1):
                print('success', 1)
                success_tot.append(1)
            else:
                print('success', 0)
                success_tot.append(0)

        logging.info('turn {}'.format(np.mean(turn_tot)))
        logging.info('match {}'.format(np.mean(match_tot)))
        TP, FP, FN = np.sum(inform_tot, 0)
        prec = TP / (TP + FP)
        rec = TP / (TP + FN)
        F1 = 2 * prec * rec / (prec + rec)
        logging.info('inform rec {}, F1 {}'.format(rec, F1))
        logging.info('success {}'.format(np.mean(success_tot)))
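The success test at the end of the loop (match rate 1 and inform_session[1] equal to 1, with None meaning that the metric does not apply to this goal) is repeated verbatim in the evaluation methods below. A small helper expressing the same logic; this refactor is mine, not part of the repo:

def session_success(match_session, inform_session):
    # success requires every applicable metric to be perfect,
    # and at least one of the two metrics to be applicable
    inform_rec = inform_session[1]
    match_ok = match_session == 1 or match_session is None
    inform_ok = inform_rec == 1 or inform_rec is None
    measured = match_session is not None or inform_rec is not None
    return 1 if (match_ok and inform_ok and measured) else 0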
Example #4
    def create_dataset_global(self, part, file_dir, data_dir, cfg, db):
        datas = self.data[part]
        goals = self.goal[part]
        s_usr, s_sys, r_g, next_s_usr, next_s_sys, t = [], [], [], [], [], []
        evaluator = MultiWozEvaluator(data_dir)
        for idx, turn_data in enumerate(datas):
            if turn_data['others']['turn'] % 2 == 0:
                if turn_data['others']['turn'] == 0:
                    current_goal = goals[turn_data['others']['session_id']]
                    evaluator.add_goal(current_goal)
                else:
                    next_s_usr.append(s_usr[-1])
                
                if turn_data['others']['change'] and evaluator.cur_domain:
                    if 'final' in current_goal[evaluator.cur_domain]:
                        for key in current_goal[evaluator.cur_domain]['final']:
                            current_goal[evaluator.cur_domain][key] = current_goal[evaluator.cur_domain]['final'][key]
                        del(current_goal[evaluator.cur_domain]['final'])
                
                turn_data['user_goal'] = deepcopy(current_goal)
                s_usr.append(torch.Tensor(state_vectorize_user(turn_data, cfg, evaluator.cur_domain)))
                evaluator.add_usr_da(turn_data['trg_user_action'])
                    
                if turn_data['others']['terminal']:
                    next_turn_data = deepcopy(turn_data)
                    next_turn_data['others']['turn'] = -1
                    next_turn_data['user_action'] = turn_data['trg_user_action']
                    next_turn_data['sys_action'] = datas[idx+1]['trg_sys_action']
                    next_turn_data['trg_user_action'] = {}
                    next_turn_data['goal_state'] = datas[idx+1]['final_goal_state']
                    next_s_usr.append(torch.Tensor(state_vectorize_user(next_turn_data, cfg, evaluator.cur_domain)))
            
            else:
                if turn_data['others']['turn'] != 1:
                    next_s_sys.append(s_sys[-1])

                s_sys.append(torch.Tensor(state_vectorize(turn_data, cfg, db, True)))
                evaluator.add_sys_da(turn_data['trg_sys_action'])
            
                if turn_data['others']['terminal']:
                    next_turn_data = deepcopy(turn_data)
                    next_turn_data['others']['turn'] = -1
                    next_turn_data['user_action'] = {}
                    next_turn_data['sys_action'] = turn_data['trg_sys_action']
                    next_turn_data['trg_sys_action'] = {}
                    next_turn_data['belief_state'] = turn_data['final_belief_state']
                    next_s_sys.append(torch.Tensor(state_vectorize(next_turn_data, cfg, db, True)))
                    reward_g = 20 if evaluator.task_success() else -5
                    r_g.append(reward_g)
                    t.append(1)
                else:
                    reward_g = 5 if evaluator.cur_domain and evaluator.domain_success(evaluator.cur_domain) else -1
                    r_g.append(reward_g)
                    t.append(0)
                
        torch.save((s_usr, s_sys, r_g, next_s_usr, next_s_sys, t), file_dir)
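When the goal changes mid-dialog, the slots stored under the current domain's 'final' key overwrite the original ones (the fallback goal the user switches to when the first goal cannot be satisfied). A toy illustration of that in-place rewrite; the goal contents here are invented:

current_goal = {
    'hotel': {'info': {'area': 'north'}, 'final': {'info': {'area': 'south'}}},
}
cur_domain = 'hotel'

# same fold as in create_dataset_global: promote 'final' into the domain goal
if 'final' in current_goal[cur_domain]:
    for key in current_goal[cur_domain]['final']:
        current_goal[cur_domain][key] = current_goal[cur_domain]['final'][key]
    del current_goal[cur_domain]['final']

print(current_goal)  # {'hotel': {'info': {'area': 'south'}}}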
Example #5
    def create_dataset_sys(self, part, file_dir, data_dir, cfg, db):
        datas = self.data[part]
        goals = self.goal[part]
        s, a, r, next_s, t = [], [], [], [], []
        evaluator = MultiWozEvaluator(data_dir)
        for idx, turn_data in enumerate(datas):
            if turn_data['others']['turn'] % 2 == 0:
                if turn_data['others']['turn'] == 0:
                    evaluator.add_goal(
                        goals[turn_data['others']['session_id']])
                evaluator.add_usr_da(turn_data['trg_user_action'])
                continue
            if turn_data['others']['turn'] != 1:
                next_s.append(s[-1])

            s.append(torch.Tensor(state_vectorize(turn_data, cfg, db, True)))
            a.append(
                torch.Tensor(action_vectorize(turn_data['trg_sys_action'],
                                              cfg)))
            evaluator.add_sys_da(turn_data['trg_sys_action'])
            if turn_data['others']['terminal']:
                next_turn_data = deepcopy(turn_data)
                next_turn_data['others']['turn'] = -1
                next_turn_data['user_action'] = {}
                next_turn_data['sys_action'] = turn_data['trg_sys_action']
                next_turn_data['trg_sys_action'] = {}
                next_turn_data['belief_state'] = turn_data[
                    'final_belief_state']
                next_s.append(
                    torch.Tensor(state_vectorize(next_turn_data, cfg, db,
                                                 True)))
                reward = 20 if evaluator.task_success(False) else -5
                r.append(reward)
                t.append(1)
            else:
                reward = 0
                if evaluator.cur_domain:
                    for slot, value in turn_data['belief_state'][
                            evaluator.cur_domain].items():
                        if value == '?':
                            for da in turn_data['trg_sys_action']:
                                d, i, k, p = da.split('-')
                                if i in [
                                        'inform', 'recommend', 'offerbook',
                                        'offerbooked'
                                ] and k == slot:
                                    break
                            else:
                                # the requested slot was not answered this turn
                                reward -= 1
                if not turn_data['trg_sys_action']:
                    reward -= 5
                r.append(reward)
                t.append(0)

        torch.save((s, a, r, next_s, t), file_dir)
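The per-turn penalty above leans on Python's for/else: the else branch runs only when the inner loop over system dialog acts finishes without break, i.e. when no inform/recommend/offerbook/offerbooked act answered the slot the user requested. A stripped-down sketch of the same pattern with made-up belief state and dialog-act keys:

belief_state = {'restaurant': {'phone': '?', 'area': 'centre'}}
sys_action = {'restaurant-inform-area-1': 'centre'}  # acts keyed as 'domain-intent-slot-p'

reward = 0
for slot, value in belief_state['restaurant'].items():
    if value == '?':  # the user is still waiting for this slot
        for da in sys_action:
            d, i, k, p = da.split('-')
            if i in ['inform', 'recommend', 'offerbook', 'offerbooked'] and k == slot:
                break
        else:
            # the requested slot was never answered this turn
            reward -= 1
print(reward)  # -1: 'phone' was requested but not informed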
Example #6
    def evaluate(self, N):
        logging.info('eval: user 2 system')
        env = self.env_list[0]
        traj_len = 40
        turn_tot, inform_tot, match_tot, success_tot = [], [], [], []
        for seed in range(N):
            s = env.reset(seed)
            print('seed', seed)
            print('origin goal', env.goal)
            print('goal', env.evaluator.goal)
            for t in range(traj_len):
                s_vec = torch.Tensor(
                    state_vectorize_user(
                        s, env.cfg,
                        env.evaluator.cur_domain)).to(device=DEVICE)
                # evaluation mode: take the policy's action without exploration
                a = self.policy_usr.select_action(s_vec, False)
                next_s, done = env.step_usr(s, a)

                next_s_vec = torch.Tensor(
                    state_vectorize(next_s, env.cfg, env.db)).to(device=DEVICE)
                next_a = self.policy_sys.select_action(next_s_vec, False)
                s = env.step_sys(next_s, next_a)

                print('usr', s['user_action'])
                print('sys', s['sys_action'])

                if done:
                    break

            turn_tot.append(env.time_step // 2)
            match_tot += env.evaluator.match_rate(aggregate=False)
            inform_tot.append(env.evaluator.inform_F1(aggregate=False))
            print('turn', env.time_step // 2)
            match_session = env.evaluator.match_rate()
            print('match', match_session)
            inform_session = env.evaluator.inform_F1()
            print('inform', inform_session)
            if (match_session == 1 and inform_session[1] == 1) \
            or (match_session == 1 and inform_session[1] is None) \
            or (match_session is None and inform_session[1] == 1):
                print('success', 1)
                success_tot.append(1)
            else:
                print('success', 0)
                success_tot.append(0)

        logging.info('turn {}'.format(np.mean(turn_tot)))
        logging.info('match {}'.format(np.mean(match_tot)))
        TP, FP, FN = np.sum(inform_tot, 0)
        prec = TP / (TP + FP)
        rec = TP / (TP + FN)
        F1 = 2 * prec * rec / (prec + rec)
        logging.info('inform rec {}, F1 {}'.format(rec, F1))
        logging.info('success {}'.format(np.mean(success_tot)))
Example #7
def sampler(pid, queue, evt, env, policy_usr, policy_sys, batchsz):
    """
    This is a sampler function; it is called by multiprocessing.Process to sample data from the environment in
    multiple processes.
    It generates roughly batchsz turns of dialog data and records the state and reward of every turn;
    the dialogs are simulated jointly by the user/system policies and the env.

    :param pid: process id
    :param queue: multiprocessing.Queue, to collect sampled data
    :param evt: multiprocessing.Event, to keep the process alive
    :param env: environment instance
    :param policy_usr: user policy network, to generate user actions from the current policy
    :param policy_sys: system policy network, to generate system actions from the current policy
    :param batchsz: total number of sampled items
    :return:
    """
    buff = Memory()

    # we need to sample batchsz of (state, action, next_state, reward, mask)
    # each trajectory contains `trajectory_len` num of items, so we only need to sample
    # `batchsz//trajectory_len` num of trajectory totally
    # the final sampled number may be larger than batchsz.

    sampled_num = 0
    sampled_traj_num = 0
    traj_len = 40
    real_traj_len = 0

    # sampled_num counts the number of dialog turns collected so far
    while sampled_num < batchsz:
        # for each trajectory, we reset the env and get initial state
        # initialize the dialog state
        s = env.reset()

        # roll out one trajectory
        for t in range(traj_len):

            # [s_dim_usr] => [a_dim_usr]
            s_vec = torch.Tensor(
                state_vectorize_user(s, env.cfg, env.evaluator.cur_domain))
            # select the user action
            a = policy_usr.select_action(s_vec.to(device=DEVICE)).cpu()

            # interact with env, done is a flag indicates ending or not
            next_s, done = env.step_usr(s, a)

            # [s_dim] => [a_dim]
            next_s_vec = torch.Tensor(state_vectorize(next_s, env.cfg, env.db))
            next_a = policy_sys.select_action(
                next_s_vec.to(device=DEVICE)).cpu()

            # interact with env
            s = env.step_sys(next_s, next_a)

            # get reward compared to demonstrations
            if done:
                env.set_rollout(True)
                # this appears to be a cleanup/roll-out step on the env
                s_vec_next = torch.Tensor(
                    state_vectorize_user(s, env.cfg, env.evaluator.cur_domain))
                a_next = torch.zeros_like(a)
                next_s_next, _ = env.step_usr(s, a_next)
                next_s_vec_next = torch.Tensor(
                    state_vectorize(next_s_next, env.cfg, env.db))
                env.set_rollout(False)

                r_usr = 20 if env.evaluator.inform_F1(
                    ans_by_sys=False)[1] == 1. else -5
                r_sys = 20 if env.evaluator.task_success(False) else -5
                r_global = 20 if env.evaluator.task_success() else -5
            else:
                # one step roll out
                env.set_rollout(True)
                s_vec_next = torch.Tensor(
                    state_vectorize_user(s, env.cfg, env.evaluator.cur_domain))
                a_next = policy_usr.select_action(
                    s_vec_next.to(device=DEVICE)).cpu()
                next_s_next, _ = env.step_usr(s, a_next)
                next_s_vec_next = torch.Tensor(
                    state_vectorize(next_s_next, env.cfg, env.db))
                env.set_rollout(False)

                r_usr = 0
                if not s['user_action']:
                    # no user action was produced this turn
                    r_usr -= 5
                if env.evaluator.cur_domain:
                    for da in s['user_action']:
                        d, i, k = da.split('-')
                        if i == 'request':
                            for slot, value in s['goal_state'][d].items():
                                if value != '?' and slot in s['user_goal'][d] \
                                        and s['user_goal'][d][slot] != '?':
                                    # request before express constraint
                                    r_usr -= 1
                r_sys = 0
                if not next_s['sys_action']:
                    # no system action was produced this turn
                    r_sys -= 5
                if env.evaluator.cur_domain:
                    for slot, value in next_s['belief_state'][
                            env.evaluator.cur_domain].items():
                        if value == '?':
                            for da in next_s['sys_action']:
                                d, i, k, p = da.split('-')
                                if i in [
                                        'inform', 'recommend', 'offerbook',
                                        'offerbooked'
                                ] and k == slot:
                                    break
                            else:
                                # the requested slot was not answered;
                                # every unanswered slot shaves a bit off the reward
                                r_sys -= 1
                r_global = 5 if env.evaluator.cur_domain and env.evaluator.domain_success(
                    env.evaluator.cur_domain) else -1

            # save to queue
            # push the training transition into the Memory buffer
            buff.push(s_vec.numpy(), a.numpy(), r_usr, s_vec_next.numpy(),
                      next_s_vec.numpy(), next_a.numpy(), r_sys,
                      next_s_vec_next.numpy(), done, r_global)

            # update per step
            real_traj_len = t

            if done:
                break

        # this is end of one trajectory
        sampled_num += real_traj_len
        sampled_traj_num += 1
        # t indicates the valid trajectory length

    # this is end of sampling all batchsz of items.
    # when sampling is over, push all buff data into queue
    queue.put([pid, buff])
    evt.wait()
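sampler is written to be the target of several worker processes: each worker fills its own Memory, puts [pid, buff] on the shared queue, and blocks on the event until the parent has collected everything. A minimal launcher sketch that honours that contract; the worker count, the per-worker batch split, and sharing one env object across workers are simplifying assumptions of mine:

import torch.multiprocessing as mp

def sample_parallel(env, policy_usr, policy_sys, batchsz, num_workers=4):
    queue = mp.Queue()
    evt = mp.Event()
    share = batchsz // num_workers
    workers = [mp.Process(target=sampler,
                          args=(pid, queue, evt, env, policy_usr, policy_sys, share))
               for pid in range(num_workers)]
    for w in workers:
        w.daemon = True
        w.start()
    # one [pid, buff] item per worker, then release them all
    results = [queue.get() for _ in range(num_workers)]
    evt.set()
    for w in workers:
        w.join()
    return [buff for _, buff in results]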
Example #8
def sampler(pid, queue, evt, env, policy, batchsz, human_reward):
    """
    This is a sampler function; it is called by multiprocessing.Process to sample data from the environment in
    multiple processes.
    :param pid: process id
    :param queue: multiprocessing.Queue, to collect sampled data
    :param evt: multiprocessing.Event, to keep the process alive
    :param env: environment instance
    :param policy: policy network, to generate action from current policy
    :param batchsz: total sampled items
    :return:
    """
    buff = Memory()
    # human_reward = human_reward()

    # we need to sample batchsz of (state, action, next_state, reward, mask)
    # each trajectory contains `trajectory_len` num of items, so we only need to sample
    # `batchsz//trajectory_len` num of trajectory totally
    # the final sampled number may be larger than batchsz.

    sampled_num = 0
    sampled_traj_num = 0
    traj_len = 40
    real_traj_len = 0

    while sampled_num < batchsz:
        # for each trajectory, we reset the env and get initial state
        s = env.reset()

        for t in range(traj_len):

            # [s_dim] => [a_dim]
            s_vec = torch.Tensor(state_vectorize(s, env.cfg, env.db))
            a = policy.select_action(s_vec.to(device=DEVICE)).cpu()
            # print(a.shape)
            d = torch.Tensor(domain_vectorize(s, env.cfg))
            # interact with env
            next_s, done = env.step(s, a)

            # a flag indicates ending or not
            mask = 0 if done else 1

            # get reward compared to demonstrations
            next_s_vec = torch.Tensor(state_vectorize(next_s, env.cfg, env.db))
            r = human_reward.reward_human(next_s, done)

            # save to queue
            buff.push(s_vec.numpy(), a.numpy(), mask, next_s_vec.numpy(), r,
                      d.numpy())

            # update per step
            s = next_s
            real_traj_len = t

            if done:
                break

        # this is end of one trajectory
        sampled_num += real_traj_len
        sampled_traj_num += 1
        # t indicates the valid trajectory length

    # this is end of sampling all batchsz of items.
    # when sampling is over, push all buff data into queue
    queue.put([pid, buff])
    evt.wait()
Example #9
    def evaluate(self, save_dialog=False):
        self.policy.eval()
        if save_dialog:
            with open('./data/goal.json', 'r') as f:
                saved_goal_list = json.load(f)
        collected_dialog = []

        env = self.env_list[0]
        traj_len = 40
        reward_tot, turn_tot, inform_tot, match_tot, success_tot = [], [], [], [], []
        for seed in range(1000):
            dialog_list = []
            if save_dialog:
                s = env.reset(seed, saved_goal=saved_goal_list[seed])
            else:
                s = env.reset(seed, saved_goal=None)
            # print('seed', seed)
            # print('goal', env.goal.domain_goals)
            # print('usr', s['user_action'])
            dialog_list.append(s['user_action'])
            turn = traj_len
            reward = []
            value = []
            mask = []
            for t in range(traj_len):
                s_vec = torch.Tensor(state_vectorize(s, env.cfg,
                                                     env.db)).to(device=DEVICE)
                # evaluation mode: take the policy's action without exploration
                a = self.policy.select_action(s_vec, False)
                next_s, done = env.step(s, a.cpu())
                next_s_vec = torch.Tensor(
                    state_vectorize(next_s, env.cfg, env.db)).to(device=DEVICE)
                r = self.human_reward.reward_human(next_s, done)
                reward.append(r)
                s = next_s
                dialog_list.append(s['last_sys_action'])
                dialog_list.append(s['user_action'])
                # print('sys', s['last_sys_action'])
                # print('usr', s['user_action'])
                if done:
                    mask.append(0)
                    turn = t + 2  # +1 because t counts from 0, +1 for the final turn
                    break
                mask.append(1)

            reward_tot.append(np.mean(reward))
            turn_tot.append(turn)
            match_tot += self.evaluator.match_rate(s)
            inform_tot.append(self.evaluator.inform_F1(s))
            reward = torch.Tensor(reward)
            mask = torch.LongTensor(mask)
            # print('turn', turn)
            match_session = self.evaluator.match_rate(s, True)
            # print('match', match_session)
            inform_session = self.evaluator.inform_F1(s, True)
            # print('inform', inform_session)
            if (match_session == 1 and inform_session[1] == 1) \
            or (match_session == 1 and inform_session[1] is None) \
            or (match_session is None and inform_session[1] == 1):
                # print('success', 1)
                success_tot.append(1)
            else:
                # print('success', 0)
                success_tot.append(0)
            dialog_dict = {
                'goal id': seed,
                'goal': env.goal.domain_goals,
                'dialog': dialog_list,
                'turn': turn,
                'status': success_tot[-1]
            }
            collected_dialog.append(dialog_dict)

        logging.info('reward {}'.format(np.mean(reward_tot)))
        logging.info('turn {}'.format(np.mean(turn_tot)))
        logging.info('match {}'.format(np.mean(match_tot)))
        TP, FP, FN = np.sum(inform_tot, 0)
        prec = TP / (TP + FP)
        rec = TP / (TP + FN)
        F1 = 2 * prec * rec / (prec + rec)
        logging.info('inform rec {}, F1 {}'.format(rec, F1))
        logging.info('success {}'.format(np.mean(success_tot)))

        if save_dialog:
            self.save_dialog(self.save_dir, collected_dialog)
Example #10
    def create_dataset_global(self, part, file_dir, data_dir, cfg, db):
        """
        Build the "global" dataset, which records all user-side and system-side states together with the global rewards.
        """
        datas = self.data[part]
        goals = self.goal[part]
        s_usr, s_sys, r_g, next_s_usr, next_s_sys, t = [], [], [], [], [], []
        evaluator = MultiWozEvaluator(data_dir, cfg.d)
        for idx, turn_data in enumerate(datas):
            if turn_data['others']['turn'] % 2 == 0:
                if turn_data['others']['turn'] == 0:
                    current_goal = goals[turn_data['others']['session_id']]
                    evaluator.add_goal(current_goal)
                else:
                    next_s_usr.append(s_usr[-1])

                # when the current user goal cannot be satisfied, switch to the fallback goal
                if turn_data['others']['change'] and evaluator.cur_domain:
                    if 'final' in current_goal[evaluator.cur_domain]:
                        for key in current_goal[evaluator.cur_domain]['final']:
                            current_goal[
                                evaluator.cur_domain][key] = current_goal[
                                    evaluator.cur_domain]['final'][key]
                        del (current_goal[evaluator.cur_domain]['final'])
                turn_data['user_goal'] = deepcopy(current_goal)

                s_usr.append(
                    torch.Tensor(
                        state_vectorize_user(turn_data, cfg,
                                             evaluator.cur_domain)))
                evaluator.add_usr_da(turn_data['trg_user_action'])

                if turn_data['others']['terminal']:
                    next_turn_data = deepcopy(turn_data)
                    next_turn_data['others']['turn'] = -1
                    next_turn_data['user_action'] = turn_data[
                        'trg_user_action']
                    next_turn_data['sys_action'] = datas[idx +
                                                         1]['trg_sys_action']
                    next_turn_data['trg_user_action'] = {}
                    next_turn_data['goal_state'] = datas[idx +
                                                         1]['final_goal_state']
                    next_s_usr.append(
                        torch.Tensor(
                            state_vectorize_user(next_turn_data, cfg,
                                                 evaluator.cur_domain)))

            else:
                if turn_data['others']['turn'] != 1:
                    next_s_sys.append(s_sys[-1])

                s_sys.append(
                    torch.Tensor(state_vectorize(turn_data, cfg, db, True)))
                evaluator.add_sys_da(turn_data['trg_sys_action'])

                if turn_data['others']['terminal']:
                    next_turn_data = deepcopy(turn_data)
                    next_turn_data['others']['turn'] = -1
                    next_turn_data['user_action'] = {}
                    next_turn_data['sys_action'] = turn_data['trg_sys_action']
                    next_turn_data['trg_sys_action'] = {}
                    next_turn_data['belief_state'] = turn_data[
                        'final_belief_state']
                    next_s_sys.append(
                        torch.Tensor(
                            state_vectorize(next_turn_data, cfg, db, True)))
                    # the system utters the closing turn by default, so task success judged on the system side is used as the global reward
                    reward_g = 20 if evaluator.task_success() else -5
                    r_g.append(reward_g)
                    t.append(1)
                else:
                    # reward domain_success; otherwise apply a small per-turn penalty to keep dialogs short (TODO: what exactly counts as domain_success?)
                    reward_g = 5 if evaluator.cur_domain and evaluator.domain_success(
                        evaluator.cur_domain) else -1
                    r_g.append(reward_g)
                    t.append(0)

        torch.save((s_usr, s_sys, r_g, next_s_usr, next_s_sys, t), file_dir)
Example #11
    def create_dataset_sys(self, part, file_dir, data_dir, cfg, db):
        """
        Build the training data for the system policy.
        """
        datas = self.data[part]
        goals = self.goal[part]
        # system state, system action, reward, next system state, terminal flag
        s, a, r, next_s, t = [], [], [], [], []
        # the evaluator tracks the whole dialog for scoring
        evaluator = MultiWozEvaluator(data_dir, cfg.d)
        for idx, turn_data in enumerate(datas):
            # user turn
            # no training samples are generated on the user side
            if turn_data['others']['turn'] % 2 == 0:
                # the user goal is loaded on the first turn of the dialog
                if turn_data['others']['turn'] == 0:
                    evaluator.add_goal(
                        goals[turn_data['others']['session_id']])
                # record the real user dialog act for the evaluator
                evaluator.add_usr_da(turn_data['trg_user_action'])
                continue

            # offset by one turn: the state stored here serves as the next state of the previous sample
            if turn_data['others']['turn'] != 1:
                next_s.append(s[-1])

            # vectorize the current turn into a state vector
            s.append(torch.Tensor(state_vectorize(turn_data, cfg, db, True)))
            # vectorize the current system action into an action vector
            a.append(
                torch.Tensor(action_vectorize(turn_data['trg_sys_action'],
                                              cfg)))
            evaluator.add_sys_da(turn_data['trg_sys_action'])
            if turn_data['others']['terminal']:
                # terminal turn
                next_turn_data = deepcopy(turn_data)
                next_turn_data['others']['turn'] = -1
                next_turn_data['user_action'] = {}
                next_turn_data['sys_action'] = turn_data['trg_sys_action']
                next_turn_data['trg_sys_action'] = {}
                next_turn_data['belief_state'] = turn_data[
                    'final_belief_state']
                # record next_s for this final sample
                next_s.append(
                    torch.Tensor(state_vectorize(next_turn_data, cfg, db,
                                                 True)))
                # reward: for the system side, whether the task was completed decides the final reward,
                # i.e. whether the system fulfilled the booking requests raised by the real user actions
                # and answered every question the real user actions asked
                reward = 20 if evaluator.task_success(False) else -5
                r.append(reward)
                # terminal flag
                t.append(1)
            else:
                reward = 0
                if evaluator.cur_domain:
                    for slot, value in turn_data['belief_state'][
                            evaluator.cur_domain].items():
                        if value == '?':
                            for da in turn_data['trg_sys_action']:
                                d, i, k, p = da.split('-')
                                if i in [
                                        'inform', 'recommend', 'offerbook',
                                        'offerbooked'
                                ] and k == slot:
                                    break
                            else:
                                # the question in the belief_state went unanswered: subtract one from the reward
                                reward -= 1
                if not turn_data['trg_sys_action']:
                    # no system reply this turn: subtract five from the reward
                    reward -= 5
                r.append(reward)
                t.append(0)

        torch.save((s, a, r, next_s, t), file_dir)
Example #12
    def evaluate(self):
        env = self.env_list[0]
        traj_len = 40
        reward_tot, turn_tot, inform_tot, match_tot, success_tot = [], [], [], [], []
        for seed in range(1000):
            s = env.reset(seed)
            print('seed', seed)
            print('goal', env.goal.domain_goals)
            print('usr', s['user_action'])
            turn = traj_len
            reward = []
            value = []
            mask = []
            for t in range(traj_len):
                s_vec = torch.Tensor(state_vectorize(s, env.cfg,
                                                     env.db)).to(device=DEVICE)
                # evaluation mode: take the policy's action without exploration
                a = self.policy.select_action(s_vec, False)
                next_s, done = env.step(s, a.cpu())
                next_s_vec = torch.Tensor(
                    state_vectorize(next_s, env.cfg, env.db)).to(device=DEVICE)
                log_pi = self.policy.get_log_prob(s_vec, a)
                r = self.rewarder.estimate(s_vec, a, next_s_vec, log_pi)
                v = self.value(s_vec).squeeze(-1)
                reward.append(r.item())
                value.append(v.item())
                s = next_s
                print('sys', s['last_sys_action'])
                print('usr', s['user_action'])
                if done:
                    mask.append(0)
                    turn = t + 2  # +1 because t counts from 0, +1 for the final turn
                    break
                mask.append(1)

            reward_tot.append(np.mean(reward))
            turn_tot.append(turn)
            match_tot += self.evaluator.match_rate(s)
            inform_tot.append(self.evaluator.inform_F1(s))
            reward = torch.Tensor(reward)
            value = torch.Tensor(value)
            mask = torch.LongTensor(mask)
            A_sa, v_target = self.est_adv(reward, value, mask)
            print('turn', turn)
            #print('reward', A_sa.tolist())
            print('reward', v_target[0].item())
            match_session = self.evaluator.match_rate(s, True)
            print('match', match_session)
            inform_session = self.evaluator.inform_F1(s, True)
            print('inform', inform_session)
            if (match_session == 1 and inform_session[1] == 1) \
            or (match_session == 1 and inform_session[1] is None) \
            or (match_session is None and inform_session[1] == 1):
                print('success', 1)
                success_tot.append(1)
            else:
                print('success', 0)
                success_tot.append(0)

        logging.info('reward {}'.format(np.mean(reward_tot)))
        logging.info('turn {}'.format(np.mean(turn_tot)))
        logging.info('match {}'.format(np.mean(match_tot)))
        TP, FP, FN = np.sum(inform_tot, 0)
        prec = TP / (TP + FP)
        rec = TP / (TP + FN)
        F1 = 2 * prec * rec / (prec + rec)
        logging.info('inform rec {}, F1 {}'.format(rec, F1))
        logging.info('success {}'.format(np.mean(success_tot)))
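est_adv is defined elsewhere in this trainer; in actor-critic setups like this one it is commonly a generalized advantage estimator over the concatenated trajectory, where mask == 0 marks the end of a dialog so no bootstrapping happens across episode boundaries. A sketch of that common form, with gamma and tau as assumed hyperparameters rather than the repo's actual values:

import torch

def est_adv_gae(reward, value, mask, gamma=0.99, tau=0.95):
    batchsz = reward.size(0)
    A_sa = torch.zeros(batchsz)
    v_target = torch.zeros(batchsz)
    prev_value, prev_A = 0.0, 0.0
    for t in reversed(range(batchsz)):
        # bootstrap from the next step only inside the same dialog (mask == 1)
        delta = reward[t] + gamma * prev_value * mask[t] - value[t]
        A_sa[t] = delta + gamma * tau * prev_A * mask[t]
        v_target[t] = value[t] + A_sa[t]
        prev_value, prev_A = value[t], A_sa[t]
    return A_sa, v_target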