Example #1
 def run(self):
     # record the positions of all done flags (episode boundaries)
     frac = []
     reward_change = False
     for j in range(len(self.dones)):
         start = 0  # fragments are tracked per environment, so reset here
         temp_frac = []
         for i in range(len(self.masks)):
             if self.masks[i][j]:
                 temp_frac.append((start, i))
                 start = i
         if self.dones[j]:
             temp_frac.append((start, len(self.masks)))
         frac.append(temp_frac)
     for i in range(len(frac)):
         for fr in frac[i]:
             st, ed = fr
             if ed > st:
                 # randomly pick one frame from within this episode
                 rand = random.randint(st, ed - 1)
                 act_abs = self.actions[rand][i]
                 goal = feature_utils.extra_goal(act_abs, self.obs_nf[rand][i])
                 # only if the goal is not to stay in place
                 if goal != self.obs_nf[rand][i]['position']:
                     for j in range(rand+1, ed):
                         act_abs_next = self.actions[j][i]
                         goal_next = feature_utils.extra_goal(act_abs_next, self.obs_nf[j][i])
                         # the next goal differs from the base goal; stop
                         if goal_next != goal:
                             break
                         # an ineffective move began; stop
                         if self.obs_nf[j-1][i]['position'] == self.obs_nf[j][i]['position']:
                             break
                         self.rewards[j][i] += 0.05
                         reward_change = True
                         feature_utils.print_info('hindsight: to goal, +0.05', vb=True)
                         if self.obs_nf[j][i]['position'] == goal:
                             self.rewards[j][i] += 0.05
                             feature_utils.print_info('hindsight: arrive goal, +0.05', vb=True)
                             reward_change = True
                             break
     mb_advs = np.zeros_like(self.rewards)
     last_gae_lam = 0
     for step in reversed(range(self.n_steps)):
         if step == self.n_steps - 1:
             nextnonterminal = 1.0 - self.dones
             nextvalues = self.last_values
         else:
             nextnonterminal = 1.0 - self.masks[step + 1]
             nextvalues = self.values[step + 1]
         # delta = r + gamma * v_next - v
         delta = self.rewards[step] + self.gamma * nextvalues * nextnonterminal - self.values[step]
         # adv = delta + gamma * lam * prev_adv
         mb_advs[step] = last_gae_lam = delta + self.gamma * self.lam * nextnonterminal * last_gae_lam
     mb_returns = mb_advs + self.values
     mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs = map(self.swap_and_flatten, (
         self.obs, mb_returns, self.masks, self.actions, self.values, self.neglogpaces))
     return mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs, reward_change
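
For reference, a minimal, self-contained sketch of the GAE(lambda) recursion used in the loop above. The standalone name compute_gae is hypothetical; NumPy arrays of shape (n_steps, n_envs) are assumed:

import numpy as np

def compute_gae(rewards, values, masks, dones, last_values, gamma=0.99, lam=0.95):
    # rewards, values, masks: arrays of shape (n_steps, n_envs);
    # masks[t] marks episode boundaries, dones/last_values describe the final step
    n_steps = rewards.shape[0]
    advs = np.zeros_like(rewards)
    last_gae_lam = 0.0
    for step in reversed(range(n_steps)):
        if step == n_steps - 1:
            nextnonterminal = 1.0 - dones
            nextvalues = last_values
        else:
            nextnonterminal = 1.0 - masks[step + 1]
            nextvalues = values[step + 1]
        # delta = r + gamma * v_next - v
        delta = rewards[step] + gamma * nextvalues * nextnonterminal - values[step]
        # adv = delta + gamma * lam * prev_adv
        advs[step] = last_gae_lam = delta + gamma * lam * nextnonterminal * last_gae_lam
    returns = advs + values  # bootstrapped value targets
    return advs, returns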
Example #2
def _worker(remote, parent_remote, env_fn_wrapper):
    parent_remote.close()
    env = env_fn_wrapper.var()
    # TODO: remember to set the index of the agent being trained
    train_idx = 0  # index of the agent being trained
    teammates = [train_idx, (train_idx + 2) % 4]
    teammates.sort()
    enemies = [(train_idx + 1) % 4, (train_idx + 3) % 4]
    enemies.sort()
    while True:
        try:
            cmd, data = remote.recv()
            if cmd == 'step':
                whole_obs = env.get_observations()
                all_actions = env.act(whole_obs)  # get every agent's action
                # train_act = feature_utils._djikstra_act(whole_obs[train_idx], data, rang=8)
                all_actions[train_idx] = data  # insert the trained agent's action
                # the (obs, reward, done, info) tuple for all agents
                whole_obs, whole_rew, done, info = env.step(all_actions)
                rew = whole_rew[train_idx]  # this step's reward for the trained agent
                win_rate = 0  # win flag for this step (used to report the win rate)
                if done:  # episode over: start a new one
                    # keep the terminal observation; it would be lost after reset
                    info['terminal_observation'] = whole_obs
                    if info['result'] == constants.Result.Win:
                        win_rate = 1
                    # elif info['result'] == constants.Result.Loss:
                    #     win_rate = -1
                    whole_obs = env.reset()  # start a new episode

                obs = feature_utils.featurize(whole_obs[train_idx])
                goal = feature_utils.extra_goal(data, whole_obs[train_idx],
                                                rang=8)  # attach the goal
                # print(goal)
                goal_map = np.zeros((8, 8))
                goal_map[goal] = 1
                goal_map = goal_map.reshape(1, 8, 8)
                obs = np.concatenate((obs, goal_map))

                remote.send((obs, rew, done, win_rate, whole_obs[train_idx]))

            elif cmd == 'reset':
                whole_obs = env.reset()
                obs = feature_utils.featurize(whole_obs[train_idx])
                goal_map = np.zeros((8, 8))
                goal_map[(1, 3)] = 1
                goal_map = goal_map.reshape(1, 8, 8)
                obs = np.concatenate((obs, goal_map))  # append a placeholder goal to the initial obs
                remote.send((obs, whole_obs[train_idx]))

            elif cmd == 'render':
                remote.send(env.render(*data[0], **data[1]))
            elif cmd == 'close':
                remote.close()
                break
            elif cmd == 'get_spaces':
                """增加前三行,注释最后一行。自定义 observation 和 action 的 space"""
                observation_space = feature_utils.get_observertion_space()
                action_space = feature_utils.get_action_space()
                remote.send((observation_space, action_space))
                # remote.send((env.observation_space, env.action_space))
            elif cmd == 'env_method':
                method = getattr(env, data[0])
                remote.send(method(*data[1], **data[2]))
            elif cmd == 'get_attr':
                remote.send(getattr(env, data))
            elif cmd == 'set_attr':
                remote.send(setattr(env, data[0], data[1]))
            else:
                raise NotImplementedError
        except EOFError:
            break
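
A minimal sketch of how a parent process might drive this worker over a multiprocessing pipe. The wiring below is hypothetical (the real project routes it through a SubprocVecEnv-style class), but it follows the command protocol the worker implements:

import multiprocessing as mp

def demo_parent(env_fn_wrapper):
    # one pipe per worker: the parent keeps one end, the child the other
    parent_remote, child_remote = mp.Pipe()
    proc = mp.Process(target=_worker, args=(child_remote, parent_remote, env_fn_wrapper))
    proc.daemon = True  # the worker dies with the parent
    proc.start()
    child_remote.close()  # the child's end is only used inside the worker

    parent_remote.send(('get_spaces', None))
    observation_space, action_space = parent_remote.recv()

    parent_remote.send(('reset', None))
    obs, raw_obs = parent_remote.recv()

    # send an action for the trained agent; assumes the custom action space
    # returned above supports .sample() like a gym space
    parent_remote.send(('step', action_space.sample()))
    obs, rew, done, win_rate, raw_obs = parent_remote.recv()

    parent_remote.send(('close', None))
    proc.join()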
Example #3
File: play.py  Project: tu2id4n/my_pmm
def _play():
    print('----------------------------------------------')
    print('|                  P L A Y                   |')
    print('----------------------------------------------')
    env_id = 'PommeRadioCompetition-v5'
    env = utils.make_env(env_id)

    model_type = 'ppo'
    vb = False
    pretrain = False
    model_path0 = 'models/test/v17_740k.zip'
    # model_path0 = 'models/pretrain_v1/pgn_e118.zip'
    model_path1 = None
    model_path2 = 'models/test/v17_740k.zip'
    model_path3 = None
    model_paths = [model_path0, model_path1, model_path2, model_path3]
    models = utils.get_load_models(model_type, model_paths)

    using_prune = False
    if using_prune:
        prune_agents = [0, 1, 2, 3]  # which agents use action pruning
        nokicks = [True, True, True, True]  # toggle whether kicking is allowed
        print('prune_agents = ', prune_agents)
        print('nokicks', nokicks)

    for episode in range(100):
        obs = env.reset()
        done = False
        prev2s = [(None, None)] * 4
        total_reward = 0
        while not done:
            all_actions = env.act(obs)

            # Use model
            for i in range(len(models)):
                if models[i] is not None:
                    feature_obs = feature_utils.featurize(obs[i])  # , env.position_trav)
                    if pretrain:
                        action, _states = models[i].predict(feature_obs)
                    else:
                        action_abs, _states = models[i].predict(feature_obs)
                        goal_abs = feature_utils.extra_goal(action_abs, obs[i])
                        utils.print_info('action_obs', action_abs, vb)
                        utils.print_info('goal_obs', goal_abs, vb)
                        # action = _djikstra_act(obs[i], action_abs)
                        action = action_abs
                    if isinstance(action, list):
                        action = action[0]
                    # print('action', action)
                    # print('model' + str(i) + ' action: ', action)
                    # if action == 3:
                    #     action = random.randint(0, 5)
                    all_actions[i] = action
                    # print_info('model' + str(i) + ' action: ', action)

            # Use prune
            if using_prune:
                for i in prune_agents:
                    all_actions[i] = utils.get_modify_act(obs[i], all_actions[i]) #, prev2s[i], nokick=nokicks[i])
                    prev2s[i] = utils.get_prev2obs(prev2s[i], obs[i])

            # Adjust actions for the communication variant
            # if args.env == 'PommeRadioCompetition-v2':
                # for i in range(len(all_actions)):
                #     all_actions[i] = [all_actions[i], 1, 1]
            obs, rewards, done, info = env.step(all_actions)
            env.render()
            total_reward += rewards[0]
            if not env._agents[0].is_alive:
                done = True
            # print(all_actions[0])
            # print('reward', rew)
            # print()
        print(info)
        print('total_reward', total_reward)
    env.close()
Example #4
File: play_8m8.py  Project: tu2id4n/my_pmm
def _play():
    print('----------------------------------------------')
    print('|                  P L A Y                   |')
    print('----------------------------------------------')
    env_id = 'OneVsOne-v8'
    print('env = ', env_id)
    agent_list = [
        my_agents.StopAgent(),
        # my_agents.StopAgent(),
        my_agents.SimpleNoBombAgent(),
        # agents.SimpleAgent(),
        # agents.SimpleAgent(),
        # agents.SimpleAgent(),
        # hit18Agent('1'),
        # hit18Agent('3')
    ]
    env = pommerman.make(env_id, agent_list)
    # env = utils.make_env(env_id)

    vb = True
    model_path = 'models/dqn/kl_135k.zip'
    model = DQN.load(load_path=model_path)

    win = 0
    tie = 0
    loss = 0
    for episode in tqdm(range(100)):
        obs = env.reset()
        done = False
        total_reward = 0
        action_abs = 65  # 65 is the initial placeholder before the first prediction
        while not done:
            all_actions = env.act(obs)
            # featurize the input
            feature_obs = feature_utils.featurize(obs[0])
            if action_abs == 65:
                goal_map = np.zeros((8, 8))
                goal_map[(1, 3)] = 1
            else:
                goal = feature_utils.extra_goal(action_abs, obs[0],
                                                rang=8)  # attach the goal
                goal_map = np.zeros((8, 8))
                goal_map[goal] = 1
            goal_map = goal_map.reshape(1, 8, 8)
            feature_obs = np.concatenate((feature_obs, goal_map))
            feature_obs = feature_obs.transpose((1, 2, 0))
            # model prediction
            action_abs, _states = model.predict(feature_obs)
            # goal = feature_utils.extra_goal(action_abs, obs[0], rang=8)
            # print(obs[0])
            # action = feature_utils._djikstra_act(obs[0], action_abs, rang=8)
            # action = feature_utils._djikstra_act_8m8(obs_nf=obs[0], goal_abs=action_abs)

            if isinstance(action_abs, list):
                action_abs = action_abs[0]

            all_actions[0] = action_abs
            # print('act_abs', all_actions[0])
            obs, rewards, done, info = env.step(all_actions)
            # print('reward', rewards[0])
            # print()
            env.render()
            total_reward += rewards[0]
            # if not env._agents[0].is_alive:
            #     done = True
        # print(info)
        # print('total_reward', total_reward)
        print(info['result'])
        print()
        if info['result'] == constants.Result.Win:
            win += 1
        elif info['result'] == constants.Result.Tie:
            tie += 1
        elif info['result'] == constants.Result.Loss:
            loss += 1
    env.close()
    print('Win rate:', win / 100)
    print('Tie rate:', tie / 100)
    print('Loss rate:', loss / 100)
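
The one-hot goal channel built inline in examples #2 and #4 could be factored into a small helper; a sketch (make_goal_map is a hypothetical name, not part of the project):

import numpy as np

def make_goal_map(goal=(1, 3), size=8):
    # a single (1, size, size) plane with a 1 at the goal cell, meant to be
    # concatenated onto the featurized observation as an extra channel
    goal_map = np.zeros((size, size))
    goal_map[goal] = 1
    return goal_map.reshape(1, size, size)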
Example #5
def get_rewards_8m8(agents,
                    step_count,
                    max_steps,
                    whole_obs_pre,
                    whole_obs,
                    act_abs_pres,
                    idx=0):
    # print_info('-------------------| Agent', str(idx) +' |-------------------')
    # print(act_abs_pres)
    print_info(' ', ' ')

    def any_lst_equal(lst, values):
        """Check whether lst equals any of the given lists."""
        return any([lst == v for v in values])

    alive_agents = [num for num, agent in enumerate(agents) \
                    if agent.is_alive]

    obs_pre = copy.deepcopy(whole_obs_pre[idx])
    obs_now = copy.deepcopy(whole_obs[idx])
    act_abs_pre = act_abs_pres
    position_pre = obs_pre['position']
    position_now = obs_now['position']

    bomb_life_now = feature_utils.get_bomb_life(obs_now, rang=8)
    bomb_life_pre = feature_utils.get_bomb_life(obs_pre, rang=8)
    my_bomb_life_now = feature_utils.get_my_bomb_life(bomb_life_now,
                                                      position_now,
                                                      rang=8)

    extrabomb = constants.Item.ExtraBomb.value
    kick = constants.Item.Kick.value
    incrrange = constants.Item.IncrRange.value
    bomb = constants.Item.Bomb.value
    wood = constants.Item.Wood.value
    agent1 = constants.Item.Agent1.value
    agent3 = constants.Item.Agent3.value
    teammate = obs_pre['teammate'].value

    reward = 0

    # killed by a bomb
    # if 0 < bomb_life_now[position_now] < 4:
    #     reward -= 1
    #     print_info('You dead', '-1')

    act_pre = feature_utils._djikstra_act(obs_pre, act_abs_pre,
                                          rang=8)  # used only for classification here
    goal_pre = feature_utils.extra_goal(act_abs_pre, obs_pre, rang=8)
    # the action places a bomb
    if act_pre == 5:
        # placing a bomb with no ammo
        if obs_pre['ammo'] == 0:
            reward -= 0.1
            print_info('No ammo', '-0.1')
        # ammo is available
        else:
            nothing = True
            # the placed bomb can reach wood or an enemy
            for r in range(8):
                for c in range(8):
                    if my_bomb_life_now[(r, c)] > 0:
                        if obs_pre['board'][(r, c)] in [wood]:
                            reward += 0.2
                            nothing = False
                            print_info('bomb -> wood', '+0.2')
                        if obs_pre['board'][(r, c)] in [agent1, agent3]:
                            reward += 0.3
                            nothing = False
                            print_info('bomb -> enemy', '+0.3')
                            # print(obs_pre['board'][(r, c)])
                        if obs_pre['board'][(r, c)] in [
                                incrrange, extrabomb, kick
                        ]:
                            reward -= 0.05
                            print_info('bomb -> powerup', '-0.05')
                        if obs_pre['board'][(r, c)] in [teammate]:
                            reward -= 0.05
                            print_info('bomb -> teammate', '-0.05')
            # if nothing:
            #     reward -= 0.1
            #     # print(obs_pre['board'][(r, c)])
            #     print_info('Useless bomb', '-0.1')
    # no movement
    elif act_pre == 0:
        if obs_pre['position'] != goal_pre:
            print_info('obs_pre', obs_pre['position'])
            print_info('goal_pre', goal_pre)
            reward -= 0.1
            # print(goal_pre)
            print_info('Faulty goal', '-0.1')
    # the action is a move
    else:
        # reward for kicking a bomb
        if position_now != position_pre:
            if obs_pre['can_kick']:
                if obs_pre['board'][goal_pre] == bomb:
                    reward += 0.01
                    print_info('Want to kick', '+0.01')
                if obs_pre['board'][position_now] == bomb:
                    reward += 0.2
                    print_info('Kick', '+0.2')
            # moved from a safe square into a blast zone
            if bomb_life_pre[position_pre] == 0 and bomb_life_now[position_now] > 0:
                reward -= 0.15
                print_info('Enter the explosion range', '-0.15')
            # in a blast zone but moving toward a safe square
            if bomb_life_pre[position_pre] > 0 and bomb_life_pre[goal_pre] == 0:
                reward += 0.05
                print_info('Escape from the explosion range', '+0.05')
            # moving toward an item
            if obs_pre['board'][goal_pre] in [extrabomb, kick, incrrange]:
                reward += 0.01
                print_info('Want an item', '+0.01')
                # picked up the item
                if obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
                    reward += 0.1
                    print_info('Eat an item', '+0.1')
            # picked up an item in passing
            elif obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
                reward += 0.05
                print_info('Passing by an item', '+0.05')

    if len(alive_agents) == 1:
        # An agent won. Give them +1, the other -1.
        print_info('Game Over', int(agents[0].is_alive))
        return [reward + 2 * int(agent.is_alive) - 1 for agent in agents]
    elif step_count >= max_steps:
        # Game is over from time. Everyone gets -1.
        return [reward - 1] * 2
    else:
        # Game running
        return [reward] * 2
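
A sketch of how a custom environment step might feed this shaped reward back to the trainer. The wiring and function name are hypothetical; attribute names such as env._agents follow the Pommerman env internals already used in example #3:

def step_with_shaped_rewards(env, all_actions, whole_obs_pre, act_abs_pre):
    # step the underlying env, then replace its sparse rewards with
    # the shaped ones computed from the before/after observations
    whole_obs, _, done, info = env.step(all_actions)
    rewards = get_rewards_8m8(env._agents, env._step_count, env._max_steps,
                              whole_obs_pre, whole_obs, act_abs_pre, idx=0)
    return whole_obs, rewards, done, info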
Example #6
def get_rewards_v3_8(agents, step_count, max_steps, whole_obs_pre, whole_obs,
                     act_abs_pres, idx):
    # print_info('-------------------| Agent', str(idx) +' |-------------------')
    # print(act_abs_pres)
    def any_lst_equal(lst, values):
        """Check whether lst equals any of the given lists."""
        return any([lst == v for v in values])

    alive_agents = [num for num, agent in enumerate(agents) \
                    if agent.is_alive]

    obs_pre = copy.deepcopy(whole_obs_pre[idx])
    obs_now = copy.deepcopy(whole_obs[idx])
    act_abs_pre = act_abs_pres[idx]
    position_pre = obs_pre['position']
    position_now = obs_now['position']

    bomb_life_now = feature_utils.get_bomb_life(obs_now)
    bomb_life_pre = feature_utils.get_bomb_life(obs_pre)
    my_bomb_life_now = feature_utils.get_my_bomb_life(bomb_life_now,
                                                      position_now)

    extrabomb = constants.Item.ExtraBomb.value
    kick = constants.Item.Kick.value
    incrrange = constants.Item.IncrRange.value
    bomb = constants.Item.Bomb.value
    wood = constants.Item.Wood.value
    agent1 = constants.Item.Agent1.value
    agent3 = constants.Item.Agent3.value
    teammate = obs_pre['teammate'].value

    reward = 0

    # killed by a bomb
    if 0 < bomb_life_now[position_now] < 4:
        reward -= 1
        print_info('You dead', '-1')

    act_pre = feature_utils._djikstra_act(obs_pre, act_abs_pre)  # used only for classification here
    goal_pre = feature_utils.extra_goal(act_abs_pre, obs_pre)
    # the action places a bomb
    if act_pre == 5:
        # placing a bomb with no ammo
        if obs_pre['ammo'] == 0:
            reward -= 0.1
            print_info('No ammo', '-0.1')
        # ammo is available
        else:
            nothing = True
            # the placed bomb can reach wood or an enemy
            for r in range(11):
                for c in range(11):
                    if my_bomb_life_now[(r, c)] > 0:
                        if obs_pre['board'][(r, c)] in [wood]:
                            reward += 0.2
                            nothing = False
                            print_info('bomb -> wood', '+0.2')
                        if obs_pre['board'][(r, c)] in [agent1, agent3]:
                            reward += 0.3
                            nothing = False
                            print_info('bomb -> enemy', '+0.3')
                        if obs_pre['board'][(r, c)] in [
                                incrrange, extrabomb, kick
                        ]:
                            reward -= 0.05
                            print_info('bomb -> powerup', '-0.05')
                        if obs_pre['board'][(r, c)] in [teammate]:
                            reward -= 0.05
                            print_info('bomb -> teammate', '-0.05')
            if nothing:
                reward -= 0.1
                print_info('Useless bomb', '-0.1')
    # no movement
    elif act_pre == 0:
        if obs_pre['position'] != goal_pre:
            reward -= 0.1
            print_info('Faulty goal', '-0.1')
    # the action is a move
    else:
        # reward for kicking a bomb
        if position_now != position_pre:
            if obs_pre['can_kick']:
                if obs_pre['board'][goal_pre] == bomb:
                    reward += 0.01
                    print_info('Want to kick', '+0.01')
                if obs_pre['board'][position_now] == bomb:
                    reward += 0.2
                    print_info('Kick', '+0.2')
            # moved from a safe square into a blast zone
            if bomb_life_pre[position_pre] == 0 and bomb_life_now[position_now] > 0:
                reward -= 0.15
                print_info('Enter the explosion range', '-0.15')
            # in a blast zone but moving toward a safe square
            if bomb_life_pre[position_pre] > 0 and bomb_life_pre[goal_pre] == 0:
                reward += 0.05
                print_info('Escape from the explosion range', '+0.05')
            # moving toward an item
            if obs_pre['board'][goal_pre] in [extrabomb, kick, incrrange]:
                reward += 0.01
                print_info('Want an item', '+0.01')
                # picked up the item
                if obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
                    reward += 0.1
                    print_info('Eat an item', '+0.1')
            # picked up an item in passing
            elif obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
                reward += 0.05
                print_info('Passing by an item', '+0.05')

    # We are playing a team game.
    if any_lst_equal(alive_agents, [[0, 2], [0], [2]]):
        # Team [0, 2] wins.
        print_info('Wins and agent0 alive.', reward + 1)
        return reward + 1
    elif any_lst_equal(alive_agents, [[1, 3]]):
        # Team [1, 3] wins and no enemy dead.
        print_info('Loss and no enemy dead.', reward - 1)
        return reward - 1
    elif any_lst_equal(alive_agents, [[1], [3]]):
        # Team [1, 3] wins and one enemy dead.
        print_info('Loss and one enemy dead.', reward - 0.6)
        return reward - 0.6
    elif step_count >= max_steps and any_lst_equal(
            alive_agents, [[0, 1], [0, 1, 2], [0, 3], [0, 2, 3]]):
        # tie and one enemy dead.
        print_info('Tie and one enemy dead.', reward - 0.6)
        return reward - 0.6
    elif step_count >= max_steps:
        # Game is over by max_steps. All agents tie.
        print_info('Game is over by max_steps. All agents tie.', reward - 1)
        return reward - 1
    elif len(alive_agents) == 0:
        # Everyone's dead. All agents tie.
        print_info('Everyone is dead. All agents tie.', reward)
        return reward
    else:
        # No team has yet won or lost.
        return reward
Example #7
def get_rewards_v3_7(agents, step_count, max_steps, whole_obs_pre, whole_obs,
                     act_abs_pre):
    def any_lst_equal(lst, values):
        """Check whether lst equals any of the given lists."""
        return any([lst == v for v in values])

    alive_agents = [num for num, agent in enumerate(agents) \
                    if agent.is_alive]

    obs_pre = copy.deepcopy(whole_obs_pre[0])
    obs_now = copy.deepcopy(whole_obs[0])

    position_pre = obs_pre['position']
    position_now = obs_now['position']

    bomb_life_now = feature_utils.get_bomb_life(obs_now)
    bomb_life_pre = feature_utils.get_bomb_life(obs_pre)
    my_bomb_life_now = feature_utils.get_my_bomb_life(bomb_life_now,
                                                      position_now)

    extrabomb = constants.Item.ExtraBomb.value
    kick = constants.Item.Kick.value
    incrrange = constants.Item.IncrRange.value
    bomb = constants.Item.Bomb.value
    wood = constants.Item.Wood.value
    agent1 = constants.Item.Agent1.value
    agent3 = constants.Item.Agent3.value
    agent2 = constants.Item.Agent2.value
    e11_pre = feature_utils.extra_position(11, obs_pre['board'])
    e13_pre = feature_utils.extra_position(13, obs_pre['board'])
    e11_now = feature_utils.extra_position(11, obs_now['board'])
    e13_now = feature_utils.extra_position(13, obs_now['board'])

    reward = 0
    # # an enemy was bombed to death
    # if e11 is not None and 0 < bomb_life[e11] < 4:
    #     reward += 0.5
    #     print_info('e11 bombed to death', '+0.5')
    # if e13 is not None and 0 < bomb_life[e13] < 4:
    #     reward += 0.5
    #     print_info('e13 bombed to death', '+0.5')

    # an enemy disappeared from view:
    # if e11_now is None and e11_pre is not None:
    #     reward -= 0.02
    #     print_info('enemy e11 disappeared', '-0.01')
    # if e13_now is None and e13_pre is not None:
    #     reward -= 0.02
    #     print_info('enemy e13 disappeared', '-0.01')
    # if e11_pre is None and e11_now is not None:
    #     reward += 0.01
    #     print_info('enemy e11 appeared', '+0.01')
    # if e13_pre is None and e13_now is not None:
    #     reward += 0.01
    #     print_info('enemy e13 appeared', '+0.01')

    # killed by a bomb
    if 0 < bomb_life_now[position_now] < 4:
        reward -= 1
        print_info('You dead', '-1')

    act_pre = feature_utils._djikstra_act(obs_pre, act_abs_pre)  # used only for classification here
    goal_pre = feature_utils.extra_goal(act_abs_pre, obs_pre)
    # the action places a bomb
    if act_pre == 5:
        # placing a bomb with no ammo
        if obs_pre['ammo'] == 0:
            reward -= 0.1
            print_info('No ammo', '-0.1')
        # ammo is available
        else:
            nothing = True
            # the placed bomb can reach wood or an enemy
            for r in range(11):
                for c in range(11):
                    if my_bomb_life_now[(r, c)] > 0:
                        if obs_pre['board'][(r, c)] in [wood]:
                            reward += 0.2
                            nothing = False
                            print_info('bomb -> wood', '+0.2')
                        if obs_pre['board'][(r, c)] in [agent1, agent3]:
                            reward += 0.3
                            nothing = False
                            print_info('bomb -> enemy', '+0.3')
                        if obs_pre['board'][(r, c)] in [
                                incrrange, extrabomb, kick
                        ]:
                            reward -= 0.05
                            print_info('bomb -> powerup', '-0.05')
                        if obs_pre['board'][(r, c)] in [agent2]:
                            reward -= 0.05
                            print_info('bomb -> teammate', '-0.05')
            if nothing:
                reward -= 0.1
                print_info('Useless bomb', '-0.1')
    # no movement
    elif act_pre == 0:
        if obs_pre['position'] != goal_pre:
            reward -= 0.1
            print_info('Faulty goal', '-0.1')
    # the action is a move
    else:
        # r_pre, c_pre = position_pre
        # r_now, c_now = position_now
        # r_to = r_now - r_pre
        # c_to = c_now - c_pre
        # if (r_to, c_to) == (-1, 0): act_pre = 1
        # if (r_to, c_to) == (1, 0): act_pre = 2
        # if (r_to, c_to) == (0, -1): act_pre = 3
        # if (r_to, c_to) == (0, 1): act_pre = 4
        # an effective move
        # reward += 0.001
        # print_info('effective move', '+0.001')
        # reward for kicking a bomb
        if position_now != position_pre:
            if obs_pre['can_kick']:
                if obs_pre['board'][goal_pre] == bomb:
                    reward += 0.01
                    print_info('Want to kick', '+0.01')
                if obs_pre['board'][position_now] == bomb:
                    reward += 0.2
                    print_info('Kick', '+0.2')
            # moved from a safe square into a blast zone
            if bomb_life_pre[position_pre] == 0 and bomb_life_now[position_now] > 0:
                reward -= 0.15
                print_info('Enter the explosion range', '-0.15')
            # in a blast zone but moving toward a safe square
            if bomb_life_pre[position_pre] > 0 and bomb_life_pre[goal_pre] == 0:
                reward += 0.05
                print_info('Escape from the explosion range', '+0.05')
            # moving toward an item
            if obs_pre['board'][goal_pre] in [extrabomb, kick, incrrange]:
                reward += 0.01
                print_info('Want an item', '+0.01')
                # picked up the item
                if obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
                    reward += 0.1
                    print_info('Eat an item', '+0.1')
            # picked up an item in passing
            elif obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
                reward += 0.05
                print_info('Passing by an item', '+0.05')

    # We are playing a team game.
    if any_lst_equal(alive_agents, [[0, 2], [0], [2]]):
        # Team [0, 2] wins.
        print_info('Wins and agent0 alive.', reward + 1)
        return [reward + 1, -1, 1, -1]
    elif any_lst_equal(alive_agents, [[1, 3]]):
        # Team [1, 3] wins and no enemy dead.
        print_info('Loss and no enemy dead.', reward - 1)
        return [reward - 1, 1, -1, 1]
    elif any_lst_equal(alive_agents, [[1], [3]]):
        # Team [1, 3] wins and one enemy dead.
        print_info('Loss and one enemy dead.', reward - 0.6)
        return [reward - 0.6, 1, -1, 1]
    elif step_count >= max_steps and any_lst_equal(
            alive_agents, [[0, 1], [0, 1, 2], [0, 3], [0, 2, 3]]):
        # tie and one enemy dead.
        print_info('Tie and one enemy dead.', reward - 0.6)
        return [reward - 0.6, 1, -1, 1]
    elif step_count >= max_steps:
        # Game is over by max_steps. All agents tie.
        print_info('Game is over by max_steps. All agents tie.', reward - 1)
        return [reward - 1] * 4
    elif len(alive_agents) == 0:
        # Everyone's dead. All agents tie.
        print_info('Everyone is dead. All agents tie.', reward)
        return [reward] * 4
    else:
        # No team has yet won or lost.
        return [reward] * 4
Example #8
def get_rewards_v3_6(agents, step_count, max_steps, whole_obs_pre, whole_obs,
                     act_abs_pre):
    def any_lst_equal(lst, values):
        """Check whether lst equals any of the given lists."""
        return any([lst == v for v in values])

    alive_agents = [num for num, agent in enumerate(agents) \
                    if agent.is_alive]

    obs_pre = copy.deepcopy(whole_obs_pre[0])
    obs_now = copy.deepcopy(whole_obs[0])

    position_pre = obs_pre['position']
    position_now = obs_now['position']

    bomb_life_now = feature_utils.get_bomb_life(obs_now)
    bomb_life_pre = feature_utils.get_bomb_life(obs_pre)
    my_bomb_life_now = feature_utils.get_my_bomb_life(bomb_life_now,
                                                      position_now)

    extrabomb = constants.Item.ExtraBomb.value
    kick = constants.Item.Kick.value
    incrrange = constants.Item.IncrRange.value
    wood = constants.Item.Wood.value
    agent1 = constants.Item.Agent1.value
    agent3 = constants.Item.Agent3.value
    e11_now = feature_utils.extra_position(11, obs_now['board'])
    e13_now = feature_utils.extra_position(13, obs_now['board'])

    reward = 0
    # # an enemy was bombed to death
    # if e11 is not None and 0 < bomb_life[e11] < 4:
    #     reward += 0.5
    #     print_info('e11 bombed to death', '+0.5')
    # if e13 is not None and 0 < bomb_life[e13] < 4:
    #     reward += 0.5
    #     print_info('e13 bombed to death', '+0.5')

    # killed by a bomb
    if 0 < bomb_life_now[position_now] < 4:
        reward -= 0.5
        print_info('You died from a bomb', '-0.5')

    act_pre = feature_utils._djikstra_act(obs_pre, act_abs_pre)  # used only for classification here
    goal_pre = feature_utils.extra_goal(act_abs_pre, obs_pre)
    # the action places a bomb
    if act_pre == 5:
        # placing a bomb with no ammo
        if obs_pre['ammo'] == 0:
            reward -= 0.1
            print_info('No ammo', '-0.1')
        # ammo is available
        else:
            # the placed bomb can reach wood or an enemy
            for r in range(11):
                for c in range(11):
                    if my_bomb_life_now[(r, c)] > 0:
                        if obs_pre['board'][(r, c)] in [wood]:
                            reward += 0.2
                            print_info('bomb -> wood', '+0.2')
                        if obs_pre['board'][(r, c)] in [agent1, agent3]:
                            reward += 0.3
                            print_info('bomb -> enemy', '+0.3')
    # no movement
    elif act_pre == 0:
        if obs_pre['position'] != goal_pre:
            reward -= 0.01
            print_info('Invalid move', '-0.01')
    # the action is a move
    else:
        # an effective move
        # reward += 0.001
        # print_info('effective move', '+0.001')
        # in a blast zone but moving toward a safe square
        if bomb_life_pre[position_pre] > 0 and bomb_life_pre[goal_pre] == 0:
            reward += 0.05
            print_info('Escape from the explosion range', '+0.05')
        # moving toward an item
        if obs_pre['board'][goal_pre] in [extrabomb, kick, incrrange]:
            reward += 0.01
            print_info('Moving toward an item', '+0.01')
            # picked up the item
            if obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
                reward += 0.3
                print_info('Moved toward an item and picked it up', '+0.3')
        # picked up an item in passing
        elif obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
            reward += 0.05
            print_info('Picked up an item in passing', '+0.05')

    # We are playing a team game.
    if any_lst_equal(alive_agents, [[0, 2], [0], [2]]):
        # Team [0, 2] wins.
        print_info('Team [0, 2] wins and agent0 alive.', reward + 1)
        return [reward + 1, -1, 1, -1]
    elif any_lst_equal(alive_agents, [[1, 3]]):
        # Team [1, 3] wins and no enemy dead.
        print_info('Team [1, 3] wins and no enemy dead.', reward - 1)
        return [reward - 1, 1, -1, 1]
    elif any_lst_equal(alive_agents, [[1], [3]]):
        # Team [1, 3] wins and one enemy dead.
        print_info('Team [1, 3] wins and one enemy dead.', reward + 0.5)
        return [reward + 0.5, 1, -1, 1]
    elif step_count >= max_steps and any_lst_equal(
            alive_agents, [[0, 1], [0, 1, 2], [0, 3], [0, 2, 3]]):
        # tie and one enemy dead.
        print_info('tie and one enemy dead.', reward + 0.5)
        return [reward + 0.5, 1, -1, 1]
    elif step_count >= max_steps:
        # Game is over by max_steps. All agents tie.
        print_info('Game is over by max_steps. All agents tie.', reward - 1)
        return [reward - 1] * 4
    elif len(alive_agents) == 0:
        # Everyone's dead. All agents tie.
        print_info('Everyone is dead. All agents tie.', reward + 0.5)
        return [reward + 0.5] * 4
    else:
        # No team has yet won or lost.
        return [reward] * 4