def run(self):
    # Record the boundaries of every episode (marked by masks/dones).
    start = 0
    frac = []
    reward_change = False
    for j in range(len(self.dones)):
        temp_frac = []
        for i in range(len(self.masks)):
            if self.masks[i][j]:
                temp_frac.append((start, i))
                start = i
        if self.dones[j]:
            temp_frac.append((start, len(self.masks)))
        frac.append(temp_frac)

    for i in range(len(frac)):
        for fr in frac[i]:
            st, ed = fr
            if ed > st:
                # Sample one random frame inside the episode.
                rand = random.randint(st, ed - 1)
                act_abs = self.actions[rand][i]
                goal = feature_utils.extra_goal(act_abs, self.obs_nf[rand][i])
                # Only shape rewards if the goal is not to stay in place.
                if goal != self.obs_nf[rand][i]['position']:
                    for j in range(rand + 1, ed):
                        act_abs_next = self.actions[j][i]
                        goal_next = feature_utils.extra_goal(act_abs_next, self.obs_nf[j][i])
                        # The next goal differs from the sampled one: stop.
                        if goal_next != goal:
                            break
                        # The move was ineffective (position unchanged): stop.
                        if self.obs_nf[j - 1][i]['position'] == self.obs_nf[j][i]['position']:
                            break
                        self.rewards[j][i] += 0.05
                        reward_change = True
                        feature_utils.print_info('hindsight: to goal, +0.05', vb=True)
                        if self.obs_nf[j][i]['position'] == goal:
                            self.rewards[j][i] += 0.05
                            feature_utils.print_info('hindsight: arrive goal, +0.05', vb=True)
                            reward_change = True
                            break

    mb_advs = np.zeros_like(self.rewards)
    last_gae_lam = 0
    for step in reversed(range(self.n_steps)):
        if step == self.n_steps - 1:
            nextnonterminal = 1.0 - self.dones
            nextvalues = self.last_values
        else:
            nextnonterminal = 1.0 - self.masks[step + 1]
            nextvalues = self.values[step + 1]
        # delta = r + gamma * v' - v
        delta = self.rewards[step] + self.gamma * nextvalues * nextnonterminal - self.values[step]
        # adv = delta + gamma * lam * adv_next
        mb_advs[step] = last_gae_lam = delta + self.gamma * self.lam * nextnonterminal * last_gae_lam
    mb_returns = mb_advs + self.values

    mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs = map(
        self.swap_and_flatten,
        (self.obs, mb_returns, self.masks, self.actions, self.values, self.neglogpaces))
    return mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs, reward_change
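
# A minimal, self-contained sketch of the GAE recursion that run() applies to
# its rollout buffers, reduced to scalar per-step data. Here `toy_gae` and its
# arguments are illustrative names, not part of this codebase, and dones[t]
# simply flags step t as terminal (slightly simpler bookkeeping than the
# masks/dones split above).
import numpy as np

def toy_gae(rewards, values, last_value, dones, gamma=0.99, lam=0.95):
    """delta_t = r_t + gamma * v_{t+1} - v_t;  adv_t = delta_t + gamma * lam * adv_{t+1}."""
    n = len(rewards)
    advs = np.zeros(n)
    last_gae = 0.0
    for t in reversed(range(n)):
        next_nonterminal = 1.0 - dones[t]          # mask the bootstrap at episode ends
        next_value = last_value if t == n - 1 else values[t + 1]
        delta = rewards[t] + gamma * next_value * next_nonterminal - values[t]
        advs[t] = last_gae = delta + gamma * lam * next_nonterminal * last_gae
    return advs, advs + values                     # (advantages, returns)

# Example: a three-step rollout whose episode ends at t=2.
advs, rets = toy_gae(np.array([0.1, 0.0, 1.0]),
                     np.array([0.5, 0.4, 0.6]),
                     last_value=0.0,
                     dones=np.array([0.0, 0.0, 1.0]))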
def _worker(remote, parent_remote, env_fn_wrapper):
    parent_remote.close()
    env = env_fn_wrapper.var()
    # TODO: remember to set the index of the agent being trained.
    train_idx = 0  # index of the agent being trained
    teammates = [train_idx, (train_idx + 2) % 4]
    teammates.sort()
    enemies = [(train_idx + 1) % 4, (train_idx + 3) % 4]
    enemies.sort()
    while True:
        try:
            cmd, data = remote.recv()
            if cmd == 'step':
                whole_obs = env.get_observations()
                all_actions = env.act(whole_obs)  # collect actions from all agents
                # train_act = feature_utils._djikstra_act(whole_obs[train_idx], data, rang=8)
                all_actions[train_idx] = data  # insert the trained agent's action
                whole_obs, whole_rew, done, info = env.step(all_actions)  # step tuple for all agents
                rew = whole_rew[train_idx]  # the trained agent's reward for this step
                win_rate = 0  # report whether this episode was a win
                if done:  # episode ended: start a new one
                    # Keep the terminal observation; it is lost after reset.
                    info['terminal_observation'] = whole_obs
                    if info['result'] == constants.Result.Win:
                        win_rate = 1
                    # elif info['result'] == constants.Result.Loss:
                    #     win_rate = -1
                    whole_obs = env.reset()  # start a new episode
                obs = feature_utils.featurize(whole_obs[train_idx])
                goal = feature_utils.extra_goal(data, whole_obs[train_idx], rang=8)  # derive the goal
                # print(goal)
                goal_map = np.zeros((8, 8))
                goal_map[goal] = 1
                goal_map = goal_map.reshape(1, 8, 8)
                obs = np.concatenate((obs, goal_map))
                remote.send((obs, rew, done, win_rate, whole_obs[train_idx]))
            elif cmd == 'reset':
                whole_obs = env.reset()
                obs = feature_utils.featurize(whole_obs[train_idx])
                goal_map = np.zeros((8, 8))
                goal_map[(1, 3)] = 1
                goal_map = goal_map.reshape(1, 8, 8)
                obs = np.concatenate((obs, goal_map))  # initial obs gets a placeholder goal
                remote.send((obs, whole_obs[train_idx]))
            elif cmd == 'render':
                remote.send(env.render(*data[0], **data[1]))
            elif cmd == 'close':
                remote.close()
                break
            elif cmd == 'get_spaces':
                # Custom observation and action spaces; the stock reply
                # (commented out below) is replaced by the three lines here.
                observation_space = feature_utils.get_observertion_space()
                action_space = feature_utils.get_action_space()
                remote.send((observation_space, action_space))
                # remote.send((env.observation_space, env.action_space))
            elif cmd == 'env_method':
                method = getattr(env, data[0])
                remote.send(method(*data[1], **data[2]))
            elif cmd == 'get_attr':
                remote.send(getattr(env, data))
            elif cmd == 'set_attr':
                remote.send(setattr(env, data[0], data[1]))
            else:
                raise NotImplementedError
        except EOFError:
            break
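
# A sketch of how _worker is typically driven from the parent process over a
# multiprocessing Pipe, mirroring the SubprocVecEnv protocol it implements.
# `launch_worker` is a hypothetical name; env_fn_wrapper must expose .var()
# returning the env, as _worker expects (stable-baselines' CloudpickleWrapper
# around an env factory fits).
from multiprocessing import Pipe, Process

def launch_worker(env_fn_wrapper):
    parent_remote, child_remote = Pipe()
    proc = Process(target=_worker,
                   args=(child_remote, parent_remote, env_fn_wrapper),
                   daemon=True)
    proc.start()
    child_remote.close()  # the child end now lives only in the worker process
    return parent_remote, proc

# Usage, matching the ('reset'/'step', data) messages _worker handles:
#   parent_remote.send(('reset', None)); obs, raw_obs = parent_remote.recv()
#   parent_remote.send(('step', action)); obs, rew, done, win_rate, raw_obs = parent_remote.recv()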
def _play():
    print('----------------------------------------------')
    print('|                 P L A Y                    |')
    print('----------------------------------------------')
    env_id = 'PommeRadioCompetition-v5'
    env = utils.make_env(env_id)
    model_type = 'ppo'
    vb = False
    pretrain = False
    model_path0 = 'models/test/v17_740k.zip'
    # model_path0 = 'models/pretrain_v1/pgn_e118.zip'
    model_path1 = None
    model_path2 = 'models/test/v17_740k.zip'
    model_path3 = None
    model_paths = [model_path0, model_path1, model_path2, model_path3]
    models = utils.get_load_models(model_type, model_paths)

    using_prune = False
    if using_prune:
        prune_agents = [0, 1, 2, 3]  # which agents use action pruning
        nokicks = [True, True, True, True]  # whether kicking is disabled per agent
        print('prune_agents = ', prune_agents)
        print('nokicks', nokicks)

    for episode in range(100):
        obs = env.reset()
        done = False
        prev2s = [(None, None)] * 4
        total_reward = 0
        while not done:
            all_actions = env.act(obs)
            # Use model
            for i in range(len(models)):
                if models[i] is not None:
                    feature_obs = feature_utils.featurize(obs[i])  # , env.position_trav)
                    if pretrain:
                        action, _states = models[i].predict(feature_obs)
                    else:
                        action_abs, _states = models[i].predict(feature_obs)
                        goal_abs = feature_utils.extra_goal(action_abs, obs[i])
                        utils.print_info('action_obs', action_abs, vb)
                        utils.print_info('goal_obs', goal_abs, vb)
                        # action = _djikstra_act(obs[i], action_abs)
                        action = action_abs
                    if isinstance(action, list):
                        action = action[0]
                    all_actions[i] = action
            # Use prune
            if using_prune:
                for i in prune_agents:
                    all_actions[i] = utils.get_modify_act(obs[i], all_actions[i])  # , prev2s[i], nokick=nokicks[i])
                    prev2s[i] = utils.get_prev2obs(prev2s[i], obs[i])
            # Adapt actions to the radio (communication) env format:
            # if args.env == 'PommeRadioCompetition-v2':
            #     for i in range(len(all_actions)):
            #         all_actions[i] = [all_actions[i], 1, 1]
            obs, rewards, done, info = env.step(all_actions)
            env.render()
            total_reward += rewards[0]
            if not env._agents[0].is_alive:
                done = True
        print(info)
        print('total_reward', total_reward)
    env.close()
def _play():
    print('----------------------------------------------')
    print('|                 P L A Y                    |')
    print('----------------------------------------------')
    env_id = 'OneVsOne-v8'
    print('env = ', env_id)
    agent_list = [
        my_agents.StopAgent(),
        # my_agents.StopAgent(),
        my_agents.SimpleNoBombAgent(),
        # agents.SimpleAgent(),
        # agents.SimpleAgent(),
        # agents.SimpleAgent(),
        # hit18Agent('1'),
        # hit18Agent('3')
    ]
    env = pommerman.make(env_id, agent_list)
    # env = utils.make_env(env_id)
    vb = True
    model_path = 'models/dqn/kl_135k.zip'
    model = DQN.load(load_path=model_path)

    win = 0
    tie = 0
    loss = 0
    for episode in tqdm(range(100)):
        obs = env.reset()
        done = False
        total_reward = 0
        action_abs = 65  # sentinel: no abstract action predicted yet
        while not done:
            all_actions = env.act(obs)
            # Featurize the input.
            feature_obs = feature_utils.featurize(obs[0])
            if action_abs == 65:
                goal_map = np.zeros((8, 8))
                goal_map[(1, 3)] = 1
            else:
                goal = feature_utils.extra_goal(action_abs, obs[0], rang=8)  # insert the goal
                goal_map = np.zeros((8, 8))
                goal_map[goal] = 1
            goal_map = goal_map.reshape(1, 8, 8)
            feature_obs = np.concatenate((feature_obs, goal_map))
            feature_obs = feature_obs.transpose((1, 2, 0))
            # Model prediction.
            action_abs, _states = model.predict(feature_obs)
            # goal = feature_utils.extra_goal(action_abs, obs[0], rang=8)
            # action = feature_utils._djikstra_act(obs[0], action_abs, rang=8)
            # action = feature_utils._djikstra_act_8m8(obs_nf=obs[0], goal_abs=action_abs)
            if isinstance(action_abs, list):
                action_abs = action_abs[0]
            all_actions[0] = action_abs
            obs, rewards, done, info = env.step(all_actions)
            env.render()
            total_reward += rewards[0]
            # if not env._agents[0].is_alive:
            #     done = True
        print(info['result'])
        print()
        if info['result'] == constants.Result.Win:
            win += 1
        elif info['result'] == constants.Result.Tie:
            tie += 1
        elif info['result'] == constants.Result.Loss:
            loss += 1
    env.close()
    print('Win rate:', win / 100)
    print('Tie rate:', tie / 100)
    print('Loss rate:', loss / 100)
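
# Both _worker and the DQN _play above append a one-hot 8x8 "goal" plane to
# the featurized observation. A small helper capturing that shared logic
# might look like the sketch below; `append_goal_plane` is a hypothetical
# name, not a function in this codebase.
import numpy as np

def append_goal_plane(feature_obs, goal, rang=8):
    """Stack a one-hot goal plane onto a (C, rang, rang) feature tensor."""
    goal_map = np.zeros((rang, rang))
    goal_map[goal] = 1  # goal is a (row, col) tuple
    return np.concatenate((feature_obs, goal_map.reshape(1, rang, rang)))

# Usage (shapes only): append_goal_plane(np.zeros((14, 8, 8)), (1, 3)).shape == (15, 8, 8)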
def get_rewards_8m8(agents, step_count, max_steps, whole_obs_pre, whole_obs,
                    act_abs_pres, idx=0):
    # print_info('-------------------| Agent', str(idx) + ' |-------------------')
    # print(act_abs_pres)
    print_info(' ', ' ')

    def any_lst_equal(lst, values):
        """Checks whether lst equals any of the given lists."""
        return any([lst == v for v in values])

    alive_agents = [num for num, agent in enumerate(agents) if agent.is_alive]
    obs_pre = copy.deepcopy(whole_obs_pre[idx])
    obs_now = copy.deepcopy(whole_obs[idx])
    act_abs_pre = act_abs_pres
    position_pre = obs_pre['position']
    position_now = obs_now['position']
    bomb_life_now = feature_utils.get_bomb_life(obs_now, rang=8)
    bomb_life_pre = feature_utils.get_bomb_life(obs_pre, rang=8)
    my_bomb_life_now = feature_utils.get_my_bomb_life(bomb_life_now, position_now, rang=8)
    extrabomb = constants.Item.ExtraBomb.value
    kick = constants.Item.Kick.value
    incrrange = constants.Item.IncrRange.value
    bomb = constants.Item.Bomb.value
    wood = constants.Item.Wood.value
    agent1 = constants.Item.Agent1.value
    agent3 = constants.Item.Agent3.value
    teamate = obs_pre['teammate'].value
    reward = 0

    # The agent is caught in a blast (disabled here):
    # if 0 < bomb_life_now[position_now] < 4:
    #     reward -= 1
    #     print_info('You dead', '-1')

    act_pre = feature_utils._djikstra_act(obs_pre, act_abs_pre, rang=8)  # used only for the checks below
    goal_pre = feature_utils.extra_goal(act_abs_pre, obs_pre, rang=8)
    # Placing a bomb
    if act_pre == 5:
        # Placed a bomb with no ammo.
        if obs_pre['ammo'] == 0:
            reward -= 0.1
            print_info('No ammo', '-0.1')
        # There is ammo.
        else:
            nothing = True
            # The bomb's blast covers wood / an enemy.
            for r in range(8):
                for c in range(8):
                    if my_bomb_life_now[(r, c)] > 0:
                        if obs_pre['board'][(r, c)] in [wood]:
                            reward += 0.2
                            nothing = False
                            print_info('bomb -> wood', '+0.2')
                        if obs_pre['board'][(r, c)] in [agent1, agent3]:
                            reward += 0.3
                            nothing = False
                            print_info('bomb -> enemy', '+0.3')
                        if obs_pre['board'][(r, c)] in [incrrange, extrabomb, kick]:
                            reward -= 0.05
                            print_info('bomb -> powerup', '-0.05')
                        if obs_pre['board'][(r, c)] in [teamate]:
                            reward -= 0.05
                            print_info('bomb -> teammate', '-0.05')
            # if nothing:
            #     reward -= 0.1
            #     print_info('Useless bomb', '-0.1')
    # No movement
    elif act_pre == 0:
        if obs_pre['position'] != goal_pre:
            print_info('obs_pre', obs_pre['position'])
            print_info('goal_pre', goal_pre)
            reward -= 0.1
            print_info('Faulty goal', '-0.1')
    # Movement
    else:
        # Reward for kicking a bomb.
        if position_now != position_pre:
            if obs_pre['can_kick']:
                if obs_pre['board'][goal_pre] == bomb:
                    reward += 0.01
                    print_info('Want to kick', '+0.01')
                if obs_pre['board'][position_now] == bomb:
                    reward += 0.2
                    print_info('Kick', '+0.2')
        # Moved from a safe tile into a blast zone.
        if bomb_life_pre[position_pre] == 0 and bomb_life_now[position_now] > 0:
            reward -= 0.15
            print_info('Enter the explosion range', '-0.15')
        # In a blast zone but moving toward a safe tile.
        if bomb_life_pre[position_pre] > 0 and bomb_life_pre[goal_pre] == 0:
            reward += 0.05
            print_info('Escape from the explosion range', '+0.05')
        # Moving toward an item.
        if obs_pre['board'][goal_pre] in [extrabomb, kick, incrrange]:
            reward += 0.01
            print_info('Want an Item', '+0.01')
            # Picked the item up.
            if obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
                reward += 0.1
                print_info('Eat an Item', '+0.1')
        # Picked an item up in passing.
        elif obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
            reward += 0.05
            print_info('Passing by an Item', '+0.05')

    if len(alive_agents) == 1:
        # An agent won. Give them +1, the other -1.
        print_info('Game Over', int(agents[0].is_alive))
        return [reward + 2 * int(agent.is_alive) - 1 for agent in agents]
    elif step_count >= max_steps:
        # Game is over from time. Everyone gets -1.
        return [reward - 1] * 2
    else:
        # Game is still running.
        return [reward] * 2
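
# The nested 8x8 scan in get_rewards_8m8 classifies every tile the freshly
# placed bomb can reach. A behavior-preserving, more idiomatic sketch of that
# scan using np.argwhere; `blast_shaping` is a hypothetical helper with the
# same reward constants passed in explicitly.
import numpy as np

def blast_shaping(board, my_bomb_life, wood, enemies, powerups, teammate):
    """Return the shaping term for the tiles covered by the agent's fresh bomb."""
    reward = 0.0
    for r, c in np.argwhere(my_bomb_life > 0):
        tile = board[(r, c)]
        if tile == wood:
            reward += 0.2      # blast reaches wood
        if tile in enemies:
            reward += 0.3      # blast reaches an enemy
        if tile in powerups:
            reward -= 0.05     # blast would destroy a power-up
        if tile == teammate:
            reward -= 0.05     # blast reaches the teammate
    return reward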
def get_rewards_v3_8(agents, step_count, max_steps, whole_obs_pre, whole_obs,
                     act_abs_pres, idx):
    # print_info('-------------------| Agent', str(idx) + ' |-------------------')
    # print(act_abs_pres)

    def any_lst_equal(lst, values):
        """Checks whether lst equals any of the given lists."""
        return any([lst == v for v in values])

    alive_agents = [num for num, agent in enumerate(agents) if agent.is_alive]
    obs_pre = copy.deepcopy(whole_obs_pre[idx])
    obs_now = copy.deepcopy(whole_obs[idx])
    act_abs_pre = act_abs_pres[idx]
    position_pre = obs_pre['position']
    position_now = obs_now['position']
    bomb_life_now = feature_utils.get_bomb_life(obs_now)
    bomb_life_pre = feature_utils.get_bomb_life(obs_pre)
    my_bomb_life_now = feature_utils.get_my_bomb_life(bomb_life_now, position_now)
    extrabomb = constants.Item.ExtraBomb.value
    kick = constants.Item.Kick.value
    incrrange = constants.Item.IncrRange.value
    bomb = constants.Item.Bomb.value
    wood = constants.Item.Wood.value
    agent1 = constants.Item.Agent1.value
    agent3 = constants.Item.Agent3.value
    teamate = obs_pre['teammate'].value
    reward = 0

    # The agent is caught in a blast (about to die).
    if 0 < bomb_life_now[position_now] < 4:
        reward -= 1
        print_info('You dead', '-1')

    act_pre = feature_utils._djikstra_act(obs_pre, act_abs_pre)  # used only for the checks below
    goal_pre = feature_utils.extra_goal(act_abs_pre, obs_pre)
    # Placing a bomb
    if act_pre == 5:
        # Placed a bomb with no ammo.
        if obs_pre['ammo'] == 0:
            reward -= 0.1
            print_info('No ammo', '-0.1')
        # There is ammo.
        else:
            nothing = True
            # The bomb's blast covers wood / an enemy.
            for r in range(11):
                for c in range(11):
                    if my_bomb_life_now[(r, c)] > 0:
                        if obs_pre['board'][(r, c)] in [wood]:
                            reward += 0.2
                            nothing = False
                            print_info('bomb -> wood', '+0.2')
                        if obs_pre['board'][(r, c)] in [agent1, agent3]:
                            reward += 0.3
                            nothing = False
                            print_info('bomb -> enemy', '+0.3')
                        if obs_pre['board'][(r, c)] in [incrrange, extrabomb, kick]:
                            reward -= 0.05
                            print_info('bomb -> powerup', '-0.05')
                        if obs_pre['board'][(r, c)] in [teamate]:
                            reward -= 0.05
                            print_info('bomb -> teammate', '-0.05')
            if nothing:
                reward -= 0.1
                print_info('Useless bomb', '-0.1')
    # No movement
    elif act_pre == 0:
        if obs_pre['position'] != goal_pre:
            reward -= 0.1
            print_info('Faulty goal', '-0.1')
    # Movement
    else:
        # Reward for kicking a bomb.
        if position_now != position_pre:
            if obs_pre['can_kick']:
                if obs_pre['board'][goal_pre] == bomb:
                    reward += 0.01
                    print_info('Want to kick', '+0.01')
                if obs_pre['board'][position_now] == bomb:
                    reward += 0.2
                    print_info('Kick', '+0.2')
        # Moved from a safe tile into a blast zone.
        if bomb_life_pre[position_pre] == 0 and bomb_life_now[position_now] > 0:
            reward -= 0.15
            print_info('Enter the explosion range', '-0.15')
        # In a blast zone but moving toward a safe tile.
        if bomb_life_pre[position_pre] > 0 and bomb_life_pre[goal_pre] == 0:
            reward += 0.05
            print_info('Escape from the explosion range', '+0.05')
        # Moving toward an item.
        if obs_pre['board'][goal_pre] in [extrabomb, kick, incrrange]:
            reward += 0.01
            print_info('Want an Item', '+0.01')
            # Picked the item up.
            if obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
                reward += 0.1
                print_info('Eat an Item', '+0.1')
        # Picked an item up in passing.
        elif obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
            reward += 0.05
            print_info('Passing by an Item', '+0.05')

    # We are playing a team game.
    if any_lst_equal(alive_agents, [[0, 2], [0], [2]]):
        # Team [0, 2] wins.
        print_info('Wins and agent0 alive.', reward + 1)
        return reward + 1
    elif any_lst_equal(alive_agents, [[1, 3]]):
        # Team [1, 3] wins and no enemy dead.
        print_info('Loss and no enemy dead.', reward - 1)
        return reward - 1
    elif any_lst_equal(alive_agents, [[1], [3]]):
        # Team [1, 3] wins and one enemy dead.
        print_info('Loss and one enemy dead.', reward - 0.6)
        return reward - 0.6
    elif step_count >= max_steps and any_lst_equal(
            alive_agents, [[0, 1], [0, 1, 2], [0, 3], [0, 2, 3]]):
        # Tie and one enemy dead.
        print_info('Tie and one enemy dead.', reward - 0.6)
        return reward - 0.6
    elif step_count >= max_steps:
        # Game is over by max_steps. All agents tie.
        print_info('Game is over by max_steps. All agents tie.', reward - 1)
        return reward - 1
    elif len(alive_agents) == 0:
        # Everyone's dead. All agents tie.
        print_info('Everyone is dead. All agents tie.', reward)
        return reward
    else:
        # No team has yet won or lost.
        return reward
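
# The terminal branches above key on the exact set of surviving agents. For
# example, with agents 0 and 3 alive at max_steps, the "tie and one enemy
# dead" branch fires; a quick check of that membership test:
alive_agents = [0, 3]
assert any(alive_agents == v for v in [[0, 1], [0, 1, 2], [0, 3], [0, 2, 3]])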
def get_rewards_v3_7(agents, step_count, max_steps, whole_obs_pre, whole_obs,
                     act_abs_pre):
    def any_lst_equal(lst, values):
        """Checks whether lst equals any of the given lists."""
        return any([lst == v for v in values])

    alive_agents = [num for num, agent in enumerate(agents) if agent.is_alive]
    obs_pre = copy.deepcopy(whole_obs_pre[0])
    obs_now = copy.deepcopy(whole_obs[0])
    position_pre = obs_pre['position']
    position_now = obs_now['position']
    bomb_life_now = feature_utils.get_bomb_life(obs_now)
    bomb_life_pre = feature_utils.get_bomb_life(obs_pre)
    my_bomb_life_now = feature_utils.get_my_bomb_life(bomb_life_now, position_now)
    extrabomb = constants.Item.ExtraBomb.value
    kick = constants.Item.Kick.value
    incrrange = constants.Item.IncrRange.value
    bomb = constants.Item.Bomb.value
    wood = constants.Item.Wood.value
    agent1 = constants.Item.Agent1.value
    agent3 = constants.Item.Agent3.value
    agent2 = constants.Item.Agent2.value
    e11_pre = feature_utils.extra_position(11, obs_pre['board'])
    e13_pre = feature_utils.extra_position(13, obs_pre['board'])
    e11_now = feature_utils.extra_position(11, obs_now['board'])
    e13_now = feature_utils.extra_position(13, obs_now['board'])
    reward = 0

    # An enemy is killed by a blast (disabled):
    # if e11 is not None and 0 < bomb_life[e11] < 4:
    #     reward += 0.5
    #     print_info('e11 dead', '+0.5')
    # if e13 is not None and 0 < bomb_life[e13] < 4:
    #     reward += 0.5
    #     print_info('e13 dead', '+0.5')

    # An enemy left / entered the field of view (disabled):
    # if e11_now is None and e11_pre is not None:
    #     reward -= 0.02
    #     print_info('e11 disappeared', '-0.01')
    # if e13_now is None and e13_pre is not None:
    #     reward -= 0.02
    #     print_info('e13 disappeared', '-0.01')
    # if e11_pre is None and e11_now is not None:
    #     reward += 0.01
    #     print_info('e11 appeared', '+0.01')
    # if e13_pre is None and e13_now is not None:
    #     reward += 0.01
    #     print_info('e13 appeared', '+0.01')

    # The agent is caught in a blast (about to die).
    if 0 < bomb_life_now[position_now] < 4:
        reward -= 1
        print_info('You dead', '-1')

    act_pre = feature_utils._djikstra_act(obs_pre, act_abs_pre)  # used only for the checks below
    goal_pre = feature_utils.extra_goal(act_abs_pre, obs_pre)
    # Placing a bomb
    if act_pre == 5:
        # Placed a bomb with no ammo.
        if obs_pre['ammo'] == 0:
            reward -= 0.1
            print_info('No ammo', '-0.1')
        # There is ammo.
        else:
            nothing = True
            # The bomb's blast covers wood / an enemy.
            for r in range(11):
                for c in range(11):
                    if my_bomb_life_now[(r, c)] > 0:
                        if obs_pre['board'][(r, c)] in [wood]:
                            reward += 0.2
                            nothing = False
                            print_info('bomb -> wood', '+0.2')
                        if obs_pre['board'][(r, c)] in [agent1, agent3]:
                            reward += 0.3
                            nothing = False
                            print_info('bomb -> enemy', '+0.3')
                        if obs_pre['board'][(r, c)] in [incrrange, extrabomb, kick]:
                            reward -= 0.05
                            print_info('bomb -> powerup', '-0.05')
                        if obs_pre['board'][(r, c)] in [agent2]:
                            reward -= 0.05
                            print_info('bomb -> teammate', '-0.05')
            if nothing:
                reward -= 0.1
                print_info('Useless bomb', '-0.1')
    # No movement
    elif act_pre == 0:
        if obs_pre['position'] != goal_pre:
            reward -= 0.1
            print_info('Faulty goal', '-0.1')
    # Movement
    else:
        # Decoding the actual move direction (disabled):
        # r_pre, c_pre = position_pre
        # r_now, c_now = position_now
        # r_to, c_to = r_now - r_pre, c_now - c_pre
        # if (r_to, c_to) == (-1, 0): act_pre = 1
        # if (r_to, c_to) == (1, 0): act_pre = 2
        # if (r_to, c_to) == (0, -1): act_pre = 3
        # if (r_to, c_to) == (0, 1): act_pre = 4
        # A valid move (disabled):
        # reward += 0.001
        # print_info('valid move', '+0.001')
        # Reward for kicking a bomb.
        if position_now != position_pre:
            if obs_pre['can_kick']:
                if obs_pre['board'][goal_pre] == bomb:
                    reward += 0.01
                    print_info('Want to kick', '+0.01')
                if obs_pre['board'][position_now] == bomb:
                    reward += 0.2
                    print_info('Kick', '+0.2')
        # Moved from a safe tile into a blast zone.
        if bomb_life_pre[position_pre] == 0 and bomb_life_now[position_now] > 0:
            reward -= 0.15
            print_info('Enter the explosion range', '-0.15')
        # In a blast zone but moving toward a safe tile.
        if bomb_life_pre[position_pre] > 0 and bomb_life_pre[goal_pre] == 0:
            reward += 0.05
            print_info('Escape from the explosion range', '+0.05')
        # Moving toward an item.
        if obs_pre['board'][goal_pre] in [extrabomb, kick, incrrange]:
            reward += 0.01
            print_info('Want an Item', '+0.01')
            # Picked the item up.
            if obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
                reward += 0.1
                print_info('Eat an Item', '+0.1')
        # Picked an item up in passing.
        elif obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
            reward += 0.05
            print_info('Passing by an Item', '+0.05')

    # We are playing a team game.
    if any_lst_equal(alive_agents, [[0, 2], [0], [2]]):
        # Team [0, 2] wins.
        print_info('Wins and agent0 alive.', reward + 1)
        return [reward + 1, -1, 1, -1]
    elif any_lst_equal(alive_agents, [[1, 3]]):
        # Team [1, 3] wins and no enemy dead.
        print_info('Loss and no enemy dead.', reward - 1)
        return [reward - 1, 1, -1, 1]
    elif any_lst_equal(alive_agents, [[1], [3]]):
        # Team [1, 3] wins and one enemy dead.
        print_info('Loss and one enemy dead.', reward - 0.6)
        return [reward - 0.6, 1, -1, 1]
    elif step_count >= max_steps and any_lst_equal(
            alive_agents, [[0, 1], [0, 1, 2], [0, 3], [0, 2, 3]]):
        # Tie and one enemy dead.
        print_info('Tie and one enemy dead.', reward - 0.6)
        return [reward - 0.6, 1, -1, 1]
    elif step_count >= max_steps:
        # Game is over by max_steps. All agents tie.
        print_info('Game is over by max_steps. All agents tie.', reward - 1)
        return [reward - 1] * 4
    elif len(alive_agents) == 0:
        # Everyone's dead. All agents tie.
        print_info('Everyone is dead. All agents tie.', reward)
        return [reward] * 4
    else:
        # No team has yet won or lost.
        return [reward] * 4
def get_rewards_v3_6(agents, step_count, max_steps, whole_obs_pre, whole_obs,
                     act_abs_pre):
    def any_lst_equal(lst, values):
        """Checks whether lst equals any of the given lists."""
        return any([lst == v for v in values])

    alive_agents = [num for num, agent in enumerate(agents) if agent.is_alive]
    obs_pre = copy.deepcopy(whole_obs_pre[0])
    obs_now = copy.deepcopy(whole_obs[0])
    position_pre = obs_pre['position']
    position_now = obs_now['position']
    bomb_life_now = feature_utils.get_bomb_life(obs_now)
    bomb_life_pre = feature_utils.get_bomb_life(obs_pre)
    my_bomb_life_now = feature_utils.get_my_bomb_life(bomb_life_now, position_now)
    extrabomb = constants.Item.ExtraBomb.value
    kick = constants.Item.Kick.value
    incrrange = constants.Item.IncrRange.value
    wood = constants.Item.Wood.value
    agent1 = constants.Item.Agent1.value
    agent3 = constants.Item.Agent3.value
    e11_now = feature_utils.extra_position(11, obs_now['board'])
    e13_now = feature_utils.extra_position(13, obs_now['board'])
    reward = 0

    # An enemy is killed by a blast (disabled):
    # if e11 is not None and 0 < bomb_life[e11] < 4:
    #     reward += 0.5
    #     print_info('e11 dead', '+0.5')
    # if e13 is not None and 0 < bomb_life[e13] < 4:
    #     reward += 0.5
    #     print_info('e13 dead', '+0.5')

    # The agent is caught in a blast (about to die).
    if 0 < bomb_life_now[position_now] < 4:
        reward -= 0.5
        print_info('You dead', '-0.5')

    act_pre = feature_utils._djikstra_act(obs_pre, act_abs_pre)  # used only for the checks below
    goal_pre = feature_utils.extra_goal(act_abs_pre, obs_pre)
    # Placing a bomb
    if act_pre == 5:
        # Placed a bomb with no ammo.
        if obs_pre['ammo'] == 0:
            reward -= 0.1
            print_info('No ammo', '-0.1')
        # There is ammo.
        else:
            # The bomb's blast covers wood / an enemy.
            for r in range(11):
                for c in range(11):
                    if my_bomb_life_now[(r, c)] > 0:
                        if obs_pre['board'][(r, c)] in [wood]:
                            reward += 0.2
                            print_info('bomb -> wood', '+0.2')
                        if obs_pre['board'][(r, c)] in [agent1, agent3]:
                            reward += 0.3
                            print_info('bomb -> enemy', '+0.3')
    # No movement
    elif act_pre == 0:
        if obs_pre['position'] != goal_pre:
            reward -= 0.01
            print_info('Ineffective move', '-0.01')
    # Movement
    else:
        # A valid move (disabled):
        # reward += 0.001
        # print_info('valid move', '+0.001')
        # In a blast zone but moving toward a safe tile.
        if bomb_life_pre[position_pre] > 0 and bomb_life_pre[goal_pre] == 0:
            reward += 0.05
            print_info('Escape from the explosion range', '+0.05')
        # Moving toward an item.
        if obs_pre['board'][goal_pre] in [extrabomb, kick, incrrange]:
            reward += 0.01
            print_info('Want an Item', '+0.01')
            # Picked the item up.
            if obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
                reward += 0.3
                print_info('Eat an Item', '+0.3')
        # Picked an item up in passing.
        elif obs_pre['board'][position_now] in [extrabomb, kick, incrrange]:
            reward += 0.05
            print_info('Passing by an Item', '+0.05')

    # We are playing a team game.
    if any_lst_equal(alive_agents, [[0, 2], [0], [2]]):
        # Team [0, 2] wins.
        print_info('Team [0, 2] wins and agent0 alive.', reward + 1)
        return [reward + 1, -1, 1, -1]
    elif any_lst_equal(alive_agents, [[1, 3]]):
        # Team [1, 3] wins and no enemy dead.
        print_info('Team [1, 3] wins and no enemy dead.', reward - 1)
        return [reward - 1, 1, -1, 1]
    elif any_lst_equal(alive_agents, [[1], [3]]):
        # Team [1, 3] wins and one enemy dead.
        print_info('Team [1, 3] wins and one enemy dead.', reward + 0.5)
        return [reward + 0.5, 1, -1, 1]
    elif step_count >= max_steps and any_lst_equal(
            alive_agents, [[0, 1], [0, 1, 2], [0, 3], [0, 2, 3]]):
        # Tie and one enemy dead.
        print_info('Tie and one enemy dead.', reward + 0.5)
        return [reward + 0.5, 1, -1, 1]
    elif step_count >= max_steps:
        # Game is over by max_steps. All agents tie.
        print_info('Game is over by max_steps. All agents tie.', reward - 1)
        return [reward - 1] * 4
    elif len(alive_agents) == 0:
        # Everyone's dead. All agents tie.
        print_info('Everyone is dead. All agents tie.', reward + 0.5)
        return [reward + 0.5] * 4
    else:
        # No team has yet won or lost.
        return [reward] * 4
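
# A hedged sketch of how one of these shaped reward functions replaces the
# env's default reward inside a step loop, following the obs bookkeeping seen
# in _worker/_play above. `some_policy` is a placeholder, and `env._agents`,
# `env._step_count`, `env._max_steps` are assumed Pommerman env internals.
whole_obs_pre = env.reset()
done = False
while not done:
    all_actions = env.act(whole_obs_pre)
    act_abs = some_policy(whole_obs_pre[0])   # hypothetical policy call
    all_actions[0] = act_abs
    whole_obs, _, done, info = env.step(all_actions)
    rewards = get_rewards_v3_6(env._agents, env._step_count, env._max_steps,
                               whole_obs_pre, whole_obs, act_abs)
    whole_obs_pre = whole_obs                 # keep the previous observation for shaping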