        if done:
            break
        env.render()
        cnt = cnt + 1
        action = RL.choose_action(observation)  # choose an action
        action_onehot = np.zeros(env.action_space[0].n)
        action_onehot[action] += 1.0
        observation_, reward, done, info = env.step([action_onehot])  # get the next state
        observation_ = observation_[0]
        reward = reward[0]
        done = done[0]
        # store this transition in memory
        RL.store_transition(observation, action, reward, observation_)
        # learn
        RL.learn()
        observation = observation_
        # display rewards
        for agent in env.world.agents:
            if agent.name not in reward_dic:
                reward_dic[agent.name] = []
            reward_dic[agent.name].append(env._get_reward(agent))
            print(agent.name + " reward: %0.3f" % env._get_reward(agent))

# plot each agent's reward curve once the loop finishes
for agent in env.world.agents:
    y = reward_dic[agent.name]
    x = np.linspace(1, len(y), len(y))
    plt.plot(x, y)
plt.show()
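The RL object is not defined in this snippet; its choose_action / store_transition / learn calls match the DQN-style agent interface popularized by Morvan Zhou's reinforcement-learning tutorials. A minimal stand-in with that assumed interface (the class name and internals below are illustrative, not the original author's code) could look like this:

import numpy as np

class RandomAgentStub:
    """Illustrative stand-in for the RL agent above (assumed interface)."""

    def __init__(self, n_actions, n_features):
        self.n_actions = n_actions
        self.n_features = n_features
        self.memory = []  # list of (s, a, r, s_) transitions

    def choose_action(self, observation):
        # a real agent would evaluate its policy network here; this stub acts randomly
        return np.random.randint(self.n_actions)

    def store_transition(self, s, a, r, s_):
        # keep the transition for later use (experience replay in a real agent)
        self.memory.append((s, a, r, s_))

    def learn(self):
        # a real agent would sample from self.memory and update its network here
        pass

Swapping in a real DQN agent only changes the class internals; the training loop above stays the same.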
# create interactive policies for each agent
policies = [InteractivePolicy(env, i) for i in range(env.n)]
# execution loop
obs_n = env.reset()
while True:
    # query for action from each agent's policy
    act_n = []
    observation = obs_n[0]
    length = observation.shape[1]
    for i, policy in enumerate(policies):
        # all but the last policy see a single row of the stacked
        # observation; the last policy receives the full matrix
        if i < length - 1:
            obs = observation[i, :]
            act_n.append(policy.action(obs))
        if i == length - 1:
            act_n.append(policy.action(observation))
    # step environment
    obs_n, reward_n, done_n, _ = env.step(act_n)
    # render all agent views
    env.render()
    # print actions
    print(act_n)
    # get observation/state
    env_obs = obs_n[0]
    print(env_obs)
    # get rewards
    env_reward = []
    for agent in env.world.agents:
        env_reward.append(env._get_reward(agent))
    print(env_reward)
# load scenario from script
scenario = scenarios.load(args.scenario).Scenario()
# create world (an instance of the World class)
world = scenario.make_world()
# create multiagent environment
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                    scenario.observation, info_callback=None,
                    shared_viewer=False)
# render call to create viewer window (necessary only for interactive policies)
env.render()
# create an interactive policy for each agent in the simulation world
policies = [InteractivePolicy(env, i) for i in range(env.n)]
# execution loop
obs_n = env.reset()
while True:
    # query for action from each agent's policy
    act_n = []
    for i, policy in enumerate(policies):
        act_n.append(policy.action(obs_n[i]))
    # step environment
    obs_n, reward_n, done_n, _ = env.step(act_n)
    # render all agent views
    env.render()
    # display rewards
    for agent in env.world.agents:
        print(agent.name + " reward: %0.3f" % env._get_reward(agent))
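This snippet follows bin/interactive.py from openai/multiagent-particle-envs; the imports and argument parsing it relies on (scenarios, args, MultiAgentEnv, InteractivePolicy) are, per the upstream repository, roughly the following:

import argparse

from multiagent.environment import MultiAgentEnv
from multiagent.policy import InteractivePolicy
import multiagent.scenarios as scenarios

# the scenario script (e.g. simple.py) is selected on the command line
parser = argparse.ArgumentParser(description=None)
parser.add_argument('-s', '--scenario', default='simple.py',
                    help='Path of the scenario Python script.')
args = parser.parse_args()

The upstream README invokes the script as bin/interactive.py --scenario simple.py.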
env.render()
# create interactive policies for each agent
policies = [InteractivePolicy(env, i) for i in range(env.n)]
# execution loop
obs_n = env.reset()
rewards = np.zeros(len(env.world.agents))
print('env.discrete_action_space:', env.discrete_action_space)
while True:
    # query for action from each agent's policy
    act_n = []
    for i, policy in enumerate(policies):
        act_n.append(policy.action(obs_n[i]))
    # step environment
    obs_n, reward_n, done_n, _ = env.step(act_n)
    # render all agent views
    env.render()
    # display rewards only when they change noticeably
    new_rewards = np.zeros(len(env.world.agents))
    for i, agent in enumerate(env.world.agents):
        new_rewards[i] = env._get_reward(agent)
    if (np.abs(rewards - new_rewards) > .001).any():
        print(rewards - new_rewards)
        rewards = new_rewards
        for i, r in enumerate(rewards):
            print('agent {} reward: {:.3f}'.format(i, r))