Пример #1
    def pick_action(self, obs, episode_id: int):
        """Choose one action per agent using that agent's own Q-network.

        Each agent's observation is passed through the matching network in
        ``self.q_nets``. The resulting Q-values, a fresh uniform sample from
        ``random.random()`` and the exploration threshold for this episode
        are handed to ``self._pick_agent_action``, which makes the actual
        per-agent choice (arg-max or uniform random, per the original
        comment; its implementation is not shown here).

        Args:
            obs: Per-agent observations; converted with ``obs_to_tens``.
                Presumably ordered like ``self.q_nets`` -- confirm against
                the caller.
            episode_id: Index of the current episode. Combined with
                ``self.max_episodes`` by ``get_eps_threshold``.

        Returns:
            A list with one entry per agent, each being the value returned
            by ``self._pick_agent_action`` for that agent.
        """
        # Q-values per agent. They have to be collected: the selection step
        # below zips over this list, so an empty list would yield no actions.
        obs_tens = obs_to_tens(obs)
        q_vals = [
            q_net.forward(agent_obs)
            for q_net, agent_obs in zip(self.q_nets, obs_tens)
        ]

        # One independent random draw per agent, shared threshold for all.
        samples = [random.random() for _ in range(self.n_agents)]
        eps_threshold = get_eps_threshold(episode_id, self.max_episodes)

        # Either arg-max or random uniform per agent.
        return [
            self._pick_agent_action(sample, eps_threshold, agent_q_vals)
            for sample, agent_q_vals in zip(samples, q_vals)
        ]
Пример #2
    def forward(self, obs):
        """Run every agent's Q-network and combine the outputs with the mixer.

        Args:
            obs: Sequence with exactly ``self.n_agents`` per-agent
                observations (checked by the assertion below). It is
                converted with ``obs_to_tens(obs, self.device)``.

        Returns:
            Whatever ``self.mixer.forward`` returns for the stacked per-agent
            Q-values and the stacked observations.

        Raises:
            AssertionError: If ``len(obs)`` differs from ``self.n_agents``.
        """
        # TODO optimize model and forward

        # Get list of arrays with observation
        assert len(obs) == self.n_agents
        obs_t = obs_to_tens(obs, self.device)

        # Collect the per-agent Q-values so they can be stacked below.
        # NOTE(review): the networks are fed the converted tensors (obs_t)
        # rather than the raw observations -- assumed to be the intent, since
        # obs_t is what gets stacked for the mixer; confirm.
        q_values = [
            q_net(agent_obs)
            for agent_obs, q_net in zip(obs_t, self.q_nets)
        ]

        # Original author's note (unverified): should produce
        # [BS=1, obs_size, n_agents]
        observations = torch.stack(obs_t)
        q_values_stacked = torch.stack(q_values)

        mixed_values = self.mixer.forward(q_values_stacked, observations)
        # TODO not tested!!
        return mixed_values
Пример #3
# Commented out IPython magic to ensure Python compatibility.
# %pdb on

from matplotlib import rc
rc('animation', html='jshtml')

# Run-wide bookkeeping, initialised once before the episode loop starts.
steps_done = 0
all_rewards, all_eps_thresholds, all_losses = [], [], []

# Mechanism object built over the full set of agents (VCG is defined elsewhere).
vcg_mech = VCG(agents)

for i_episode in tqdm(range(MAX_EPISODES)):

    obs = obs_to_tens(env.reset(), device)

    [agent.reset() for agent in agents]
    for agent, init_obs in zip(agents, obs):  # first state
        agent.last_obs = init_obs

    ep_losses = []
    ep_rewards = []
    ep_q_values = []

    for t in count():

        q_values_per_agent = []
        actions = []
        for agent, a_obs in zip(agents, obs):
            q_values = agent.q_values(a_obs)