class MADDPGAgent():
    def __init__(self, seed, checkpoint_filename=None):

        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, DEVICE, seed)
        self.t = 0

        self.agents = [
            DDPGAgent(index, NUM_AGENTS, seed, DEVICE)
            for index in range(NUM_AGENTS)
        ]

        if checkpoint_filename:
            for i, to_load in enumerate(self.agents):
                f"{os.getcwd()}/models/{checkpoint_filename}_actor_{i}.weights"
                actor_file = torch.load(
                    f"{os.getcwd()}/models/{checkpoint_filename}_actor_{i}.weights",
                    map_location=DEVICE)
                critic_file = torch.load(
                    f"{os.getcwd()}/models/{checkpoint_filename}_critic_{i}.weights",
                    map_location=DEVICE)
                to_load.actor_local.load_state_dict(actor_file)
                to_load.actor_target.load_state_dict(actor_file)
                to_load.critic_local.load_state_dict(critic_file)
                to_load.critic_target.load_state_dict(critic_file)
            print(f'Files loaded with prefix {checkpoint_filename}')

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)
        self.t = (self.t + 1) % UPDATE_FREQUENCY
        if self.t == 0 and (len(self.memory) > BATCH_SIZE):
            experiences = [self.memory.sample() for _ in range(NUM_AGENTS)]
            self.learn(experiences, GAMMA)

    def act(self, all_states, random):
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, random=random)
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        all_actions = []
        all_next_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(DEVICE)
            state = states.reshape(-1, 2, 24).index_select(1,
                                                           agent_id).squeeze(1)
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            all_actions.append(agent.actor_local(state))
            all_next_actions.append(agent.actor_target(next_state))
        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)
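Every snippet on this page relies on an externally defined ReplayBuffer whose constructor arguments vary from example to example. As a rough reference only, a minimal uniform-sampling buffer exposing the add/sample/__len__ interface assumed here could look like the sketch below; the class name, field layout, and tensor conversion are assumptions rather than code from any of the snippets.

import random
from collections import deque, namedtuple

import numpy as np
import torch


class UniformReplayBufferSketch:
    """Illustrative fixed-size buffer with uniform random sampling."""
    def __init__(self, buffer_size, batch_size, device, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)

        def to_tensor(values):
            return torch.from_numpy(np.vstack(values)).float().to(self.device)

        states = to_tensor([e.state for e in batch])
        actions = to_tensor([e.action for e in batch])
        rewards = to_tensor([e.reward for e in batch])
        next_states = to_tensor([e.next_state for e in batch])
        dones = to_tensor([float(e.done) for e in batch])
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)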
Example #2
class MADDPG():
    def __init__(self, num_agents, state_size, action_size, random_seed):
        """ Initialize multiple Agents each with a Actor-Critic network
            but they share the replay buffer to learn from experience
        """
        self.num_agents = num_agents
        self.agents = []
        for _ in range(num_agents):
            agent = Agent(state_size, action_size, random_seed)
            self.agents.append(agent)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def act(self, states, add_noise=True):
        clipped_actions = []
        for state, agent in zip(states, self.agents):
            clipped_actions.append(agent.act(state, add_noise))
        return clipped_actions

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def learn(self, experiences, gamma):
        for agent in self.agents:
            agent.learn(experiences, gamma)

    def saveCheckPoints(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       f"checkpoints/actor_agent_{i}.pth")
            torch.save(agent.critic_local.state_dict(),
                       f"checkpoints/critic_agent_{i}.pth")

    def loadCheckPoints(self):
        for i, agent in enumerate(self.agents):
            agent.actor_local.load_state_dict(
                torch.load(f"checkpoints/actor_agent_{i}.pth"))
            agent.critic_local.load_state_dict(
                torch.load(f"checkpoints/critic_agent_{i}.pth"))

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        # Save experience / reward
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for agent in self.agents:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
Example #3
class MADDPG:
    def __init__(self, num_agents=2, random_seed=1):  #np.random.randint(1000)
        super(MADDPG, self).__init__()

        self.maddpg_agent = [
            DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed),
            DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed)
        ]

        self.num_agents = num_agents

        # Replay memory
        action_size = 2
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def act(self, obs_all_agents, noise_ampl=1):
        """get actions from all agents in the MADDPG object"""
        actions = [
            agent.act(obs, noise_ampl)
            for agent, obs in zip(self.maddpg_agent, obs_all_agents)
        ]
        return actions

    def add_memory(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.num_agents = self.num_agents
        self.memory.add(state, action, reward, next_state, done)

    def step(self):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:

            for n in range(0, self.num_agents):

                experiences = self.memory.sample()

                self.maddpg_agent[n].step(experiences)

    def reset(self):
        for n in range(0, self.num_agents):
            self.maddpg_agent[n].reset()
Example #4
class MultiAgent:
    """Interacts with and learns from the environment."""
    def __init__(self, agent_count, state_size, action_size, random_seed):
        """Initialize a MultiAgent object.

        Params
        ======
            agent_count (int): Number of agents
        """

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        self.agents = [
            Agent(
                memory=self.memory,
                state_size=state_size,
                action_size=action_size,
                random_seed=random_seed,
            ) for _ in range(agent_count)
        ]

    def step(self, states, actions, rewards, next_states, dones, timestep):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % UPDATE_EVERY == 0:
            for agent in self.agents:
                agent.learn(self.memory.sample(), GAMMA)

    def act(self, all_states):
        """Get actions from all agents"""
        actions = [
            agent.act(np.expand_dims(states, axis=0))
            for agent, states in zip(self.agents, all_states)
        ]
        return actions

    def reset(self):
        for agent in self.agents:
            agent.reset()
Example #5
    def update(self,
               buffer: ReplayBuffer,
               batchsize: int = 1000,
               tau: float = 0.005,
               discount: float = 0.98):

        states, actions, rewards, states_next, dones = buffer.sample(
            batchsize=batchsize)

        actions_next = self.target_actor(torch.stack(states_next).float())
        input_target_critic = torch.cat(
            [torch.stack(states_next).float(),
             actions_next.float()], axis=1)
        rewards_t = torch.tensor(rewards).float().unsqueeze(1)
        dones_t = torch.tensor(dones).float().unsqueeze(1)
        next_value = self.target_critic(input_target_critic)
        # TD target: r + discount * (1 - done) * Q_target(s', a')
        state_value = (rewards_t + discount * (1 - dones_t) * next_value).detach()

        input_critic = torch.cat(
            [torch.stack(states).float(),
             torch.stack(actions).float()],
            axis=1)
        state_value_local = self.critic(input_critic)

        critic_loss = (state_value -
                       state_value_local).pow(2).mul(0.5).sum(-1).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update actor
        actions_new = self.actor(torch.stack(states).float())
        value_critic = self.critic(
            torch.cat([torch.stack(states).float(), actions_new], axis=1))
        loss_actor = -value_critic.mean()

        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()
        soft_update(self.target_actor, self.actor, tau)
        soft_update(self.target_critic, self.critic, tau)
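soft_update is called here (and again in Example #10) but is not defined in either snippet. A conventional Polyak-averaging implementation matching the soft_update(target, source, tau) call signature would be the following; this is an assumption about what the missing helper does, not code from the original.

def soft_update(target_net, source_net, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for target_param, source_param in zip(target_net.parameters(),
                                          source_net.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)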
Example #6
class MultiAgent:
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.agents = [
            DDPGAgent(state_size, action_size, random_seed)
            for _ in range(num_agents)
        ]
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device, random_seed)
        self.t_step = 0

    def step_all(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                for agent in self.agents:
                    experiences = self.memory.sample()
                    agent.learn(experiences, GAMMA)

    def act_all(self, multi_states):
        actions = [
            agent.act(np.expand_dims(states, axis=0))
            for agent, states in zip(self.agents, multi_states)
        ]
        return actions

    def save_weights_all(self):
        for index, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       'agent{}_checkpoint_actor.pth'.format(index + 1))
            torch.save(agent.critic_local.state_dict(),
                       'agent{}_checkpoint_critic.pth'.format(index + 1))

    def reset_all(self):
        for agent in self.agents:
            agent.reset()
Example #7
class MultiAgent:
    def __init__(self, config):

        self.random_seeds = config['random_seeds']
        self.params = config['params']
        self.memory = ReplayBuffer(self.params['action_size'],
                                   self.params['buffer_size'],
                                   self.params['batch_size'], device,
                                   self.random_seeds[0])
        self.params['memory'] = self.memory

        self.ddpg_agents = [
            Agent(self.params, self.random_seeds[i]) for i in range(2)
        ]

        self.t_step = 0

    def act(self, states):
        actions = [
            agent.act(np.expand_dims(state, axis=0))
            for agent, state in zip(self.ddpg_agents, states)
        ]
        #actions = [agent.act(states) for agent in self.ddpg_agents]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        self.t_step += 1

        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        if (len(self.memory) > self.params['batch_size']) and (
                self.t_step % self.params['num_steps_per_update'] == 0):
            for agent in self.ddpg_agents:
                experiences = self.memory.sample()
                agent.learn(experiences, self.params['gamma'])

    def reset(self):
        for agent in self.ddpg_agents:
            agent.reset()
Example #8
class DQNAgent:
    def __init__(self, env, state_size, action_size, batch_size, gamma, lr,
                 update_every, tau, eps_start, eps_end, eps_decay, seed):

        for key, value in locals().items():
            if key != 'self':
                setattr(self, key, value)

        random.seed(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.Q_target = LinearModel(state_size, action_size)
        self.Q_local = LinearModel(state_size, action_size)

        self.memory = ReplayBuffer(batch_size=batch_size)
        self.optim = torch.optim.Adam(self.Q_local.parameters(), lr=lr)

        self.update_counter = 0

    def env_reset(self, train_mode=True):
        return self.env.reset()

    def env_step(self, action):
        return self.env.step(action)

    def env_render(self, train_mode=False):
        return self.env.render()

    def env_close(self, train_mode=True):
        if not train_mode:
            return self.env.close()

    def get_action(self, state, epsilon=0.):
        if random.random() < epsilon:
            return np.random.choice(np.arange(self.action_size))

        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        self.Q_local.eval()
        with torch.no_grad():
            action = np.argmax(self.Q_local(state).data.numpy())
        return action

    def step(self, state, action, reward, next_state, done):
        self.memory.store(
            (state, action, reward, next_state, 1 if done else 0))

        self.update_counter = (self.update_counter + 1) % self.update_every
        if self.update_counter == 0:
            self.update_Q()

    def update_Q(self):
        states, actions, rewards, next_states, dones = self.memory.sample()

        Q_target_next = self.Q_target(next_states).detach().max(
            dim=1, keepdim=True)[0]
        Q_target_pred = rewards + self.gamma * Q_target_next * (1.0 - dones)
        self.Q_local.eval()
        Q = self.Q_local(states).gather(1, actions)

        loss = F.mse_loss(Q, Q_target_pred)
        self.Q_local.train()
        self.Q_local.zero_grad()
        loss.backward()
        self.optim.step()

        for t_param, l_param in zip(self.Q_target.parameters(),
                                    self.Q_local.parameters()):
            t_param.data.copy_(self.tau * l_param.data +
                               (1.0 - self.tau) * t_param.data)

    def train(self, num_episodes, max_t=1000, is_finished=None, render=False):
        scores = []
        eps = self.eps_start

        for i in range(num_episodes):
            state = self.env_reset(train_mode=True)
            score = 0
            for _ in range(max_t):
                action = self.get_action(state, eps)
                if render: self.env_render(train_mode=True)
                next_state, reward, done, _ = self.env_step(action)
                self.step(state, action, reward, next_state, done)
                score += reward
                state = next_state
                if done: break

            eps = max(self.eps_end, eps * self.eps_decay)
            scores.append(score)
            if is_finished and is_finished(scores, num_episodes):
                break
        if render: self.env_close(train_mode=False)
        return scores

    def run(self, num_episodes=1, max_t=1000, render=None):
        if render is None: render = num_episodes == 1
        scores = []
        for i in range(num_episodes):
            state = self.env_reset(train_mode=False)
            score = 0
            for _ in range(max_t):
                action = self.get_action(state)
                if render: self.env_render(train_mode=False)
                next_state, reward, done, _ = self.env_step(action)
                score += reward
                state = next_state
                if done: break

            scores.append(score)
            if render: self.env_close(train_mode=False)
        return scores
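A hypothetical way to drive this DQNAgent on a classic-control task, assuming LinearModel and a ReplayBuffer whose sample() returns batched tensors are available, and an environment following the pre-0.26 Gym API (reset() returns a state, step() returns a 4-tuple):

import gym
import numpy as np

env = gym.make("CartPole-v1")
agent = DQNAgent(env, state_size=4, action_size=2, batch_size=64, gamma=0.99,
                 lr=5e-4, update_every=4, tau=1e-3, eps_start=1.0,
                 eps_end=0.01, eps_decay=0.995, seed=0)
# stop once the rolling 100-episode average reaches the solve threshold
scores = agent.train(num_episodes=500,
                     is_finished=lambda s, _: np.mean(s[-100:]) >= 195.0)
print(f"trained for {len(scores)} episodes, final score {scores[-1]:.1f}")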
Example #9
class DDQNAgent:
    def __init__(self, config: Config, training=True):
        self.config = config
        self.is_training = training
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model = DQN(self.config.state_shape,
                                self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())

        self.optim = Adam(self.model.parameters(),
                          lr=self.config.learning_rate)

        self.model.cuda()
        self.target_model.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learn(self, t):
        s, a, r, s2, done = self.buffer.sample(self.config.batch_size)

        s = torch.tensor(s, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        s2 = torch.tensor(s2, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        s = s.cuda()
        a = a.cuda()
        r = r.cuda()
        s2 = s2.cuda()
        done = done.cuda()

        q_values = self.model(s).cuda()
        next_q_values = self.model(s2).cuda()
        next_q_state_values = self.target_model(s2).cuda()

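        # Double DQN: the online network (self.model) selects the argmax action
        # for s2, and the target network evaluates that action's value below.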
        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(
            1,
            next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)

        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        if t % self.config.update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_checkpoint(self):
        os.makedirs('ckpt', exist_ok=True)
        torch.save(self.model.state_dict(), 'ckpt/model.pt')

    def load_checkpoint(self):
        state_dict = torch.load('ckpt/model.pt')
        self.model.load_state_dict(state_dict)
        self.target_model.load_state_dict(state_dict)
Example #10
class MADDPG_Trainer:
    def __init__(self, n_agents, act_spcs, ob_spcs, writer, args):
        self.args = args
        self.memory = ReplayBuffer(args.buffer_length, n_agents, device)
        self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS,
                                                FINAL_STD,
                                                INITIAL_STD,
                                                warmup_steps=WARMUP_STEPS)
        self.n_agents = n_agents
        self.act_spcs = act_spcs
        self.ob_spcs = ob_spcs
        self.agents = [
            DDPG_agent(self.act_spcs[i], self.ob_spcs[i], np.sum(self.ob_spcs),
                       np.sum(self.act_spcs)) for i in range(n_agents)
        ]

        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer
        self.criterion = nn.MSELoss()

    def get_actions(self, states):
        return [
            agent.select_action(state)[0]
            for agent, state in zip(self.agents, states)
        ]

    def store_transitions(self, states, actions, rewards, next_states, dones):
        self.memory.add(states, actions, rewards, next_states, dones)

    def reset(self):
        pass

    def transform_states(self, states, N):
        obses = []
        for i in range(N):
            states_ = []
            for j in range(self.n_agents):
                states_.append(states[j][i])
            obses.append(torch.cat([f.float().to(device) for f in states_]))
        return torch.stack(obses)

    def transform_actions(self, actions, N):
        acts = []
        for i in range(N):
            actions_ = []
            for j in range(self.n_agents):
                actions_.append(actions[j][i])
            acts.append(torch.cat([f.float().to(device) for f in actions_]))
        return torch.stack(acts)

    def update_all_targets(self):
        for agent in self.agents:
            soft_update(agent.policy_targ, agent.policy, TAU)
            soft_update(agent.qnet_targ, agent.qnet, TAU)

    def prep_training(self):
        for agent in self.agents:
            agent.qnet.train()
            agent.policy.train()
            agent.qnet_targ.train()
            agent.policy_targ.train()

    def eval(self):
        for agent in self.agents:
            agent.qnet.eval()
            agent.policy.eval()
            agent.qnet_targ.eval()
            agent.policy_targ.eval()

    def sample_and_train(self, batch_size):
        # TODO ADD Model saving, optimize code
        batch = self.memory.sample(min(batch_size, len(self.memory)))

        states_i, actions_i, rewards_i, next_states_i, dones_i = batch

        states_all = torch.cat(states_i, 1)
        next_states_all = torch.cat(next_states_i, 1)
        actions_all = torch.cat(actions_i, 1)

        for i, agent in enumerate(self.agents):
            next_actions_all = [
                onehot_from_logits(ag.policy_targ(next_state))
                for ag, next_state in zip(self.agents, next_states_i)
            ]
            # computing target
            total_obs = torch.cat(
                [next_states_all,
                 torch.cat(next_actions_all, 1)], 1)
            target_q = self.agents[i].qnet_targ(total_obs).detach()
            rewards = rewards_i[i].view(-1, 1)
            dones = dones_i[i].view(-1, 1)
            target_q = rewards + (1 - dones) * GAMMA * target_q

            # computing the inputs
            input_q = self.agents[i].qnet(
                torch.cat([states_all, actions_all], 1))
            self.agents[i].q_optimizer.zero_grad()
            loss = self.criterion(input_q, target_q.detach())
            # print("LOSS", loss)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.agents[i].qnet.parameters(),
                                           0.5)
            self.agents[i].q_optimizer.step()
            actor_loss = 0
            # ACTOR gradient ascent of Q(s, π(s | ø)) with respect to ø

            # use the Gumbel-Softmax straight-through trick so the sampled action stays differentiable
            policy_out = self.agents[i].policy(states_i[i])
            gumbel_sample = gumbel_softmax(policy_out, hard=True)

            actions_curr_pols = [
                onehot_from_logits(agent_.policy(state))
                for agent_, state in zip(self.agents, states_i)
            ]

            for action_batch in actions_curr_pols:
                action_batch.detach_()
            actions_curr_pols[i] = gumbel_sample

            actor_loss = -self.agents[i].qnet(
                torch.cat(
                    [states_all.detach(),
                     torch.cat(actions_curr_pols, 1)], 1)).mean()
            actor_loss += (policy_out**2).mean() * 1e-3

            self.agents[i].p_optimizer.zero_grad()
            actor_loss.backward()
            # nn.utils.clip_grad_norm_(self.policy.parameters(), 5)
            torch.nn.utils.clip_grad_norm_(self.agents[i].policy.parameters(),
                                           0.5)
            self.agents[i].p_optimizer.step()
            # detach the forward propagated action samples
            actions_i[i].detach_()

            if self.args.use_writer:
                self.writer.add_scalars("Agent_%i" % i, {
                    "vf_loss": loss,
                    "actor_loss": actor_loss
                }, self.n_updates)

        self.update_all_targets()
        self.n_updates += 1
Example #11
class MADDPG():
    """Agent that contains the two DDPG agents and shared replay buffer."""
    def __init__(self, action_size=2, n_agents=2, seed=0):
        """
        Params
        ======
            action_size (int): dimension of each action
            seed (int): Random seed
            n_agents (int): number of agents
        """

        self.n_agents = n_agents
        self.t_step = 0
        self.noise_on = True

        # create two agents, each with their own actor and critic
        models = [
            model.Actor_Critic_Models(n_agents=n_agents)
            for _ in range(n_agents)
        ]
        self.agents = [DDPG(i, models[i]) for i in range(n_agents)]

        # create shared replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)

        self.t_step = self.t_step + 1
        if self.t_step % UPDATE_EVERY == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = [
                    self.memory.sample() for _ in range(self.n_agents)
                ]
                self.learn(experiences, GAMMA)

    def act(self, all_states, add_noise=True):
        # pass each agent's state from the environment and calculate its action
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, add_noise=self.noise_on)
            #self.noise_weight *= noise_decay
            all_actions.append(action)
        return np.array(all_actions).reshape(
            1, -1)  # reshape 2x2 into 1x4 dim vector

    def learn(self, experiences, gamma):
        all_next_actions = []
        all_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            # extract agent i's state and get action via actor network
            state = states.reshape(-1, 2, 24).index_select(1,
                                                           agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)
            # extract agent i's next state and get action via target actor network
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)

    def save_agents(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       f"checkpoint_actor_agent_{i}.pth")
            torch.save(agent.critic_local.state_dict(),
                       f"checkpoint_critic_agent_{i}.pth")
Example #12
        noise = noise if hard_noise_reigime else noise * NOISE_DECAY

        # END EPISODE IF ANY AGENT IS DONE
        if any(dones):
            break

    if episode_i > HARD_NOISE_STEPS:
        hard_noise_reigime = False

    # POTENTIALLY START TAKING SAMPLES TO TRAIN FROM EXPERIENCE BUFFER
    if len(buffer) > MIN_BUFFER_SIZE:
        update_flag = "u"
        for _ in range(N_BATCHES_PER_UPDATE):
            for agent_i in range(N_AGENTS):
                # samples = buffer.sample(3)
                samples = buffer.sample(BATCH_SIZE)
                maddpg.update(samples, agent_i)
                if UPDATE_TARGET_AFTER_EACH_BATCH:
                    maddpg.update_targets()
            if not UPDATE_TARGET_AFTER_EACH_BATCH:
                maddpg.update_targets()
    else:
        update_flag = " "

    # UPDATE EPISODE AND ROLLING MEAN SCORES
    agg_reward_this_episode = np.max(rewards_this_episode)
    rewards_deque.append(agg_reward_this_episode)
    rolling_mean_reward = np.mean(rewards_deque)

    history.append(agg_reward_this_episode)
    history_rolling_mean.append(rolling_mean_reward)
Example #13
class MADDPG(object):
    """
    The main class that defines and trains all the DDPG agents.
    """
    def __init__(
        self,
        num_agents,
        state_size,
        action_size,
        buffer_size=int(1e6),
        batch_size=128,
        writer=None,
        actor_hidden_sizes=(256, 128),
        actor_lr=1e-4,
        actor_weight_decay=0.,
        critic_hidden_sizes=(256, 128),
        critic_lr=1e-3,
        critic_weight_decay=0.,
        model_folder_path=None,
    ):
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size

        self.full_state_size = num_agents * state_size
        self.full_action_size = num_agents * action_size

        # Replay memory
        self.memory = ReplayBuffer(buffer_size)

        # TensorboardX Writer
        self.writer = writer

        # Actor Network Parameters
        self.actor_hidden_sizes = actor_hidden_sizes
        self.actor_lr = actor_lr
        self.actor_weight_decay = actor_weight_decay

        # Critic Network Parameters
        self.critic_hidden_sizes = critic_hidden_sizes
        self.critic_lr = critic_lr
        self.critic_weight_decay = critic_weight_decay

        # Model Folder
        self.folder_path = Path() if model_folder_path is None else Path(
            model_folder_path)

        # MADDPG Agents
        self.agents = []
        self._init_agents()

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise=0.):
        return [
            agent.act(obs, noise) for agent, obs in zip(self.agents, states)
        ]

    def step(self,
             i_episode,
             states,
             actions,
             rewards,
             next_states,
             dones,
             tau=0.01,
             num_learns=1):

        # save to replay buffer
        self.memory.add(states, actions, rewards, next_states, dones)

        # train the model
        if len(self.memory) >= self.batch_size and num_learns > 0:
            actor_loss_list, critic_loss_list = [], []

            for _ in range(num_learns):  # learn multiple times at every step
                states, actions, rewards, next_states, dones = self.memory.sample(
                    self.batch_size)

                for agent_id in range(self.num_agents):
                    # Learn one time for the agents
                    actor_loss, critic_loss = self._learn(
                        agent_id, states, actions, next_states, rewards, dones)

                    actor_loss_list.append(actor_loss)
                    critic_loss_list.append(critic_loss)

            # Record Losses for actor & critic
            if self.writer:
                for agent_id in range(self.num_agents):
                    self.writer.add_scalars(
                        f'agent{agent_id}/losses', {
                            'critic loss': np.mean(critic_loss_list),
                            'actor_loss': np.mean(actor_loss_list)
                        }, i_episode)

            # Soft update
            self._update_all(tau)

    def save(self):
        for agent in self.agents:
            torch.save(
                agent.actor_local.state_dict(),
                self.folder_path / f'checkpoint_actor_local_{agent.id}.pth')
            torch.save(
                agent.critic_local.state_dict(),
                self.folder_path / f'checkpoint_critic_local_{agent.id}.pth')

    def load(self, agent_id=None):
        for agent in self.agents:
            agent_id_ = agent.id if agent_id is None else agent_id
            agent.actor_local.load_state_dict(
                torch.load(self.folder_path /
                           f'checkpoint_actor_local_{agent_id_}.pth'))
            agent.critic_local.load_state_dict(
                torch.load(self.folder_path /
                           f'checkpoint_critic_local_{agent_id_}.pth'))

    def _init_agents(self):
        for i in range(self.num_agents):
            agent = DDPG(i, self.state_size, self.full_state_size,
                         self.action_size, self.full_action_size,
                         self.actor_hidden_sizes, self.actor_lr,
                         self.actor_weight_decay, self.critic_hidden_sizes,
                         self.critic_lr, self.critic_weight_decay)
            self.agents.append(agent)

    def _learn(self, agent_id, states, actions, next_states, rewards, dones):

        critic_full_actions, critic_full_next_actions = [], []
        for agent in self.agents:
            # current actions
            actor_actions = agent.actor_local(states[:, agent.id, :])
            critic_full_actions.append(actor_actions)

            # next actions
            actor_next_actions = agent.actor_target.forward(
                next_states[:, agent.id, :])
            critic_full_next_actions.append(actor_next_actions)

        # learn for the agent
        current_agent = self.agents[agent_id]
        actor_loss, critic_loss = current_agent.learn(
            states, actions, rewards, next_states, dones, critic_full_actions,
            critic_full_next_actions)
        return actor_loss, critic_loss

    def _update_all(self, tau):
        for agent in self.agents:
            agent.update(agent.actor_local, agent.actor_target, tau)
            agent.update(agent.critic_local, agent.critic_target, tau)
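A rough usage sketch for this MADDPG class, assuming a Tennis-style two-agent environment wrapper env whose reset() returns the stacked per-agent observations and whose step() returns (next_states, rewards, dones); the wrapper and the hyperparameters are hypothetical, not part of the original.

import numpy as np

# env = make_tennis_env()  # hypothetical two-agent wrapper, not shown
num_agents, state_size, action_size = 2, 24, 2
maddpg = MADDPG(num_agents, state_size, action_size, batch_size=128)

for i_episode in range(1, 2001):
    states = env.reset()                 # assumed shape: (num_agents, state_size)
    maddpg.reset()
    scores = np.zeros(num_agents)
    while True:
        actions = maddpg.act(states, noise=0.1)
        next_states, rewards, dones = env.step(actions)   # assumed wrapper API
        maddpg.step(i_episode, states, actions, rewards, next_states, dones,
                    tau=0.01, num_learns=1)
        scores += rewards
        states = next_states
        if np.any(dones):
            break
    print(f"episode {i_episode:4d}  max score {np.max(scores):.3f}")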
Example #14
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    for episode in range(0, number_of_episodes + parallel_envs, parallel_envs):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if (len(buffer) > batchsize
                and episode % episode_per_update < parallel_envs):
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target networks towards the local networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
Example #15
class SoftActorCriticAgent():
    def __init__(self):        
        torch.autograd.set_detect_anomaly(True)
        self.conv_net = ConvNetwork()
        self.critic_v = StateValueNetwork()
        self.critic_v_target = StateValueNetwork()
        self.critic_q_1 = ActionValueNetwork()
        self.critic_q_2 = ActionValueNetwork()
        self.actor = PolicyNetwork()
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=3e-3)  # 0.003
        self.v_optim = optim.Adam(self.critic_v.parameters(), lr=0.003)
        self.q1_optim = optim.Adam(self.critic_q_1.parameters(), lr=0.003)
        self.q2_optim = optim.Adam(self.critic_q_2.parameters(), lr=0.003)        
        self.gamma = 0.99
        self.tau = 0.005
        self.batch_size = 16 #256
        self.reward_scale = 10
        self.replay_buffer = ReplayBuffer(self.batch_size)
        self.update_target(1)

    def select_actions(self, state):    
        self.actor.eval()
        self.conv_net.eval()
        with torch.no_grad():    
            state = self.conv_net(state.unsqueeze(0))
            mean, log_variance = self.actor.forward(state)
            variance = log_variance.exp()
            gaussian = Normal(mean, variance)        
            z = gaussian.sample()
            actions = torch.tanh(z)
            actions = actions.cpu().detach().squeeze(0)
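            # split the 11-dim squashed output into four discrete sub-actions
            # by taking the argmax of a softmax over each slice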
            dim1 = actions[0:3]
            dim1_p = F.softmax(dim1, 0)
            action1 = torch.argmax(dim1_p)
            dim2 = actions[3:6]
            dim2_p = F.softmax(dim2, 0)
            action2 = torch.argmax(dim2_p)
            dim3 = actions[6:8]
            dim3_p = F.softmax(dim3, 0)
            action3 = torch.argmax(dim3_p)
            dim4 = actions[8:11]
            dim4_p = F.softmax(dim4, 0)
            action4 = torch.argmax(dim4_p)
            actions_env_format = [action1.item(), action2.item(), action3.item(), action4.item()]
        self.actor.train()
        self.conv_net.train()
        return actions, numpy.array(actions_env_format)

    def train(self):   
        if(len(self.replay_buffer.replay_buffer) < self.batch_size):  
            return  
        states, actions, rewards, next_states, dones = self.replay_buffer.sample()

        states = self.conv_net(states).detach()
        next_states = self.conv_net(next_states).detach()

        current_q_1 = self.critic_q_1(states, actions)
        current_q_2 = self.critic_q_2(states, actions)
        current_critic_v = self.critic_v(states)
        mean, variance, z, log_pi = self.actor.sample(states)
        policy_actions = torch.tanh(z)

        # soft Q target: r(s_t, a_t) + gamma * E_{s_{t+1} ~ p}[ V_target(s_{t+1}) ]
        target_q = rewards * self.reward_scale + (self.gamma * self.critic_v_target(next_states) * (1-dones)) 
        q1_loss = F.mse_loss(current_q_1, target_q.detach()) 
        q2_loss = F.mse_loss(current_q_2, target_q.detach())
        self.q1_optim.zero_grad()
        q1_loss.backward()
        self.q1_optim.step()
        self.q2_optim.zero_grad()
        q2_loss.backward()
        self.q2_optim.step()

        q1 = self.critic_q_1(states, policy_actions)
        q2 = self.critic_q_2(states, policy_actions)
        predicted_new_q = torch.min(q1, q2)

        # V target: E_{a_t ~ pi}[ Q(s_t, a_t) - log pi(a_t | s_t) ]
        target_critic_v = predicted_new_q - log_pi
        critic_loss = F.mse_loss(current_critic_v, target_critic_v.detach())
        self.v_optim.zero_grad()
        critic_loss.backward()
        self.v_optim.step()

        actor_loss = (log_pi - predicted_new_q).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()
        self.update_target(self.tau)

    def update_target(self, tau):
        for target_param, param in zip(self.critic_v_target.parameters(), self.critic_v.parameters()):
            target_param.data.copy_(tau * param.data + (1-tau) * target_param.data)
Example #16
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 10000
    episode_length = 100
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 5000
    # total number of environment steps taken across all parallel envs
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    # this may be a list of all environments
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    # this creates a list of models, each element in the list refers to an agent in the simulation
    # [agent_one_ddpg, agent_two_ddpg, ...]
    # agent_one_ddpg contains the agent actor and critic models,e.g., agent_one_ddpg.actor, agent_one_ddpg.critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    # notice we jump forward by number of parallel environments
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)

        # one reward per agent per parallel environment: shape (parallel_envs, 3)
        reward_this_episode = np.zeros((parallel_envs, 3))
        # obs holds the observations of all three agents in the 4 parallel envs;
        # for the Physical Deception environment with three agents it has dimension 4x3x14.
        # obs_full is the world state irrespective of the agents and has dimension 4x14.
        # all_obs = array(4 environments, 2 elements):
        #   element 0: a list of 3 arrays, one state per agent, each of size 14
        #   element 1: the global state from the perspective of the target/green agent, 14 elements
        all_obs = env.reset()
        # obs: a list with one element per environment; each element is a list of 3 arrays,
        # each array being the state of one agent in that environment.
        # obs_full: the god's-eye view of each environment; a list with one element per
        # environment, each an array of 14 values giving that environment's global state.
        obs, obs_full = transpose_list(all_obs)

        #for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # we finish the episode before sampling the buffer for training
            # t jumps forward by a multiple of the number of parallel environments
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # the action input needs to be transposed:
            # transpose_to_tensor(obs) reorganises the data to each agent's point of view.
            # Since we have 4 environments there are 4 copies each of agent 1, agent 2 and agent 3;
            # each agent has a state in every environment, so agent 1's states across the 4 environments form a 4x14 tensor.
            # transpose_to_tensor(obs) is therefore a list of 3 elements, one per agent;
            # each element is a 4x14 array of that agent's observations across the 4 environments.
            # maddpg.act loops over these elements, passing each one to the corresponding agent's
            # actor model to generate an action for that agent.
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction
            # there are 4 actions per agent and 3 agents, 12 in total; each action has 2 elements (force in x and y).
            # actions_array is a tensor of shape (3 agents, 4 envs, 2 actions)
            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            # the shape of actions_for_env is (4 env, 3 agent, 2 action)
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            # obs holds the observations of all three agents in the 4 parallel envs;
            # for the Physical Deception environment with three agents it has dimension 4x3x14.
            # obs_full is the world state irrespective of the agents and has dimension 4x14.
            # To gain more understanding, please see the code in the multiagent folder.
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if (len(buffer) > batchsize
                and episode % episode_per_update < parallel_envs):
            for a_i in range(3):
                # although samples are drawn randomly, each sample contains all 3 agents' data,
                # and we know which rewards and actions belong to which agent.
                # samples is a list of 7 elements: obs, obs_full, action, reward, next_obs, next_obs_full, done;
                # each element, e.g. samples[0], is a list of 3 elements, one per agent,
                # and each agent's entry holds the corresponding value (for obs, a vector of 14 values).
                # So asking for 2 samples returns 2 transitions, each containing all 3 agents' states and rewards.
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target networks towards the local networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        #saving model
        save_dict_list = []
        if save_info:
            for i in range(3):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
Example #17
class MADDPG:
    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 hidden_layers,
                 seed,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 weight_decay=WEIGHT_DECAY,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE):
        """Initialize MADDPG agent."""
        super(MADDPG, self).__init__()

        self.seed = random.seed(seed)

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.buffer_size = buffer_size
        self.batch_size = batch_size

        self.agents = [DDPGAgent(state_size, action_size, hidden_layers, gamma, \
                                 tau, lr_actor, lr_critic, weight_decay, seed) \
                                     for _ in range(num_agents)]

        self.replay_buffer = ReplayBuffer(num_agents, buffer_size, batch_size)

    def act(self, states):
        actions = np.zeros([self.num_agents, self.action_size])
        for index, agent in enumerate(self.agents):
            actions[index, :] = agent.act(states[index])
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        """One step for MADDPG agent, include store the current transition and update parameters."""
        self.replay_buffer.add(states, actions, rewards, next_states, dones)

        if len(self.replay_buffer) > self.batch_size:
            '''
            experiences = self.replay_buffer.sample()
            states_list, _, _, _, _ = experiences
            next_actions_list = [self.agents[idx].target_actor(states).detach() \
                for idx, states in enumerate(states_list)]
            for i in range(self.num_agents):
                self.agents[i].step_learn(experiences, next_actions_list, i)
            '''
            for agent in self.agents:
                experiences = self.replay_buffer.sample()
                agent.step_learn(experiences)

    def save_weights(self):
        for index, agent in enumerate(self.agents):
            torch.save(
                agent.critic.state_dict(),
                'agent{}_critic_trained_with_DDPG.pth'.format(index + 1))
            torch.save(agent.actor.state_dict(),
                       'agent{}_actor_trained_with_DDPG.pth'.format(index + 1))

    def reset(self):
        for agent in self.agents:
            agent.reset()
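Only saving is implemented above; a matching loader could be added to the class along the lines of the sketch below, using the same filenames as save_weights and assuming each agent keeps its networks in agent.actor and agent.critic as used above (an illustrative addition, not part of the original).

    def load_weights(self):
        for index, agent in enumerate(self.agents):
            agent.critic.load_state_dict(
                torch.load(
                    'agent{}_critic_trained_with_DDPG.pth'.format(index + 1)))
            agent.actor.load_state_dict(
                torch.load(
                    'agent{}_actor_trained_with_DDPG.pth'.format(index + 1)))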
Example #18
        replay_buffer.add(obs[0], action, rew, new_obs[0], float(done))

        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            episode_end = t
            duration.append(episode_end - episode_start)
            episode_start = t
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(0.0)
            reset = True

        if t > learning_starts and t % train_freq == 0:
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            weights, batch_indxes = np.ones_like(rewards), None
            obses_t, obses_tp1 = tf.constant(obses_t), tf.constant(obses_tp1)
            actions, rewards, dones = tf.constant(
                actions,
                dtype=tf.int64), tf.constant(rewards), tf.constant(dones)
            weights = tf.constant(weights)

            td_errors = agent.train(obses_t, actions, rewards, obses_tp1,
                                    dones, weights)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            agent.update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
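        # the slice [-101:-1] above averages the last 100 completed episodes,
        # excluding the episode that is still accumulating reward at index -1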
Example #19
class MADDPG():
    def __init__(self, num_agents, state_size, action_size, random_seed):
        super(MADDPG, self).__init__()

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed

        self.maddpg_agent = [
            Agent(self.state_size, self.action_size,
                  self.num_agents * self.state_size,
                  self.num_agents * self.action_size, self.random_seed)
            for i in range(self.num_agents)
        ]

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        self.noise_amplitud = 1
        self.noise_reduction = 0.9995
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.t_step += 1
        if len(self.memory) > BATCH_SIZE and self.t_step % UPDATE_EVERY == 0:
            # Learn, if enough samples are available in memory
            for _ in range(round(UPDATE_AMOUNT)):
                for agent in range(self.num_agents):
                    experiences = self.memory.sample()
                    self.learn(experiences, agent, GAMMA)
                self.update_targets()

    def act(self, states):
        """get actions from all agents in the MADDPG object"""
        if self.t_step < NOISE_START:
            noise_amplitud = 0
        else:
            noise_amplitud = self.noise_amplitud
            self.noise_amplitud = max(
                self.noise_amplitud * self.noise_reduction, 0.1)

        actions = np.array([
            agent.act(state, noise_amplitud)
            for agent, state in zip(self.maddpg_agent, states)
        ])

        return actions

    def target_actors(self, states):
        target_actions = torch.cat([
            agent.actor_target(states[:, i, :])
            for i, agent in enumerate(self.maddpg_agent)
        ],
                                   dim=1)
        return target_actions

    def actors(self, states):
        actions = torch.cat([
            agent.actor(states[:, i, :])
            for i, agent in enumerate(self.maddpg_agent)
        ],
                            dim=1)
        return actions

    def learn(self, experiences, agent_number, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        agent = self.maddpg_agent[agent_number]

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        target_actions_full = self.target_actors(next_states)
        next_states_full = next_states.view(-1,
                                            self.num_agents * self.state_size)
        #         target_critic_input = torch.cat((next_states_full,target_actions_full), dim = 1)

        Q_targets_next = agent.critic_target(next_states_full,
                                             target_actions_full)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards[:, agent_number].view(
            -1, 1) + (gamma * Q_targets_next *
                      (1 - dones[:, agent_number].view(-1, 1)))
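        # i.e. the per-agent target with a centralized critic that sees the full
        # joint next state and all agents' target actions:
        #   y_i = r_i + gamma * Q_i_target(s_full', a_1', ..., a_N') * (1 - done_i)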

        # Compute critic loss
        actions_full = actions.view(-1, self.action_size * self.num_agents)
        states_full = states.view(-1, self.num_agents * self.state_size)
        #         critic_input = torch.cat((states_full,actions_full), dim = 1)

        Q_expected = agent.critic(states_full, actions_full)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        #         critic_loss = huber_loss(Q_expected, Q_targets.detach())

        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), 1)
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_full_pred = self.actors(states)
        #         critic_input_loss = torch.cat((states_batch, actions_full), dim = 1)
        actor_loss = -agent.critic(states_full, actions_full_pred).mean()

        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), 1)
        agent.actor_optimizer.step()

    def update_targets(self):
        """soft update target networks"""
        for agent in self.maddpg_agent:
            self.soft_update(agent.actor, agent.actor_target, TAU)
            self.soft_update(agent.critic, agent.critic_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        for ddpg_agent in self.maddpg_agent:
            ddpg_agent.noise.reset()
Example #20
def run(config):
    data_folder = Path(config.data_path)
    building_attributes = data_folder / 'building_attributes.json'
    solar_profile = data_folder / 'solar_generation_1kW.csv'
    building_state_actions = 'buildings_state_action_space.json'
    # building_ids = ["Building_" + str(i) for i in range(1, config.num_buildings + 1)]
    config.num_buildings = 6

    # customized log directory
    hidden = config.hidden_dim
    lr = config.lr
    tau = config.tau
    gamma = config.gamma
    batch_size = config.batch_size
    buffer_length = config.buffer_length
    log_path = (f"log_hidden{hidden}_lr{lr}_tau{tau}_gamma{gamma}"
                f"_batch_size{batch_size}_buffer_length{buffer_length}"
                "_TIME_PERIOD_1008_MAXACTION_25/")

    logger = SummaryWriter(log_dir=log_path)
    # TODO fix here
    building_ids = ["Building_" + str(i)
                    for i in [1, 2, 5, 6, 7, 8]]  #[1,2,5,6,7,8]
    env = CityLearn(building_attributes,
                    solar_profile,
                    building_ids,
                    buildings_states_actions=building_state_actions,
                    cost_function=[
                        'ramping', '1-load_factor', 'peak_to_valley_ratio',
                        'peak_demand', 'net_electricity_consumption'
                    ])
    observations_spaces, actions_spaces = env.get_state_action_spaces()

    # Instantiating the control agent(s)
    if config.agent_alg == 'MADDPG':
        agents = MA_DDPG(observations_spaces,
                         actions_spaces,
                         hyper_params=vars(config))
    else:
        raise NotImplementedError

    k, c = 0, 0
    cost, cum_reward = {}, {}
    buffer = ReplayBuffer(max_steps=config.buffer_length,
                          num_agents=config.num_buildings,
                          obs_dims=[s.shape[0] for s in observations_spaces],
                          ac_dims=[a.shape[0] for a in actions_spaces])
    # TODO: store np or tensor in buffer?
    start = time.time()
    for e in range(config.n_episodes):
        cum_reward[e] = 0
        rewards = []
        state = env.reset()
        statecast = lambda x: [torch.FloatTensor(s) for s in x]
        done = False
        ss = 0
        while not done:
            if k % (40000 * 4) == 0:
                print('hour: ' + str(k) + ' of ' +
                      str(TIME_PERIOD * config.n_episodes))
            action = agents.select_action(statecast(state), explore=False)
            action = [a.detach().numpy() for a in action]
            # if batch norm:
            action = [np.squeeze(a, axis=0) for a in action]
            ss += 1
            #print("action is ", action)
            #print(action[0].shape)
            #raise NotImplementedError
            next_state, reward, done, _ = env.step(action)
            reward = reward_function(
                reward)  # See comments in reward_function.py
            #buffer_reward = [-r for r in reward]
            # agents.add_to_buffer()
            buffer.push(statecast(state), action, reward,
                        statecast(next_state), done)
            # if (len(buffer) >= config.batch_size and
            #         (e % config.steps_per_update) < config.n_rollout_threads):
            if len(buffer) >= config.batch_size:
                if USE_CUDA:
                    agents.to_train(device='gpu')
                else:
                    agents.to_train(device='cpu')
                for a_i in range(agents.n_buildings):
                    sample = buffer.sample(config.batch_size, to_gpu=USE_CUDA)
                    agents.update(sample,
                                  a_i,
                                  logger=logger,
                                  global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag='net electric consumption',
                              scalar_value=env.net_electric_consumption[-1],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag='env cost total',
                              scalar_value=env.cost()['total'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="1 load factor",
                              scalar_value=env.cost()['1-load_factor'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="peak to valley ratio",
                              scalar_value=env.cost()['peak_to_valley_ratio'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="peak demand",
                              scalar_value=env.cost()['peak_demand'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(
                tag="net energy consumption",
                scalar_value=env.cost()['net_electricity_consumption'],
                global_step=e * TIME_PERIOD + ss)
            net_energy_consumption_wo_storage = env.net_electric_consumption[
                -1] + env.electric_generation[
                    -1] - env.electric_consumption_cooling_storage[
                        -1] - env.electric_consumption_dhw_storage[-1]
            logger.add_scalar(tag="net energy consumption without storage",
                              scalar_value=net_energy_consumption_wo_storage,
                              global_step=e * TIME_PERIOD + ss)

            for id, r in enumerate(reward):
                logger.add_scalar(tag="agent {} reward ".format(id),
                                  scalar_value=r,
                                  global_step=e * TIME_PERIOD + ss)

            state = next_state
            cum_reward[e] += reward[0]
            k += 1
            cur_time = time.time()
            # print("average time : {}s/iteration at iteration {}".format((cur_time - start) / (60.0 * k), k))
        cost[e] = env.cost()
        if c % 1 == 0:
            print(cost[e])
        # add env total cost and reward logger
        logger.add_scalar(tag='env cost total final',
                          scalar_value=env.cost()['total'],
                          global_step=e)
        logger.add_scalar(tag="1 load factor final",
                          scalar_value=env.cost()['1-load_factor'],
                          global_step=e)
        logger.add_scalar(tag="peak to valley ratio final",
                          scalar_value=env.cost()['peak_to_valley_ratio'],
                          global_step=e)
        logger.add_scalar(tag="peak demand final",
                          scalar_value=env.cost()['peak_demand'],
                          global_step=e)
        logger.add_scalar(
            tag="net energy consumption final",
            scalar_value=env.cost()['net_electricity_consumption'],
            global_step=e)
        net_energy_consumption_wo_storage = env.net_electric_consumption[
            -1] + env.electric_generation[
                -1] - env.electric_consumption_cooling_storage[
                    -1] - env.electric_consumption_dhw_storage[-1]
        logger.add_scalar(tag="net energy consumption without storage",
                          scalar_value=net_energy_consumption_wo_storage,
                          global_step=e)
        c += 1
        rewards.append(reward)

    end = time.time()
    print((end - start) / 60.0)
Example #21
def main():
    env_info = env.reset(train_mode=False)[brain_name]
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    seeding()
    # number of parallel agents
    #parallel_envs = num_agents
    # number of training episodes.
    # change this to higher number to experiment. say 30000.

    number_of_episodes = 10000
    update_actor_after = 100
    update_actor_every = 2
    episode_length = 100
    batchsize = 100
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    LR_ACTOR = 1e-5
    LR_CRITIC = 3e-3

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1.0
    noise_reduction = 0.999999

    # how many episodes before update
    episode_per_update = 1
    no_of_updates_perTime = 1

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    #torch.set_num_threads(parallel_envs)
    #env = envs.make_parallel_env(parallel_envs)

    # keep roughly 10 episodes' worth of transitions in the replay buffer
    buffer = ReplayBuffer(int(10 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG(lr_actor=LR_ACTOR, lr_critic=LR_CRITIC)
    #logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    #agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    for episode in range(0, number_of_episodes):

        timer.update(episode)

        env_info = env.reset(
            train_mode=False)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(num_agents)  # initialize the score (for each agent)
        reward_this_episode = np.zeros((1, num_agents))

        #all_obs = env.reset() #
        obs = states
        obs_full = np.concatenate((states[0], states[1]))

        # rewards for this particular episode: sum over all time steps

        # save info or not
        save_info = ((episode) % save_interval < 1
                     or episode == number_of_episodes - 1)
        tmax = 0

        #resetting noise
        for i in range(num_agents):
            maddpg.maddpg_agent[i].noise.reset()

        for episode_t in range(episode_length):

            t += 1

            update_act = (episode > update_actor_after
                          or episode % update_actor_every == 0)
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensorAsitis(obs),
                                 noise=noise,
                                 batch=False)
            noise *= noise_reduction

            actions_array = torch.stack(actions).cpu().detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            env_info = env.step(actions_for_env)[brain_name]

            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            scores += env_info.rewards

            rewards_for_env = np.hstack(rewards)

            obs = states
            obs_full = np.concatenate((states[0], states[1]))
            next_obs = next_states
            next_obs_full = np.concatenate((next_states[0], next_states[1]))
            # add data to buffer
            transition = (np.array([obs]), np.array([obs_full]),
                          np.array([actions_for_env]),
                          np.array([rewards_for_env]), np.array([next_obs]),
                          np.array([next_obs_full]),
                          np.array([dones], dtype='float'))
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # update once after every episode_per_update
            if len(buffer) > batchsize and episode % episode_per_update == 0:
                for _ in range(no_of_updates_perTime):
                    for a_i in range(num_agents):
                        samples = buffer.sample(batchsize)
                        # update the weights of the networks
                        maddpg.update(samples, a_i, update_actor=update_act)
                    # soft update the target networks towards the local networks
                    maddpg.update_targets()

            if np.any(dones):
                # if the episode is done, break out of the step loop and move on to the next episode
                break

        # record each agent's episode reward once (no need to loop over agents here)
        agent0_reward.append(reward_this_episode[0, 0])
        agent1_reward.append(reward_this_episode[0, 1])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward)]
            agent0_reward = []
            agent1_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                #logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)
                print('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        #saving model
        save_dict_list = []
        if save_info:
            for i in range(num_agents):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            #imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #frames, duration=.04)
    timer.finish()
Example #22
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, memory=None, random_seed=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        if memory is not None:
            self.memory = memory
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        if add_noise:
            action += self.noise.sample()
        self.actor_local.train()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
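
# Hedged usage sketch (not part of the original example): the `memory` argument of
# Agent above allows several DDPG agents to share a single ReplayBuffer, so every
# agent learns from the experience gathered by all of them. Agent, ReplayBuffer,
# BUFFER_SIZE and BATCH_SIZE are the names defined/used in this example.
def make_shared_memory_agents(num_agents, state_size, action_size, random_seed=0):
    """Build `num_agents` agents that all sample from one shared buffer."""
    shared_memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    return [Agent(state_size, action_size, memory=shared_memory,
                  random_seed=random_seed)
            for _ in range(num_agents)]
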
Example #23
class MADDPGAgent:
    """Interacts and learns from the environment using multiple DDPG agents"""
    def __init__(self):
        """Initialize a MADDPG Agent object."""
        super(MADDPGAgent, self).__init__()
        self.config = Config.getInstance()
        self.action_num = self.config.action_size * self.config.num_agents
        self.t_step = 0

        self.maddpg_agent = [
            DDPGAgent() for _ in range(self.config.num_agents)
        ]

        self.memory = ReplayBuffer()

    def get_actors(self):
        """get actors of all the agents in the MADDPG object"""
        actors = [ddpg_agent.actor for ddpg_agent in self.maddpg_agent]
        return actors

    # def get_target_actors(self):
    #     """get target_actors of all the agents in the MADDPG object"""
    #     target_actors = [
    #         ddpg_agent.target_actor for ddpg_agent in self.maddpg_agent]
    #     return target_actors

    def act(self, obs_all_agents, noise=0.0):
        """get actions from all agents in the MADDPG object"""
        actions = [
            agent.act(obs, noise)
            for agent, obs in zip(self.maddpg_agent, obs_all_agents)
        ]
        return np.concatenate(actions)

    def update_act(self, obs_all_agents, agent_num, noise_decay_parameter=0.0):
        """
        get target network actions from all the agents in the MADDPG object
        """
        actions_ = []
        for a_i, ddpg_agent in enumerate(self.maddpg_agent):
            obs = obs_all_agents[:, a_i, :].to(self.config.device)
            acn = ddpg_agent.actor(
                obs) + noise_decay_parameter * ddpg_agent.noise.sample()
            if a_i != agent_num:
                acn = acn.detach()
            actions_.append(acn)
        return actions_

    def target_act(self, obs_all_agents, noise=0.0):
        """
        get target network actions from all the agents in the MADDPG object
        """
        target_actions = [
            ddpg_agent.target_act(obs_all_agents[:, a_i, :], noise)
            for a_i, ddpg_agent in enumerate(self.maddpg_agent)
        ]
        return target_actions

    def step(self, _states, _actions, _rewards, _next_states, _dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        states_full = np.reshape(_states, newshape=(-1))
        next_states_full = np.reshape(_next_states, newshape=(-1))
        self.memory.add(_states, states_full, _actions, _rewards, _next_states,
                        next_states_full, _dones)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.config.update_every

        if self.t_step == 0:
            if len(self.memory) > self.config.batch_size:
                for a_i in range(self.config.num_agents):
                    samples = self.memory.sample()
                    self.update(samples, a_i)
                self.update_targets()

    def update_critic(self, samples, agent_number):
        """Update critic weights"""
        states, states_full, actions, rewards, next_states, next_states_full, dones = samples
        agent = self.maddpg_agent[agent_number]
        agent.critic_optimizer.zero_grad()
        # ---------------------------- update critic ---------------------- #
        actions_next = self.target_act(next_states)
        actions_next = torch.cat(actions_next, dim=1)

        Q_target_next = agent.target_critic(next_states_full, actions_next)
        Q_targets = rewards[:, agent_number].view(-1, 1) + self.config.gamma * \
            Q_target_next * (1 - dones[:, agent_number].view(-1, 1))
        Q_expected = agent.critic(states_full,
                                  actions.reshape(-1, self.action_num))
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        critic_loss.backward()
        agent.critic_optimizer.step()

    def update_actor(self, samples, agent_number):
        """Update actor weights"""
        states, states_full, actions, rewards, next_states, next_states_full, dones = samples
        agent = self.maddpg_agent[agent_number]

        agent.actor_optimizer.zero_grad()
        actions_pred = self.update_act(states, agent_number)
        actions_pred = torch.cat(actions_pred, dim=1)
        actor_loss = -agent.critic(states_full, actions_pred).mean()
        actor_loss.backward()
        agent.actor_optimizer.step()

    def update(self, samples, agent_number):
        """update the critics and actors of all the agents """
        # ---------------------------- update critic ---------------------- #
        self.update_critic(samples, agent_number)

        # ---------------------------- update actor ------------------------- #
        self.update_actor(samples, agent_number)

    def update_targets(self):
        """soft update targets"""
        for ddpg_agent in self.maddpg_agent:
            soft_update(ddpg_agent.target_actor, ddpg_agent.actor,
                        self.config.tau)
            soft_update(ddpg_agent.target_critic, ddpg_agent.critic,
                        self.config.tau)

    def reset(self):
        """Resets weight of all agents"""
        for ddpg_agent in self.maddpg_agent:
            ddpg_agent.reset()
Example #24
class TD3:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        # self.buffer = deque(maxlen=self.config.max_buff)
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.actor = Actor(self.config.state_dim, self.config.action_dim,
                           self.config.max_action)
        self.actor_target = Actor(self.config.state_dim,
                                  self.config.action_dim,
                                  self.config.max_action)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=self.config.learning_rate)

        self.critic_1 = Critic(self.config.state_dim, self.config.action_dim)
        self.critic_1_target = Critic(self.config.state_dim,
                                      self.config.action_dim)
        self.critic_1_target.load_state_dict(self.critic_1.state_dict())
        self.critic_1_optimizer = Adam(self.critic_1.parameters(),
                                       lr=self.config.learning_rate)

        self.critic_2 = Critic(self.config.state_dim, self.config.action_dim)
        self.critic_2_target = Critic(self.config.state_dim,
                                      self.config.action_dim)
        self.critic_2_target.load_state_dict(self.critic_2.state_dict())
        self.critic_2_optimizer = Adam(self.critic_2.parameters(),
                                       lr=self.config.learning_rate)

        self.MseLoss = nn.MSELoss()

        if self.config.use_cuda:
            self.cuda()

    def act(self, state):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        action = self.actor(state)
        return action.cpu().data.numpy().flatten()  #.detach()

    def learning(self, fr, t):

        for i in range(t):
            state, action_, reward, next_state, done = self.buffer.sample(
                self.config.batch_size)

            state = torch.tensor(state, dtype=torch.float).to(device)
            next_state = torch.tensor(next_state, dtype=torch.float).to(device)
            action = torch.tensor(action_, dtype=torch.float).to(device)
            reward = torch.tensor(reward, dtype=torch.float).reshape(
                (-1, 1)).to(device)
            done = torch.tensor(done, dtype=torch.float).reshape(
                (-1, 1)).to(device)
            # reward = torch.FloatTensor(reward).reshape((self.config.batch_size,1)).to(device)
            # done = torch.FloatTensor(done).reshape((self.config.batch_size,1)).to(device)

            # Select next action according to target policy:
            noise = torch.tensor(action_, dtype=torch.float).data.normal_(
                0, self.config.policy_noise).to(device)
            noise = noise.clamp(-self.config.noise_clip,
                                self.config.noise_clip)
            next_action = (self.actor_target(next_state) + noise)
            next_action = next_action.clamp(-self.config.max_action,
                                            self.config.max_action)

            # Compute target Q-value:
            target_Q1 = self.critic_1_target(next_state, next_action)
            target_Q2 = self.critic_2_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (
                (1 - done) * self.config.gamma * target_Q).detach()
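            # TD3 specifics visible above: the target action is perturbed with
            # clipped noise (target policy smoothing) and the target value takes
            # the minimum of the two target critics (clipped double-Q learning)
            # to curb overestimation bias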

            # Optimize Critic 1:
            current_Q1 = self.critic_1(state, action)
            loss_Q1 = F.mse_loss(current_Q1, target_Q)
            self.critic_1_optimizer.zero_grad()
            loss_Q1.backward()
            self.critic_1_optimizer.step()

            # Optimize Critic 2:
            current_Q2 = self.critic_2(state, action)
            loss_Q2 = F.mse_loss(current_Q2, target_Q)
            self.critic_2_optimizer.zero_grad()
            loss_Q2.backward()
            self.critic_2_optimizer.step()

            # Delayed policy updates:
            if i % self.config.policy_delay == 0:
                # Compute actor loss:
                actor_loss = -self.critic_1(state, self.actor(state)).mean()

                # Optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Polyak averaging update:
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(
                        (self.config.polyak * target_param.data) +
                        ((1 - self.config.polyak) * param.data))

                for param, target_param in zip(
                        self.critic_1.parameters(),
                        self.critic_1_target.parameters()):
                    target_param.data.copy_(
                        (self.config.polyak * target_param.data) +
                        ((1 - self.config.polyak) * param.data))

                for param, target_param in zip(
                        self.critic_2.parameters(),
                        self.critic_2_target.parameters()):
                    target_param.data.copy_(
                        (self.config.polyak * target_param.data) +
                        ((1 - self.config.polyak) * param.data))

    def cuda(self):
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic_1.to(device)
        self.critic_1_target.to(device)
        self.critic_2.to(device)
        self.critic_2_target.to(device)

    def load_weights(self, model_path):
        policy = torch.load(model_path)
        if 'actor' in policy:
            self.actor.load_state_dict(policy['actor'])
        else:
            self.actor.load_state_dict(policy)

    def save_model(self, output, name=''):
        torch.save(self.actor.state_dict(), '%s/actor_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_policy'
        os.makedirs(checkpath, exist_ok=True)
        torch.save(
            {
                'frames': fr,
                'actor': self.actor.state_dict(),
                'critic_1': self.critic_1.state_dict(),
                'critic_2': self.critic_2.state_dict(),
            }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic_1.load_state_dict(checkpoint['critic_1'])
        self.critic_2.load_state_dict(checkpoint['critic_2'])
        return fr
class DQNAgent:
    """
    DQN Agent, valid for discrete action spaces
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #loss_fn = nn.MSELoss()
    loss_fn = nn.SmoothL1Loss()
    iter = 0

    def __init__(self,
                 net,
                 o_dim,
                 a_dim,
                 lr=1e-3,
                 batch_size=16,
                 algorithm="ddqn",
                 gamma=0.99,
                 tau=1e-3,
                 buffer_size=int(1e6)):
        """
        o_dim: observation space dim (or # of channels)
        a_dim: action space dimension
        """
        self.o_dim = o_dim
        self.a_dim = a_dim
        self.lr = lr
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size

        if algorithm.lower() in ("dqn"):
            self.algorithm = "dqn"
        elif algorithm.lower() in ("ddqn", "double dqn", "doubledqn"):
            self.algorithm = "ddqn"
        else:
            raise TypeError("cannot recognize algorithm")

        self.buffer = ReplayBuffer(buffer_size, batch_size)

        self.online_net = net(o_dim, a_dim).to(self.device)
        self.target_net = net(o_dim, a_dim).to(self.device)

        self.optimizer = optim.Adam(self.online_net.parameters(), lr=lr)

    def get_action(self, state, eps=0.):
        """ Epsilon-greedy action selection """

        if random.random() > eps:
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(
                self.device)

            self.online_net.eval()
            with torch.no_grad():
                action = self.online_net(state_tensor).argmax(1).item()
            self.online_net.train()

            return action
        else:
            return random.choice(np.arange(self.a_dim))

    def update(self, experiences):

        states, actions, rewards, next_states, dones = experiences

        states = torch.FloatTensor(states).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)

        actions = torch.LongTensor(actions).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor(rewards).view(-1, 1).to(self.device)
        dones = torch.FloatTensor(dones).view(-1, 1).to(self.device)

        if self.algorithm == "ddqn":
            max_actions = self.online_net(next_states).max(1)[1].view(-1, 1)
            Q_next = self.target_net(next_states).gather(1, max_actions)

        elif self.algorithm == "dqn":
            Q_next = self.target_net(next_states).max(1)[0].view(-1, 1)
        else:
            raise TypeError("cannot recognize algorithm")

        Q_targets = rewards + self.gamma * Q_next * (1. - dones)
        Q_expected = self.online_net(states).gather(1, actions)

        loss = self.loss_fn(Q_expected, Q_targets.detach())

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(), 10.)
        self.optimizer.step()

    def step(self, state, action, reward, next_state, done):
        self.buffer.push(state, action, reward, next_state, done)
        if len(self.buffer) > self.batch_size:
            experiences = self.buffer.sample()
            self.update(experiences)
            soft_update(self.target_net, self.online_net, self.tau)
            self.iter += 1
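
# Hedged usage sketch (not part of the original example): drive the DQNAgent above
# with a Q-network class `net` mapping o_dim observations to a_dim action values
# and an environment using the classic reset() -> state / step(action) ->
# (next_state, reward, done, info) API. The epsilon schedule is an illustrative
# assumption, not taken from the source.
def example_dqn_training(env, net, o_dim, a_dim, n_episodes=500,
                         eps_start=1.0, eps_end=0.05, eps_decay=0.995):
    agent = DQNAgent(net, o_dim, a_dim, algorithm="ddqn")
    eps = eps_start
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.get_action(state, eps)
            next_state, reward, done, _ = env.step(action)
            # store the transition and, once the buffer is warm, do one update
            agent.step(state, action, reward, next_state, done)
            state = next_state
        eps = max(eps_end, eps * eps_decay)
    return agent
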
Example #26
class CnnDDQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()

        # no explicit .cuda() needed here: the model outputs already live on the
        # same device as the (optionally CUDA) inputs prepared above
        q_values = self.model(s0)
        next_q_values = self.model(s1)
        next_q_state_values = self.target_model(s1)

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        # note that expected_q_value is detached so no gradient flows through the target
        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        if fr % self.config.update_tar_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def cuda(self):
        self.model.cuda()
        self.target_model.cuda()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_model(self, output, name=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_model'
        os.makedirs(checkpath, exist_ok=True)
        torch.save({
            'frames': fr,
            'model': self.model.state_dict()
        }, '%s/checkpoint_fr_%d.tar'% (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.model.load_state_dict(checkpoint['model'])
        self.target_model.load_state_dict(checkpoint['model'])
        return fr
class DQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_dim, self.config.action_dim)
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()

        q_values = self.model(s0)
        next_q_values = self.model(s1)
        next_q_value = next_q_values.max(1)[0]

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        # note that expected_q_value is detached so no gradient flows through the target
        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()


        return loss.item()

    def cuda(self):
        self.model.cuda()

    def load_weights(self, model_path):
        if model_path is None: return
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, output, tag=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, tag))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")
Example #28
def train_main(exp_prefix="",
               fc_units=[128, 64, 64],
               env_list=[],
               num_envs=10,
               num_obstacls_ratio=[0.2, 0.3, 0.3, 0.2],
               n_step=1,
               max_episodes=10000,
               max_steps=120,
               per_num_envs=8,
               replay_buffer_len=400,
               no_replay=False,
               batch_size=64,
               learning_rate=1e-4,
               epsilon_min=0.05,
               epsilon_max=0.10,
               gamma=0.98,
               without_map_info=False,
               save_interval=1000,
               show=False):
    # create envs
    if len(env_list) == 0:
        env_list = create_or_load_envs(num_envs, num_obstacls_ratio)
    # create model
    if without_map_info:
        state_dims = 2 + 1
    else:
        state_dims = 4 * (2 + 2) + 6 + 2 + 2
    act_dims = 5
    model = DQNModel(state_dims=state_dims,
                     act_dims=act_dims,
                     fc_units=fc_units)
    print("create model done")
    # optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    # create replay buffer
    buffer = ReplayBuffer(replay_buffer_len)
    print("create buffer done")

    # construct save path suffix
    weight_dir = os.path.join("weights", exp_prefix)
    dir_util.mkpath(weight_dir)
    log_dir = os.path.join("logs", exp_prefix)
    dir_util.mkpath(log_dir)
    summary_writer = tf.summary.create_file_writer(log_dir)

    # run simulations
    mean_loss_vals = []
    mean_ep_rewards = []
    last_save_ep_idx = 0
    for ep in range(max_episodes // per_num_envs):
        if no_replay:
            buffer.clear()
        num_new_samples = 0
        ep_rewards = []
        # randomly select an env and run rollout
        envs = np.random.choice(env_list, size=(per_num_envs))
        env_indices = np.random.randint(len(env_list), size=(per_num_envs))
        for roll_idx, env_idx in enumerate(env_indices):
            env = env_list[env_idx]
            episode_index = ep * per_num_envs + roll_idx
            epsilon = epsilon_max - (
                epsilon_max - epsilon_min) / max_episodes * episode_index
            ship_state_trace, input_states, action_list, reward_list, done_list, is_random_act_list, qvals = run_one_episodes(
                env, model, epsilon, max_steps, without_map_info)
            # td_errors = (reward_list + qvals[1:] * gamma) - qvals[:-1]
            td_errors = get_n_step_estimated_qvals(reward_list, qvals[1:],
                                                   gamma, n_step) - qvals[:-1]
            buffer.add_items(input_states, action_list, reward_list, done_list,
                             td_errors)
            num_new_samples += len(input_states)
            ep_rewards.append(np.sum(reward_list))
            print(
                "episode {:4d}, env-{:03d}, epsilon: {:4.2f}, episode length: {:3d}, ep_reward: {:8.2f}"
                .format(episode_index, env_idx, epsilon, len(input_states),
                        np.sum(reward_list)))
            tot_ep_reward = np.sum(reward_list)
            avg_ep_reward = np.mean(reward_list)
            with summary_writer.as_default():
                tf.summary.scalar('tot_ep_reward_trn',
                                  tot_ep_reward,
                                  step=episode_index)
                tf.summary.scalar('avg_ep_reward_trn',
                                  avg_ep_reward,
                                  step=episode_index)
            if episode_index % 100 == 0:
                # run an evaluation
                (eval_ship_state_trace, eval_input_states, eval_action_list,
                 eval_reward_list, eval_done_list, eval_is_random_act_list,
                 eval_qval_list) = run_one_episodes(env, model, 0, max_steps,
                                                    without_map_info)
                # log episode reward
                with summary_writer.as_default():
                    eval_tot_ep_reward = np.sum(eval_reward_list)
                    eval_avg_ep_reward = np.mean(eval_reward_list)
                    tf.summary.scalar('tot_ep_reward_evl',
                                      eval_tot_ep_reward,
                                      step=episode_index)
                    tf.summary.scalar('avg_ep_reward_evl',
                                      eval_avg_ep_reward,
                                      step=episode_index)
                # eval the loss
                eval_states_curr = np.array(eval_input_states[:-1])
                eval_states_next = np.array(eval_input_states[1:])
                eval_qvals_next = model(eval_states_next,
                                        training=False).numpy()
                eval_qvals_next_max = np.amax(
                    eval_qvals_next, axis=1) * (1 - np.array(eval_done_list))
                eval_qvals_esti = get_n_step_estimated_qvals(
                    eval_reward_list, eval_qvals_next_max, gamma, n_step)
                # to tensor
                eval_states_curr = tf.convert_to_tensor(
                    eval_states_curr, tf.float32)
                eval_action_list_tf = tf.convert_to_tensor(eval_action_list)
                eval_qvals_esti = tf.convert_to_tensor(eval_qvals_esti,
                                                       tf.float32)
                # eval to get loss
                eval_loss = eval_step_v0(model, eval_states_curr,
                                         eval_action_list_tf,
                                         eval_qvals_esti).numpy()
                with summary_writer.as_default():
                    tf.summary.scalar('loss_evl',
                                      eval_loss,
                                      step=episode_index)
                # draw map and state trace
                env.show(eval_ship_state_trace,
                         np.sum(eval_reward_list),
                         eval_loss,
                         eval_action_list,
                         eval_is_random_act_list,
                         save_path="pictures",
                         prefix=exp_prefix,
                         count=episode_index)
        # run update
        avg_ep_reward = float(np.mean(ep_rewards))
        mean_ep_rewards.append(avg_ep_reward)
        curr_update_loss_vals = []
        if no_replay:
            num_updates = 1
        else:
            num_updates = max(
                1,
                min(num_new_samples, replay_buffer_len) // batch_size)
        for _ in range(num_updates):
            # get qvals of next states
            if no_replay:
                batch_size = max(1, int(num_new_samples *
                                        0.8))  # overwrite batch_size
            states_curr, states_next, actions, rewards, dones = buffer.sample(
                batch_size)
            states_next = tf.convert_to_tensor(states_next, tf.float32)
            qvals_next = model(states_next, training=False).numpy()
            qvals_next = np.amax(qvals_next, axis=1) * (1 - dones)
            qvals_esti = get_n_step_estimated_qvals(rewards, qvals_next, gamma,
                                                    n_step)
            # to tensor
            states_curr = tf.convert_to_tensor(states_curr, tf.float32)
            actions = tf.convert_to_tensor(actions)
            qvals_esti = tf.convert_to_tensor(qvals_esti, tf.float32)
            # do an update
            loss_trn = train_step_v0(model, optimizer, states_curr, actions,
                                     qvals_esti).numpy()
            with summary_writer.as_default():
                tf.summary.scalar('loss_trn', loss_trn, step=episode_index)
            curr_update_loss_vals.append(loss_trn)
            print("episode {:4d}, bs: {:4d}, loss_trn: {:6.2f}".format(
                episode_index, batch_size, loss_trn))
        mean_loss_vals.append(float(np.mean(curr_update_loss_vals)))

        # draw loss
        if ep > 0 and ep % 10 == 0:
            draw_vals(mean_ep_rewards,
                      mean_loss_vals,
                      per_num_envs,
                      exp_prefix=exp_prefix)
            # save to file for further use
            json.dump([mean_loss_vals, mean_ep_rewards],
                      open("logs/{}_logs_info.json".format(exp_prefix), "w"))

        # Save the weights using the `checkpoint_path` format
        if (episode_index - last_save_ep_idx) > save_interval:
            save_path = os.path.join(
                weight_dir, "weights_{:05d}.ckpt".format(episode_index))
            model.save_weights(save_path)
            last_save_ep_idx = episode_index
            print("episode-{}, save weights to: {}".format(
                episode_index, save_path))
Example #29
class CnnDDQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        if self.config.prioritized_replay:
            self.buffer = PrioritizedReplayBuffer(
                self.config.max_buff,
                alpha=self.config.prioritized_replay_alpha)
            prioritized_replay_beta_iters = self.config.prioritized_replay_beta_iters
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = self.config.frames
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.config.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.buffer = ReplayBuffer(self.config.max_buff)
            self.beta_schedule = None

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape,
                                   self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(),
                                lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):

        if self.config.prioritized_replay:
            experience = self.buffer.sample(self.config.batch_size,
                                            beta=self.beta_schedule.value(fr))
            (s0, a, r, s1, done, weights, batch_idxes) = experience
        else:
            s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)
            weights, batch_idxes = np.ones_like(r), None

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)
        weights = torch.tensor(weights, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()
            weights = weights.cuda()

        # inputs are already on the right device, so no extra .cuda() is needed
        q_values = self.model(s0)
        next_q_values = self.model(s1)
        next_q_state_values = self.target_model(s1)

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(
            1,
            next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        # TD error between the online prediction and its target, used below to
        # refresh the priorities in the prioritized replay buffer
        td_errors = q_value - expected_q_value
        # detach expected_q_value so no gradient flows through the target
        loss = F.smooth_l1_loss(q_value,
                                expected_q_value.detach(),
                                reduction='none')
        loss = (loss * weights).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        if self.config.prioritized_replay:
            new_priorities = np.abs(td_errors.detach().cpu().numpy()
                                    ) + self.config.prioritized_replay_eps
            self.buffer.update_priorities(batch_idxes, new_priorities)

        if fr % self.config.update_tar_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def cuda(self):
        self.model.cuda()
        self.target_model.cuda()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_model(self, output, name=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_model'
        os.makedirs(checkpath, exist_ok=True)
        torch.save({
            'frames': fr,
            'model': self.model.state_dict()
        }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.model.load_state_dict(checkpoint['model'])
        self.target_model.load_state_dict(checkpoint['model'])
        return fr
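
CnnDDQNAgent anneals the importance-sampling exponent with LinearSchedule, whose definition is not part of this example. The sketch below matches the constructor arguments and the value(fr) call used above and follows the plain linear interpolation commonly paired with prioritized replay (as in OpenAI Baselines); the class shipped with this project may differ in detail.

class LinearSchedule:
    """Sketch: linear interpolation from initial_p to final_p over a fixed horizon."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # fraction of the schedule already completed, clipped to [0, 1]
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
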
def main():

    seeding()

    number_of_episodes = 20000
    episode_length = 1000
    batchsize = 256
    save_interval = 1000
    rewards_deque = deque(maxlen=100)
    rewards_all = []
    noise = 1.0
    noise_reduction = 1.0

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)
    """ Info about the UnityEnvironment
    brain_name: 'TennisBrain'
    brain: ['brain_name', 'camera_resolutions',
           'num_stacked_vector_observations', 'number_visual_observations',
           'vector_action_descriptions', 'vector_action_space_size',
           'vector_action_space_type', 'vector_observation_space_size',
           'vector_observation_space_type']
    """

    env = UnityEnvironment(file_name="Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    buffer = ReplayBuffer(int(1e5))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)

    # ------------------------------ training ------------------------------ #
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    for episode in range(1, number_of_episodes + 1):

        timer.update(episode)
        rewards_this_episode = np.zeros((2, ))
        """ Info about the UnityEnvironment
        env_info: ['agents', 'local_done', 'max_reached', 'memories',
                  'previous_text_actions', 'previous_vector_actions', 'rewards',
                  'text_observations', 'vector_observations', 'visual_observations']
        actions: List(num_agents=2, action_size=2)
        states: List((24,), (24,))
        rewards: List(2,)
        dones: List(2,)
        """
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        for episode_t in range(episode_length):
            # reset the OUNoise for each agent.
            for i in range(2):
                maddpg.maddpg_agent[i].noise.reset()

            actions = maddpg.act(states, noise=noise)
            env_info = env.step(actions)[brain_name]
            noise *= noise_reduction

            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # add data to buffer
            transition = (states, actions, rewards, next_states, dones)
            buffer.push(transition)

            rewards_this_episode += rewards

            states = next_states

            if any(dones):
                break

        # update the local and target network
        if len(buffer) > batchsize:
            # update the local network
            for _ in range(5):
                for a_i in range(2):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i, logger)
            # soft update the target network
            maddpg.update_targets()

        rewards_all.append(rewards_this_episode)
        rewards_deque.append(np.max(rewards_this_episode))
        average_score = np.mean(rewards_deque)

        # --------------------- Logging for TensorBoard --------------------- #
        logger.add_scalars('rewards', {
            'agent0': rewards_this_episode[0],
            'agent1': rewards_this_episode[1]
        }, episode)
        logger.add_scalars('global', {
            'score': np.max(rewards_this_episode),
            'average_score': average_score
        }, episode)
        # -------------------------- Save the model -------------------------- #
        save_dict_list = []

        if episode % save_interval == 0 or average_score >= 0.5:
            for i in range(2):
                save_dict = \
                    {'actor_params' : maddpg.maddpg_agent[i].actor.state_dict(),
                     'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                     'critic_params' : maddpg.maddpg_agent[i].critic.state_dict(),
                     'critic_optim_params' : maddpg.maddpg_agent[i].critic_optimizer.state_dict()}
                save_dict_list.append(save_dict)

            # write the checkpoint once, after both agents have been collected
            torch.save(
                save_dict_list,
                os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            if average_score >= 3.0:
                print('\nEnvironment solved in {} episodes!'.format(episode -
                                                                    100))
                print('\nAverage Score: {:.2f}'.format(average_score))
                break

    env.close()
    logger.close()
    timer.finish()
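
main() only relies on ReplayBuffer exposing push, sample and __len__; the class itself is defined elsewhere in the project. A minimal deque-based sketch consistent with how it is called here (push stores one whole multi-agent transition, sample returns a random batch of them) could look like the following; the real buffer may additionally reshape or tensorize the samples before they reach maddpg.update.

import random
from collections import deque


class ReplayBuffer:
    """Sketch: fixed-size FIFO store of multi-agent transitions."""

    def __init__(self, size):
        self.memory = deque(maxlen=size)

    def push(self, transition):
        # transition = (states, actions, rewards, next_states, dones)
        self.memory.append(transition)

    def sample(self, batchsize):
        # uniform sampling without replacement within one batch
        return random.sample(self.memory, batchsize)

    def __len__(self):
        return len(self.memory)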