Example #1
import gym

# MultiEnv and test() are provided elsewhere in this project (not shown in
# this excerpt); the agent classes are imported lazily inside run() below.
def run(args):

    env = MultiEnv(resize=(250, 150))
    env.configure(remotes=args.remotes)

    if args.train_pg:
        from agent.agent_pg import AgentPG
        agent = AgentPG(env, args)
        agent.train()

    if args.train_dqn:
        from agent.agent_dqn import AgentDQN
        agent = AgentDQN(env, args)
        agent.train()

    if args.train_ac:
        from agent.agent_ac import AgentAC
        agent = AgentAC(env, args)
        agent.train()

    if args.train_a2c:
        from agent.agent_a2c import AgentA2C
        agent = AgentA2C(env, args)
        agent.train()

    if args.test_pg:
        from agent.agent_pg import AgentPG
        env = gym.wrappers.Monitor(env,
                                   args.video_dir,
                                   video_callable=lambda x: True,
                                   resume=True)
        agent = AgentPG(env, args)
        test(agent, env, args, total_episodes=1)

    if args.test_dqn:
        from agent.agent_dqn import AgentDQN
        env = gym.wrappers.Monitor(env,
                                   args.video_dir,
                                   video_callable=lambda x: True,
                                   resume=True)
        agent = AgentDQN(env, args)
        test(agent, env, args, total_episodes=1)

    if args.test_ac:
        from agent.agent_ac import AgentAC
        env = gym.wrappers.Monitor(env,
                                   args.video_dir,
                                   video_callable=lambda x: True,
                                   resume=True)
        agent = AgentAC(env, args)
        test(agent, env, args, total_episodes=1)

    if args.test_a2c:
        from agent.agent_a2c import AgentA2C
        env = gym.wrappers.Monitor(env,
                                   args.video_dir,
                                   video_callable=lambda x: True,
                                   resume=True)
        agent = AgentA2C(env, args)
        test(agent, env, args, total_episodes=1)
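The run() dispatcher above reads a set of boolean mode flags plus a few settings (remotes, video_dir, action_space, channels, do_render) off the args namespace. Below is a minimal sketch of an argparse setup that could supply such a namespace; the flag names mirror the attributes referenced in these examples, while every default value is an assumption rather than the project's actual configuration.

# Hypothetical argument parser producing the `args` namespace used by run().
# Flag names follow the attributes used in these examples; defaults are
# assumptions, not the project's settings.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='Train/test RL agents')
    # mode flags checked inside run()
    for flag in ['train_pg', 'train_dqn', 'train_ac', 'train_a2c',
                 'test_pg', 'test_dqn', 'test_ac', 'test_a2c']:
        parser.add_argument('--' + flag, action='store_true')
    parser.add_argument('--remotes', type=int, default=1,
                        help='number of environment processes')
    parser.add_argument('--action_space', type=int, default=3,
                        help='size of the discrete action space (assumed)')
    parser.add_argument('--channels', type=int, default=3,
                        help='input image channels (assumed)')
    parser.add_argument('--video_dir', type=str, default='./videos/')
    parser.add_argument('--do_render', action='store_true')
    return parser.parse_args()

if __name__ == '__main__':
    run(parse_args())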
Example #2
class AgentA2C:
    def __init__(self, env, args):

        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = args.remotes
        self.seed = 42
        self.max_steps = 1e9
        self.grad_norm = 0.5
        self.entropy_weight = 0.05
        self.eps = np.finfo(np.float32).eps.item()

        #######################    NOTE: You need to implement
        self.recurrent = True  # <- ActorCritic._forward_rnn()
        #######################    Please check a2c/actor_critic.py

        self.display_freq = 1000
        self.save_freq = 1
        self.save_dir = './ckpts/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = MultiEnv()
            self.envs.configure(remotes=self.n_processes)

        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        observation = self.envs.reset()
        self.obs_shape = np.transpose(observation[0], (2, 0, 1)).shape
        self.act_shape = args.action_space

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        if args.test_a2c:
            self.load_model('./ckpts/model_1239.pt')

        self.hidden = None
        self.init_game_setting()

    def _update(self):
        # R_t = reward_t + gamma * R_{t+1}
        with torch.no_grad():
            next_value, _, _ = self.model(self.rollouts.obs[-1],
                                          self.rollouts.hiddens[-1],
                                          self.rollouts.masks[-1])

        self.rollouts.returns[-1] = next_value.detach()
        for step in reversed(range(self.rollouts.rewards.size(0))):
            self.rollouts.returns[step] = self.rollouts.rewards[step] + \
                                            (self.rollouts.returns[step + 1] * \
                                             self.gamma * \
                                             self.rollouts.masks[step + 1])

        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maximize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)
        values, action_probs, _ = self.model(
            self.rollouts.obs[:-1].view(-1, self.obs_shape[0],
                                        self.obs_shape[1], self.obs_shape[2]),
            self.rollouts.hiddens[0], self.rollouts.masks[:-1].view(-1, 1))
        distribution = torch.distributions.Categorical(action_probs)
        log_probs = distribution.log_prob(
            self.rollouts.actions.flatten()).flatten()
        returns = self.rollouts.returns[:-1].flatten()
        values = values.flatten()
        value_loss = F.smooth_l1_loss(values, returns)
        advantages = returns - values
        action_loss = -(log_probs * advantages.detach()).mean()
        entropy = distribution.entropy().mean()
        loss = value_loss + action_loss - self.entropy_weight * entropy

        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()

        # Clear rollouts after update (RolloutStorage.reset())
        self.rollouts.reset()

        return loss.item()

    def _step(self, obs, hiddens, masks):
        with torch.no_grad():
            # Sample actions from the output distributions
            # HINT: you can use torch.distributions.Categorical
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            actions = torch.distributions.Categorical(action_probs).sample()

        transformed_action = multiActionTransform(actions.cpu().numpy())
        obs, rewards, dones, infos = self.envs.step(transformed_action)

        # Store transitions (obs, hiddens, actions, values, rewards, masks)
        # You need to convert arrays to tensors first
        # HINT: masks = (1 - dones)
        obs = torch.from_numpy(obs).to(self.device).permute(0, 3, 1, 2)
        masks = torch.from_numpy(1 - dones).to(self.device)
        rewards = torch.from_numpy(rewards).to(self.device)
        # Add a -10 penalty when an episode terminates (mask == 0)
        penalty_rewards = (1 - masks) * -10
        rewards = rewards + penalty_rewards.double()

        self.rollouts.insert(obs, hiddens, actions.unsqueeze(1), values,
                             rewards.unsqueeze(1), masks.unsqueeze(1))

    def train(self):

        banner = '~' * 150
        print(banner)
        print(banner)
        print('START TRAINING'.center(150, '~'))
        print(banner)
        print(banner)

        running_reward = deque(maxlen=self.update_freq * 2)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device).permute(
            0, 3, 1, 2)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        max_reward = 0.0
        counter = 0
        continual_crash = 0

        while True:
            try:
                # Update once every n-steps
                for step in range(self.update_freq):
                    self._step(self.rollouts.obs[step],
                               self.rollouts.hiddens[step],
                               self.rollouts.masks[step])

                    # Calculate episode rewards
                    episode_rewards += self.rollouts.rewards[step]
                    for r, m in zip(episode_rewards,
                                    self.rollouts.masks[step + 1]):
                        if m == 0:
                            running_reward.append(r.item())
                    episode_rewards *= self.rollouts.masks[step + 1]

                loss = self._update()
                total_steps += self.update_freq * self.n_processes

                # Log & save model
                if len(running_reward) == 0:
                    avg_reward = 0
                else:
                    avg_reward = sum(running_reward) / len(running_reward)

                if total_steps % self.display_freq == 0:
                    print(
                        'Steps: %d/%d | Avg reward: %f | Max reward: %f' %
                        (total_steps, self.max_steps, avg_reward, max_reward))
                    with open('a2c_log.txt', 'a') as fout:
                        fout.write(str(avg_reward) + '\n')

                if total_steps % self.save_freq == 0:
                    self.save_model('model_{}.pt'.format(counter), avg_reward)
                    counter += 1

                if avg_reward > max_reward:
                    max_reward = avg_reward
                    self.save_model('model_max_{}.pt'.format(counter),
                                    max_reward)
                    counter += 1

                if total_steps >= self.max_steps:
                    break

                continual_crash = 0

            except Exception as e:
                continual_crash += 1

                if continual_crash >= 10:
                    print('=' * 140)
                    print(e)
                    print('Crashed 10 times -- stopping.')
                    print('=' * 140)

                    raise e
                else:
                    print('#' * 140)
                    print(e)
                    print('Env crashed; recreating the environment')
                    print('#' * 140)

                    time.sleep(60)
                    self.envs = MultiEnv(resize=(250, 150))
                    self.envs.configure(remotes=self.n_processes)
                    time.sleep(60)

    def save_model(self, filename, max_reward):
        if not os.path.isdir(self.save_dir):
            os.mkdir(self.save_dir)
        print('model saved: ' + filename + ' (' + str(max_reward) + ')')
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        if use_cuda:
            self.model = torch.load(path)
        else:
            self.model = torch.load(path, map_location='cpu')

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        with torch.no_grad():
            observation = torch.from_numpy(observation).float().permute(
                0, 3, 1, 2).to(self.device)
            _, action_prob, hidden = self.model(
                observation, self.hidden,
                torch.ones(1, 1).to(self.device))
            self.hidden = hidden
            action = torch.distributions.Categorical(action_prob).sample()

        return action.cpu().numpy()
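AgentA2C._update() above bootstraps n-step returns backwards through the rollout, using masks to stop the recursion at episode boundaries. The standalone sketch below reproduces just that recursion on toy tensors (shapes follow RolloutStorage: update_freq steps by n_processes environments); it is illustrative only and not part of the project.

# Sketch of the n-step return computation used in AgentA2C._update():
# R_t = r_t + gamma * R_{t+1} * mask_{t+1}, where mask = 0 at episode
# boundaries so returns do not leak across episodes.
import torch

def compute_returns(rewards, masks, next_value, gamma=0.9):
    # rewards: (T, N, 1), masks: (T + 1, N, 1), next_value: (N, 1)
    T = rewards.size(0)
    returns = torch.zeros(T + 1, *rewards.shape[1:])
    returns[-1] = next_value
    for step in reversed(range(T)):
        returns[step] = rewards[step] + gamma * returns[step + 1] * masks[step + 1]
    return returns[:-1]

# toy example: 2 steps, 1 process, episode ends after the first step
rewards = torch.tensor([[[1.0]], [[2.0]]])
masks = torch.tensor([[[1.0]], [[0.0]], [[1.0]]])
next_value = torch.tensor([[5.0]])
print(compute_returns(rewards, masks, next_value))
# step 1: 2 + 0.9 * 5 * 1 = 6.5 ; step 0: 1 + 0.9 * 6.5 * 0 = 1.0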
Example #3
class AgentAC(Agent):
    def __init__(self, env, args):
        self.env = env
        self.model = Net(state_dim=args.channels, action_num=args.action_space)
        self.model = self.model.to(device)
        self.grad_norm = 0.5
        self.entropy_weight = 0.05
        self.args = args

        if args.test_ac:
            self.load('ac.cpt')

        # discounted reward
        self.gamma = 0.99
        self.eps = np.finfo(np.float32).eps.item()

        # training hyperparameters
        self.num_episodes = 100000  # total training episodes (actually too large...)
        self.display_freq = 1  # frequency to display training progress

        # optimizer
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=3e-3)

        # saved rewards, log-probs, values and entropies
        self.rewards, self.log_prob, self.entropy, self.value = [], [], [], []

    def save(self, save_path):
        print('save model to', save_path)
        torch.save(self.model.state_dict(), save_path)

    def load(self, load_path):
        print('load model from', load_path)
        self.model.load_state_dict(torch.load(load_path, map_location=device))

    def init_game_setting(self):
        self.rewards, self.log_prob, self.entropy, self.value = [], [], [], []

    def make_action(self, state, test=False):
        # Use your model to output distribution over actions and sample from it.

        with torch.no_grad():
            state = torch.from_numpy(state).float().permute(0, 3, 1, 2)
            state = state.to(device)

        if test:
            with torch.no_grad():
                action_probs, __ = self.model(state)
                action = torch.distributions.Categorical(action_probs).sample()

                return action.cpu().numpy()

        else:
            action_probs, value = self.model(state)

            distribution = torch.distributions.Categorical(action_probs)
            action = distribution.sample()
            self.log_prob.append(distribution.log_prob(action))
            self.entropy.append(distribution.entropy())
            self.value.append(value)

            return action.cpu().numpy()

    def update(self):
        # discount your saved reward
        R = 0.0
        returns = []
        for r in self.rewards[::-1]:
            R = float(r) + self.gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns, device=device)
        # flatten to 1-D so returns, values and log-probs align element-wise
        action_log_probs = torch.stack(self.log_prob, dim=0).view(-1)
        self.value = torch.stack(self.value, dim=0).view(-1)
        entropy = torch.stack(self.entropy, dim=0).mean()
        advantages = returns - self.value
        value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.detach() * action_log_probs).mean()
        loss = value_loss + action_loss - self.entropy_weight * entropy
        # (Optional) reward normalization before computing the loss:
        # returns = (returns - returns.mean()) / (returns.std() + self.eps)

        # backpropagate and update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)

        self.optimizer.step()

    def train(self):
        max_reward = 0.0
        avg_reward = None  # moving average of reward
        continual_crash = 0
        for epoch in range(self.num_episodes):
            try:
                state = self.env.reset()
                self.init_game_setting()
                done = False
                while (not done):
                    if self.args.do_render:
                        self.env.render()

                    action = self.make_action(state)
                    transformed_action = multiActionTransform(action)
                    state, reward, done, _ = self.env.step(transformed_action)

                    self.rewards.append(reward)

                # for logging
                last_reward = np.sum(self.rewards)
                avg_reward = last_reward if not avg_reward else avg_reward * 0.9 + last_reward * 0.1
                with open('ac_log.txt', 'a') as fout:
                    fout.write(str(avg_reward) + '\n')

                # update model
                self.update()

                if epoch % self.display_freq == 0:
                    print('=' * 140)
                    print('Epochs: %d/%d | Avg reward: %f ' %
                          (epoch, self.num_episodes, avg_reward))
                    print('=' * 140)

                if avg_reward > max_reward:
                    max_reward = avg_reward
                    self.save('ac.cpt')

                continual_crash = 0

            except Exception as e:
                continual_crash += 1

                if continual_crash >= 10:
                    print('=' * 140)
                    print(e)
                    print('Crashed 10 times -- stopping.')
                    print('=' * 140)

                    raise e
                else:
                    print('#' * 140)
                    print(e)
                    print('Env crashed; recreating the environment')
                    print('#' * 140)
                    time.sleep(60)
                    self.env = MultiEnv(resize=(250, 150))
                    self.env.configure(remotes=1)
                    time.sleep(60)
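AgentAC.update() above combines Monte Carlo discounted returns with a squared-advantage value loss, a policy-gradient action loss, and an entropy bonus. The sketch below isolates that loss on a toy three-step rollout; the helper names and numbers are illustrative assumptions, not code from the project.

# Sketch of the loss used in AgentAC.update(): Monte Carlo discounted
# returns, squared-advantage value loss, policy-gradient action loss,
# and an entropy bonus. Toy numbers are illustrative only.
import torch

def discount(rewards, gamma=0.99):
    R, returns = 0.0, []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    return torch.tensor(returns)

def actor_critic_loss(log_probs, values, entropies, rewards,
                      gamma=0.99, entropy_weight=0.05):
    returns = discount(rewards, gamma)
    advantages = returns - values
    value_loss = advantages.pow(2).mean()
    action_loss = -(advantages.detach() * log_probs).mean()
    return value_loss + action_loss - entropy_weight * entropies.mean()

# toy rollout of 3 steps
log_probs = torch.tensor([-0.2, -1.1, -0.7], requires_grad=True)
values = torch.tensor([0.5, 0.4, 0.3], requires_grad=True)
entropies = torch.tensor([1.0, 0.9, 1.1])
loss = actor_critic_loss(log_probs, values, entropies, rewards=[1.0, 0.0, 1.0])
loss.backward()
print(loss.item())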