    def _build_model(self):
        # Neural net for the Deep Q-learning model
        self.fc1 = nn.Linear(self.num_inputs, self.fc1_num)
        self.fc2 = nn.Linear(self.fc1_num, self.fc2_num)
        self.fc3 = nn.Linear(self.fc2_num, self.num_outputs)

        self.model = nn.Sequential(self.fc1, nn.ELU(), self.fc2, nn.ELU(),
                                   self.fc3)

        const.myprint(self.model)
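
_build_model only constructs the network; below is a minimal usage sketch for picking a greedy action from the resulting self.model. The act_greedy name and the state-to-tensor conversion are illustrative assumptions, not part of the snippet above (torch is assumed to be imported, as in the surrounding code).

    # Sketch only: greedy action selection with the self.model built above.
    # The method name and the tensor conversion are assumptions.
    def act_greedy(self, state):
        state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)  # add batch dim
        with torch.no_grad():
            q_values = self.model(state_t)           # shape: (1, num_outputs)
        return int(q_values.argmax(dim=1).item())    # index of the largest Q-value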
Example #2
    def save(self):
        const.myprint('Saving model to:', self.model_path)
        # save the weights together with the architecture hyper-parameters
        checkpoint = {
            'num_states': self.num_states,
            'num_actions': self.num_actions,
            'num_fc_1': self.num_fc_1,
            'num_fc_2': self.num_fc_2,
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict()
        }
        torch.save(checkpoint, self.model_path)
Example #3
    def train(self, with_close=True):

        history = []
        eps = None

        for e in range(self.num_episodes):
            env_info = self.env.reset(
                train_mode=True)[self.brain_name]  # reset the environment
            state = env_info.vector_observations[
                0]  # get the current state (s_t)
            score = 0  # initialize the score

            eps = self._get_glie(eps)  # decay epsilon
            done = False
            t = 0

            while not done:

                # choose a_t using epsilon-greedy policy
                action = self.agent.act(state, eps)

                # take action a_t, observe r_{t+1} and s_{t+1}
                env_info = self.env.step(action)[
                    self.brain_name]  # send the action to the environment
                next_state = env_info.vector_observations[
                    0]  # get the next state
                reward = env_info.rewards[0]  # get the reward
                done = env_info.local_done[0]  # see if episode has finished

                # Memorize new sample, replay, update target network
                self.agent.do_stuff(state, action, reward, next_state, done, t)

                state = next_state
                score += reward
                t += 1

            print("\r -> Episode: {}/{}, score: {}, e: {:.2}".format(
                e + 1, self.num_episodes, score, eps),
                  end='')
            history.append(score)

            if (e + 1) % 100 == 0 or e + 1 == self.num_episodes:
                self.agent.save()

        const.myprint('History:', history)
        utils_plot.plot_history_rolling_mean(history, fp=self.image_path)

        if with_close:
            self.env.close()

        return history
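
The loop above relies on a _get_glie helper (GLIE-style epsilon decay) whose implementation is not shown in this example. A plausible sketch follows; the start, decay, and minimum values and their names are assumptions for illustration only.

    # Hypothetical sketch of the epsilon-decay helper referenced above; the
    # actual implementation and constants are not part of this example.
    def _get_glie(self, eps, eps_start=1.0, eps_decay=0.995, eps_min=0.01):
        if eps is None:
            return eps_start                  # first episode: fully exploratory
        return max(eps_min, eps * eps_decay)  # multiplicative decay with a floor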
Example #4
    def save(self):
        const.myprint('Saving model to:', self.model_path)
        # save the weights and optimizer states together with the architecture hyper-parameters
        checkpoint = {
            'num_states': self.num_states,
            'num_actions': self.num_actions,
            'gamma': self.gamma,
            'num_fc_actor': self.num_fc_actor,
            'num_fc_critic': self.num_fc_critic,
            'learning_rate': self.model_learning_rate,
            'critic': self.policy.critic.state_dict(),
            'critic_optimizer': self.policy.critic_optimizer.state_dict(),
            'actor': self.policy.actor.state_dict(),
            'actor_optimizer': self.policy.actor_optimizer.state_dict()
        }
        torch.save(checkpoint, self.model_path)
Example #5
    def load(self):
        const.myprint('Loading model from:', self.model_path)
        # load with architecture
        checkpoint = torch.load(self.model_path)
        self.policy = models.TD3(state_dim=checkpoint['num_states'],
                                 action_dim=checkpoint['num_actions'],
                                 max_action=const.max_action,
                                 discount=checkpoint['gamma'],
                                 num_fc_actor=checkpoint['num_fc_actor'],
                                 num_fc_critic=checkpoint['num_fc_critic'],
                                 learning_rate=checkpoint['learning_rate'])
        self.policy.critic.load_state_dict(checkpoint['critic'])
        self.policy.critic_optimizer.load_state_dict(
            checkpoint['critic_optimizer'])
        self.policy.actor.load_state_dict(checkpoint['actor'])
        self.policy.actor_optimizer.load_state_dict(
            checkpoint['actor_optimizer'])

        # switch to eval mode (inference only)
        self.policy.actor.eval()
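
Once loaded, the actor can be queried for deterministic actions at inference time. The sketch below assumes the TD3 actor maps a batched state tensor to an action tensor, which is typical for TD3 implementations but is not confirmed by this snippet.

    # Sketch only: act with the loaded policy; assumes the actor takes a
    # (1, num_states) float tensor and returns a (1, num_actions) tensor.
    def act(self, state):
        state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            action = self.policy.actor(state_t)
        return action.squeeze(0).cpu().numpy()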
Example #6
    def load(self):
        const.myprint('Loading model from:', self.model_path)
        # load with architecture
        checkpoint = torch.load(self.model_path)
        self.actor = models.Actor(state_dim=checkpoint['num_states'],
                                  action_dim=checkpoint['num_actions'],
                                  num_fc_1=checkpoint['num_fc_1'],
                                  num_fc_2=checkpoint['num_fc_2'],
                                  with_bn=const.with_bn)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic = models.Critic(state_dim=checkpoint['num_states'],
                                    action_dim=checkpoint['num_actions'],
                                    num_fc_1=checkpoint['num_fc_1'],
                                    num_fc_2=checkpoint['num_fc_2'],
                                    with_bn=const.with_bn)
        self.critic.load_state_dict(checkpoint['critic'])

        # switch to eval mode (inference only)
        self.actor.eval()
        self.critic.eval()
Example #7
    def _model_summary(self, model, title='Model'):
        const.myprint("model_summary --> " + title)
        const.myprint()
        const.myprint("Layer_name" + "\t" * 7 + "Number of Parameters")
        const.myprint("=" * 100)
        model_parameters = [
            layer for layer in model.parameters() if layer.requires_grad
        ]
        layer_names = [child for child in model.children()]
        j = 0
        total_params = 0
        for layer in layer_names:
            # skip children that own no parameters (e.g. activation layers)
            if sum(1 for _ in layer.parameters()) == 0:
                continue
            # a layer with a bias owns two parameter tensors (weight + bias),
            # otherwise only one (weight)
            has_bias = getattr(layer, 'bias', None) is not None
            if has_bias:
                param = model_parameters[j].numel() + model_parameters[
                    j + 1].numel()
                j = j + 2
            else:
                param = model_parameters[j].numel()
                j = j + 1
            const.myprint(str(layer) + "\t" * 3 + str(param))
            total_params += param
        const.myprint("=" * 100)
        const.myprint(f"Total Params:{total_params}")
Example #8
    def train(self, with_close=True):
        print('Training ...')

        history = []
        rolling_window = deque(maxlen=const.rolling_mean_N)
        solved = False
        best_score = 0.
        best_e = 0
        best_found = False

        for e in range(self.num_episodes):
            env_info = self.env.reset(
                train_mode=True)[self.brain_name]  # reset the environment
            states = env_info.vector_observations  # get the current state (s_t)
            scores = np.zeros(const.num_agents)  # initialize the score
            self.agent_1.reset()
            self.agent_2.reset()

            while True:
                # choose a_t for each agent from the joint (flattened) observation
                states_input = states.flatten()
                a_1 = self.agent_1.act(states_input)
                a_2 = self.agent_2.act(states_input)
                actions_input = np.array([a_1, a_2]).flatten()

                # take action a_t, observe r_{t+1} and s_{t+1}
                env_info = self.env.step(actions_input)[
                    self.brain_name]  # send the action to the environment
                next_states = env_info.vector_observations  # get the next state
                rewards = env_info.rewards  # get the reward
                dones = env_info.local_done  # see if episode has finished

                # Memorize new sample, replay, update target network
                next_states_input = next_states.flatten()
                self.agent_1.do_stuff(states_input, actions_input, rewards[0],
                                      next_states_input, dones[0], 0)
                self.agent_2.do_stuff(states_input, actions_input, rewards[1],
                                      next_states_input, dones[1], 1)

                states = next_states
                scores += rewards

                if np.any(dones):
                    break

            score = np.max(
                scores)  # max of scores over all agents for this episode
            rolling_window.append(score)
            history.append(score)
            curr_best_score = np.mean(rolling_window)
            print("\r -> Episode: {}/{}, score: {:.3f}, avg_score: {:.3f}".
                  format(e + 1, self.num_episodes, score, curr_best_score),
                  end='')

            # if (e + 1) % 100 == 0 or e + 1 == self.num_episodes:
            if curr_best_score > best_score or (e + 1 == self.num_episodes
                                                and not best_found):
                best_score = curr_best_score
                best_e = e + 1
                best_found = True
                self.agent_1.save()
                self.agent_2.save()

            if np.mean(rolling_window) >= const.high_score and not solved:
                print(
                    '\nEnv solved in {:d} episodes, avg_score: {:.3f}'.format(
                        e + 1, np.mean(rolling_window)))
                solved = True

        # plot scores
        const.myprint('History:', history)
        utils_plot.plot_history_rolling_mean(history, fp=self.image_path)

        if with_close:
            self.env.close()

        return history, best_e, best_score
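
For reference, the solved criterion above is the mean of the last const.rolling_mean_N episode scores. The same quantity can be recomputed from the returned history with a sketch like the one below (numpy is assumed to be imported as np, as in the snippet; the helper name is illustrative).

# Sketch only: rolling average over the last `window` scores, mirroring the
# deque-based criterion used in the training loop above.
def rolling_mean(history, window):
    return [np.mean(history[max(0, i + 1 - window):i + 1])
            for i in range(len(history))]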
Example #9
    def save(self):
        const.myprint('Saving model to:', self.model_path)
        self.model.save_weights(str(self.model_path))
Example #10
    def load(self):
        const.myprint('Loading model from:', self.model_path)
        self.model.load_weights(str(self.model_path))
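
Unlike the PyTorch examples above, these last two snippets persist only the weights, so the same architecture has to be rebuilt before load_weights is called. A minimal round-trip sketch follows; build_model and the file name are hypothetical placeholders.

# Sketch only: weights-only round trip with a Keras-style model; build_model
# must recreate the exact same architecture (build_model is hypothetical).
model = build_model()
model.save_weights('model.h5')

restored = build_model()
restored.load_weights('model.h5')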