def _build_model(self):
    # Neural Net for Deep-Q learning Model
    self.fc1 = nn.Linear(self.num_inputs, self.fc1_num)
    self.fc2 = nn.Linear(self.fc1_num, self.fc2_num)
    self.fc3 = nn.Linear(self.fc2_num, self.num_outputs)
    self.model = nn.Sequential(self.fc1, nn.ELU(), self.fc2, nn.ELU(), self.fc3)
    const.myprint(self.model)
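# Hedged usage sketch (not from the repo): how the Q-network built above could be
# queried for a greedy action during evaluation. The helper name `greedy_action` is
# hypothetical; it only assumes the model maps a state batch to a batch of Q-values.
import numpy as np
import torch


def greedy_action(model, state):
    # no-grad forward pass; the argmax over the output Q-values is the greedy action
    state_t = torch.from_numpy(np.asarray(state, dtype=np.float32)).unsqueeze(0)
    with torch.no_grad():
        q_values = model(state_t)
    return int(torch.argmax(q_values, dim=1).item())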
def save(self):
    const.myprint('Saving model to:', self.model_path)
    # save with architecture
    checkpoint = {
        'num_states': self.num_states,
        'num_actions': self.num_actions,
        'num_fc_1': self.num_fc_1,
        'num_fc_2': self.num_fc_2,
        'actor': self.actor.state_dict(),
        'critic': self.critic.state_dict()
    }
    torch.save(checkpoint, self.model_path)
def train(self, with_close=True):
    history = []
    eps = None
    for e in range(self.num_episodes):
        env_info = self.env.reset(
            train_mode=True)[self.brain_name]  # reset the environment
        state = env_info.vector_observations[0]  # get the current state (s_t)
        score = 0  # initialize the score
        eps = self._get_glie(eps)  # decay epsilon
        done = False
        t = 0
        while not done:
            # choose a_t using epsilon-greedy policy
            action = self.agent.act(state, eps)
            # take action a_t, observe r_{t+1} and s_{t+1}
            env_info = self.env.step(action)[
                self.brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]  # see if episode has finished
            # Memorize new sample, replay, update target network
            self.agent.do_stuff(state, action, reward, next_state, done, t)
            state = next_state
            score += reward
            t += 1
        print("\r -> Episode: {}/{}, score: {}, e: {:.2}".format(
            e + 1, self.num_episodes, score, eps), end='')
        history.append(score)
        if (e + 1) % 100 == 0 or e + 1 == self.num_episodes:
            self.agent.save()
    const.myprint('History:', history)
    utils_plot.plot_history_rolling_mean(history, fp=self.image_path)
    if with_close:
        self.env.close()
    return history
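# Hedged sketch (assumption, not the repo's code): one plausible shape for the
# `_get_glie` epsilon schedule called above. The constants eps_start, eps_end and
# eps_decay are hypothetical; the actual decay used by the trainer may differ.
def _get_glie(self, eps, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    # first episode: start fully exploratory; afterwards decay multiplicatively
    # towards a small floor so the policy becomes greedy in the limit (GLIE)
    if eps is None:
        return eps_start
    return max(eps_end, eps * eps_decay)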
def save(self):
    const.myprint('Saving model to:', self.model_path)
    # save with architecture
    checkpoint = {
        'num_states': self.num_states,
        'num_actions': self.num_actions,
        'gamma': self.gamma,
        'num_fc_actor': self.num_fc_actor,
        'num_fc_critic': self.num_fc_critic,
        'learning_rate': self.model_learning_rate,
        'critic': self.policy.critic.state_dict(),
        'critic_optimizer': self.policy.critic_optimizer.state_dict(),
        'actor': self.policy.actor.state_dict(),
        'actor_optimizer': self.policy.actor_optimizer.state_dict()
    }
    torch.save(checkpoint, self.model_path)
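# Hedged sketch: inspecting a TD3 checkpoint written by save() above, outside the
# trainer. The path 'checkpoint.pth' is a placeholder; the keys mirror the dict
# built in save().
import torch

checkpoint = torch.load('checkpoint.pth', map_location='cpu')
print(checkpoint['num_states'], checkpoint['num_actions'], checkpoint['gamma'])
print(sorted(checkpoint.keys()))  # architecture hyper-parameters plus the four state_dicts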
def load(self):
    const.myprint('Loading model from:', self.model_path)
    # load with architecture
    checkpoint = torch.load(self.model_path)
    self.policy = models.TD3(state_dim=checkpoint['num_states'],
                             action_dim=checkpoint['num_actions'],
                             max_action=const.max_action,
                             discount=checkpoint['gamma'],
                             num_fc_actor=checkpoint['num_fc_actor'],
                             num_fc_critic=checkpoint['num_fc_critic'],
                             learning_rate=checkpoint['learning_rate'])
    self.policy.critic.load_state_dict(checkpoint['critic'])
    self.policy.critic_optimizer.load_state_dict(checkpoint['critic_optimizer'])
    self.policy.actor.load_state_dict(checkpoint['actor'])
    self.policy.actor_optimizer.load_state_dict(checkpoint['actor_optimizer'])
    # change mode (to use only for inference)
    self.policy.actor.eval()
def load(self):
    const.myprint('Loading model from:', self.model_path)
    # load with architecture
    checkpoint = torch.load(self.model_path)
    self.actor = models.Actor(state_dim=checkpoint['num_states'],
                              action_dim=checkpoint['num_actions'],
                              num_fc_1=checkpoint['num_fc_1'],
                              num_fc_2=checkpoint['num_fc_2'],
                              with_bn=const.with_bn)
    self.actor.load_state_dict(checkpoint['actor'])
    self.critic = models.Critic(state_dim=checkpoint['num_states'],
                                action_dim=checkpoint['num_actions'],
                                num_fc_1=checkpoint['num_fc_1'],
                                num_fc_2=checkpoint['num_fc_2'],
                                with_bn=const.with_bn)
    self.critic.load_state_dict(checkpoint['critic'])
    # change mode (to use only for inference)
    self.actor.eval()
    self.critic.eval()
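# Hedged inference sketch: querying the actor restored by load() above. It assumes
# the actor's forward() takes a state batch and returns an action batch; the helper
# name `infer_action` is hypothetical.
import numpy as np
import torch


def infer_action(actor, state):
    # no-grad forward pass through the eval-mode actor
    state_t = torch.from_numpy(np.asarray(state, dtype=np.float32)).unsqueeze(0)
    with torch.no_grad():
        action = actor(state_t)
    return action.squeeze(0).cpu().numpy()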
def _model_summary(self, model, title='Model'):
    const.myprint("model_summary --> " + title)
    const.myprint()
    const.myprint("Layer_name" + "\t" * 7 + "Number of Parameters")
    const.myprint("=" * 100)
    model_parameters = [
        layer for layer in model.parameters() if layer.requires_grad
    ]
    layer_name = [child for child in model.children()]
    j = 0
    total_params = 0
    for i in layer_name:
        param = 0
        # skip children without trainable parameters (e.g. activation layers)
        if not any(p.requires_grad for p in i.parameters()):
            const.myprint(str(i) + "\t" * 3 + str(param))
            continue
        try:
            bias = (i.bias is not None)
        except AttributeError:
            bias = False
        if bias:
            # weight and bias tensors
            param = model_parameters[j].numel() + model_parameters[j + 1].numel()
            j = j + 2
        else:
            # weight tensor only
            param = model_parameters[j].numel()
            j = j + 1
        const.myprint(str(i) + "\t" * 3 + str(param))
        total_params += param
    const.myprint("=" * 100)
    const.myprint(f"Total Params:{total_params}")
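# Hedged sanity check (plain print instead of const.myprint is an assumption): the
# per-layer counts reported by _model_summary should add up to torch's own total.
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 16), nn.ELU(), nn.Linear(16, 4))
total = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Total Params:', total)  # (8*16 + 16) + (16*4 + 4) = 212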
def train(self, with_close=True):
    print('Training ...')
    history = []
    rolling_window = deque(maxlen=const.rolling_mean_N)
    solved = False
    best_score = 0.
    best_e = 0
    best_found = False
    for e in range(self.num_episodes):
        env_info = self.env.reset(
            train_mode=True)[self.brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (s_t)
        scores = np.zeros(const.num_agents)  # initialize the score
        self.agent_1.reset()
        self.agent_2.reset()
        while True:
            # choose a_t with the current policy; each agent sees the joint observation
            states_input = states.flatten()
            a_1 = self.agent_1.act(states_input)
            a_2 = self.agent_2.act(states_input)
            actions_input = np.array([a_1, a_2]).flatten()
            # take action a_t, observe r_{t+1} and s_{t+1}
            env_info = self.env.step(actions_input)[
                self.brain_name]  # send the action to the environment
            next_states = env_info.vector_observations  # get the next state
            rewards = env_info.rewards  # get the reward
            dones = env_info.local_done  # see if episode has finished
            # Memorize new sample, replay, update target network
            next_states_input = next_states.flatten()
            self.agent_1.do_stuff(states_input, actions_input, rewards[0],
                                  next_states_input, dones[0], 0)
            self.agent_2.do_stuff(states_input, actions_input, rewards[1],
                                  next_states_input, dones[1], 1)
            states = next_states
            scores += rewards
            if np.any(dones):
                break
        score = np.max(scores)  # max of scores over all agents for this episode
        rolling_window.append(score)
        history.append(score)
        curr_best_score = np.mean(rolling_window)
        print("\r -> Episode: {}/{}, score: {:.3f}, avg_score: {:.3f}".format(
            e + 1, self.num_episodes, score, curr_best_score), end='')
        # if (e + 1) % 100 == 0 or e + 1 == self.num_episodes:
        if curr_best_score > best_score or (e + 1 == self.num_episodes and not best_found):
            best_score = curr_best_score
            best_e = e + 1
            best_found = True
            self.agent_1.save()
            self.agent_2.save()
        if np.mean(rolling_window) >= const.high_score and not solved:
            print('\nEnv solved in {:d} episodes, avg_score: {:.3f}'.format(
                e + 1, np.mean(rolling_window)))
            solved = True
    # plot scores
    const.myprint('History:', history)
    utils_plot.plot_history_rolling_mean(history, fp=self.image_path)
    if with_close:
        self.env.close()
    return history, best_e, best_score
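# Hedged sketch: recomputing the rolling average that drives the "solved" check in
# train() above, offline from the returned history. The 100-episode window and the
# 0.5 threshold are assumptions for const.rolling_mean_N and const.high_score.
from collections import deque

import numpy as np


def rolling_means(history, window=100):
    buf, means = deque(maxlen=window), []
    for score in history:
        buf.append(score)
        means.append(np.mean(buf))
    return means


dummy_history = [0.01 * i for i in range(200)]  # placeholder per-episode scores
solved_at = next((i + 1 for i, m in enumerate(rolling_means(dummy_history)) if m >= 0.5), None)
print('Solved at episode:', solved_at)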
def save(self):
    const.myprint('Saving model to:', self.model_path)
    self.model.save_weights(str(self.model_path))
def load(self):
    const.myprint('Loading model from:', self.model_path)
    self.model.load_weights(str(self.model_path))
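# Hedged round-trip sketch with a small Keras model, since save() and load() above
# just delegate to save_weights()/load_weights(). The tiny architecture and the
# 'weights.h5' path are placeholders.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])
model.save_weights('weights.h5')   # mirrors self.model.save_weights(str(self.model_path))
model.load_weights('weights.h5')   # mirrors self.model.load_weights(str(self.model_path))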