Example #1
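 # NOTE: this snippet assumes the enclosing module already imports copy, datetime,
 # json, os, pickle, shutil, time, numpy as np, torch, and the project's
 # ReplayMemory, StateBuffer and tensorboardX SummaryWriter helpers (not shown here).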
 def train(self, num_run=1, restore=False):
     memory = None
     start_episode = 0
     start_updates = 0
     start_run = 0
     start_total_numsteps = 0
     start_running_episode_reward = 0
     start_running_episode_reward_100 = 0
     start_rewards = []
     start_last_episode_steps = 0
     start_episode_reward = 0
     start_episode_steps = 0
     start_timing = 0
     start_total_timing = 0
     
     # Restore Phase
     if restore:
         # TODO: Not tested deeply yet
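         # Reload the replay buffer, training counters and reward history saved at the last checkpoint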
         with open(self.folder + "memory.pkl", "rb") as pickle_in:
             memory = ReplayMemory(self.replay_size, self.seed)
             memory.load(pickle_in)
         with open(self.folder + "context.json", "r") as context_in:
             (start_episode, start_run, start_updates, start_total_numsteps, start_running_episode_reward,
              start_running_episode_reward_100, start_last_episode_steps, start_episode_reward, start_episode_steps,
              start_timing, start_total_timing) = json.load(context_in)
         with open(self.folder + "rewards.pkl", "rb") as pickle_in:
             start_rewards = pickle.load(pickle_in)
         self.restore_model()
         self.logger.important("Load completed!")
     
     # Wall-clock reference used to compute the total training time across episodes
     in_ts = time.time()
     
     # Start of the iteration on runs
     for i_run in range(start_run, num_run):
         
         # Break out of the loop if the "Save'n'Close" phase has been triggered
         if self.env.is_save_and_close():
             break
         
         self.logger.important(f"START TRAINING RUN {i_run}")
         
         # Set Seed for repeatability
         torch.manual_seed(self.seed + i_run)
         np.random.seed(self.seed + i_run)
         self.env.seed(self.seed + i_run)
         self.env.action_space.np_random.seed(self.seed + i_run)
         
         # Set up the tensorboardX summary writers
         writer_train = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/train')
         writer_learn = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/learn')
         writer_test = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/test')
         
         # Set up the replay memory: create a new one unless we are restoring from a checkpoint
         if not restore:
             memory = ReplayMemory(self.replay_size, self.seed)
         # Create a backup memory for Forget-Phase
         backup_memory = copy.deepcopy(memory)
         
         # TRAINING LOOP
         # All these variables must be backed up and restored
         updates = start_updates
         total_numsteps = start_total_numsteps
         running_episode_reward = start_running_episode_reward
         running_episode_reward_100 = start_running_episode_reward_100
         rewards = start_rewards
         i_episode = start_episode
         last_episode_steps = start_last_episode_steps
         episode_reward = start_episode_reward
         episode_steps = start_episode_steps
         timing = start_timing
         total_timing = start_total_timing
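         # Per-episode bookkeeping (reset at every run, not restored from a checkpoint)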
         updates_episode = 0
         episode_images = list()
         
         '''
             LOOP: Episode
         '''
         while True:
             
             # Stop the robot
             self.env.stop_all_motors()
             
             # Wait for the human operator to release control
             while self.env.is_human_controlled():
                 pass
             
             # Forget phase: discard the last episode if requested by the environment
             if self.env.is_forget_enabled():
                 # print('forget')
                 i_episode -= 1
                 print("Replay size before forget: {}".format(len(memory)))
                 # Restore Nets
                 self.restore_model()
                 self.env.reset_forget()
                 # Restore Memory
                 memory = copy.deepcopy(backup_memory)
                 print("Replay size after restoring the backup: {}".format(len(memory)))
                 # memory.forget_last(last_episode_steps)
                 self.logger.info("Last Episode Forgotten")
             elif i_episode != start_episode:
                 # LEARNING AND PRINTING PHASE
                 ep_print = i_episode - 1
                 last_episode_steps = episode_steps
                 if self.pics:
                     for i, image in enumerate(episode_images):
                         writer_train.add_image('episode_{}'
                                                .format(str(ep_print)), image.unsqueeze(0),
                                                i)
                 
                 if len(memory) > self.min_replay_size and ep_print > self.warm_up_episodes:
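                     # Number of gradient updates: the episode length rounded down to a multiple of 10, plus 10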
                     updates = self.learning_phase((last_episode_steps // 10) * 10 + 10, memory, updates,
                                                   writer_learn)
                 self.print_nets(writer_train, ep_print)
                 rewards.append(episode_reward)
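                 # Incrementally update the running mean of episode rewards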
                 running_episode_reward += (episode_reward - running_episode_reward) / (ep_print + 1)
                 if len(rewards) < 100:
                     running_episode_reward_100 = running_episode_reward
                 else:
                     last_100 = rewards[-100:]
                     running_episode_reward_100 = np.array(last_100).mean()
                 
                 writer_train.add_scalar('reward/train', episode_reward, ep_print)
                 writer_train.add_scalar('reward/steps', last_episode_steps, ep_print)
                 writer_train.add_scalar('reward/running_mean', running_episode_reward, ep_print)
                 writer_train.add_scalar('reward/running_mean_last_100', running_episode_reward_100, ep_print)
                 self.logger.info("Ep. {}/{}, t {}, r_t {}, 100_mean {}, time_spent {}s | {}s "
                                  .format(ep_print, self.num_episode, episode_steps, round(episode_reward, 2),
                                          round(running_episode_reward_100, 2), round(timing, 2),
                                          str(datetime.timedelta(seconds=total_timing))))
             
             # Safety barrier: block while a human is in control (useful during long training sessions)
             while self.env.is_human_controlled():
                 pass
             
             # Test phase (if it is due this episode)
             if i_episode % self.eval_every == 0 and self.eval and i_episode != 0 and not restore:
                 # print('test')
                 self.test_phase(writer_test, i_run, updates)
                 # Wait for the human operator to release control
                 while self.env.is_human_controlled():
                     pass
             
             # TODO: checkpoint the hyperparameters and verify that checkpoint restoring works correctly
             if i_episode % self.eval_every == 0 and i_episode != 0 and not restore:
                 self.logger.important("Saving context...")
                 self.logger.info("To restart from here set this flag: --restore " + self.folder)
                 # Save Replay, net weights, hp, i_episode and i_run
                 with open(self.folder + "memory.pkl", "wb") as pickle_out:
                     memory.dump(pickle_out)
                 with open(self.folder + "context.json", "w+") as pickle_out:
                     json.dump((i_episode, i_run, updates, total_numsteps, running_episode_reward,
                                running_episode_reward_100, last_episode_steps, episode_reward, episode_steps,
                                timing, total_timing), pickle_out)
                 with open(self.folder + "rewards.pkl", "wb") as pickle_out:
                     pickle.dump(rewards, pickle_out)
                 self.backup_model()
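                 # Mirror the run folder into a "_bak" sibling directory as a secondary backup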
                 if os.path.exists(self.folder[:-1] + "_bak" + self.folder[-1:]):
                     shutil.rmtree(self.folder[:-1] + "_bak" + self.folder[-1:])
                 print(self.folder[:-1] + "_bak" + self.folder[-1:])
                 shutil.copytree(self.folder, self.folder[:-1] + "_bak" + self.folder[-1:])
                 self.logger.important("Save completed!")
             
             # Episode limit for this run reached: move on to the next run
             if i_episode > self.num_episode:
                 break
             
             # Backup NNs and memory (useful in case of Forget Phase)
             self.backup_model()
             backup_memory = copy.deepcopy(memory)
             
             # Setup the episode
             self.logger.important(f"START EPISODE {i_episode}")
             ts = time.time()
             episode_reward = episode_steps = 0
             done = False
             info = {'undo': False}
             state = self.env.reset()
             state_buffer = None
             
             # When CNN inputs are used, a StateBuffer is enabled to build the state (see docs)
             if self.pics:
                 state_buffer = StateBuffer(self.state_buffer_size, state)
                 state = state_buffer.get_state()
                 episode_images = list()
             updates_episode = 0
             
             # Start of the episode
             while not done:
                 if self.pics:
                     episode_images.append(state_buffer.get_tensor()[0])
                 
                 if i_episode < self.warm_up_episodes or len(memory) < self.min_replay_size:
                     # Warm-up phase -> sample a completely random action
                     action = self.env.action_space.sample()
                 else:
                     # Training phase -> Action sampled from policy
                     action = self.select_action(state)
                 
                 assert action is not None
                 assert action.shape == self.env.action_space.shape
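                 # Log the per-step action components (speed and turn) to TensorBoard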
                 writer_train.add_histogram('action_speed/episode_{}'
                                            .format(str(i_episode)), torch.tensor(action[0]), episode_steps)
                 writer_train.add_histogram('action_turn/episode_{}'
                                            .format(str(i_episode)), torch.tensor(action[1]), episode_steps)
                 
                 # Execute the action in the environment
                 next_state, reward, done, info = self.env.step(action)
                 
                 # Save the step
                 if self.pics:
                     state_buffer.push(next_state)
                     next_state = state_buffer.get_state()
                 episode_steps += 1
                 total_numsteps += 1
                 episode_reward += reward
                 # Mask for the bootstrap target: 0 on terminal transitions, 1 otherwise
                 mask = float(not done)
                 
                 # Push the transition into memory only after the first 5 steps of the episode
                 # print('push')
                 if episode_steps > 5:
                     memory.push(state, action, reward, next_state, mask)
                 state = next_state
             print("Memory {}/{}".format(len(memory), self.replay_size))
             timing = time.time() - ts
             total_timing = time.time() - in_ts
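             # Reset the restored episode offset so subsequent runs start counting from episode 0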
             start_episode = 0
             i_episode += 1
             # Disable restore phase after the restored run
             restore = False