def test_phase(self, i_run, i_episode, writer_test):
    # Run `eval_episode` evaluation episodes with the deterministic policy,
    # log the mean reward to TensorBoard and save a checkpoint of the model.
    total_reward = 0
    ts = time.time()
    for i in range(self.eval_episode):
        old = self.env.reset()
        state_buffer = StateBuffer(self.state_buffer_size, old)
        episode_reward = 0
        done = False
        while not done:
            state = state_buffer.get_state()
            action = self.select_action(state, eval=True)
            next_state, reward, done, _ = self.env.step(action)
            episode_reward += reward
            state_buffer.push(next_state)
        total_reward += episode_reward
    writer_test.add_scalar('reward/test', total_reward / self.eval_episode, i_episode)
    self.logger.info("----------------------------------------")
    self.logger.info(
        f"Test {self.eval_episode} ep.: {i_episode}, mean_r: {round(total_reward / self.eval_episode, 2)}"
        f", time_spent {round(time.time() - ts, 2)}s")
    self.save_model(self.env_name, "./runs/" + self.folder + f"run_{i_run}/", i_episode)
    self.logger.info('Saving models...')
    self.logger.info("----------------------------------------")

def do_one_test(self):
    old = self.env.reset()
    state_buffer = StateBuffer(self.state_buffer_size, old)
    episode_reward = 0
    done = False
    while not done:
        state = state_buffer.get_state()
        action = self.select_action(state, eval=True)
        next_state, reward, done, _ = self.env.step(action)
        episode_reward += reward
        state_buffer.push(next_state)
    return episode_reward

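# Minimal sketch of the StateBuffer used by test_phase()/do_one_test() above
# (constructor, push, get_state, get_tensor). This is an illustrative
# assumption, not the repository's actual implementation: it simply keeps the
# last `size` observations and exposes them as a stacked state.
from collections import deque

import numpy as np
import torch


class StateBuffer:
    """FIFO stack of the most recent `size` observations."""

    def __init__(self, size, initial_obs):
        # Pre-fill with the first observation so get_state() always returns
        # a full stack, even on the very first step of an episode.
        self.frames = deque([initial_obs] * size, maxlen=size)

    def push(self, obs):
        # Append the newest observation, discarding the oldest one.
        self.frames.append(obs)

    def get_state(self):
        # Stack frames along a new leading axis: shape (size, *obs.shape).
        return np.stack(self.frames, axis=0)

    def get_tensor(self):
        # Same stacked state as a float tensor (e.g. for TensorBoard logging).
        return torch.as_tensor(self.get_state(), dtype=torch.float32)
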
def train(self, num_run=1, restore=False):
    memory = None
    start_episode = 0
    start_updates = 0
    start_run = 0
    start_total_numsteps = 0
    start_running_episode_reward = 0
    start_running_episode_reward_100 = 0
    start_rewards = []
    start_last_episode_steps = 0
    start_episode_reward = 0
    start_episode_steps = 0
    start_timing = 0
    start_total_timing = 0

    # Restore Phase
    if restore:
        # TODO: Not tested deeply yet
        with open(self.folder + "memory.pkl", "rb") as pickle_out:
            memory = ReplayMemory(self.replay_size, self.seed)
            memory.load(pickle_out)
        with open(self.folder + "context.json", "r+") as pickle_out:
            (start_episode, start_run, start_updates, start_total_numsteps,
             start_running_episode_reward, start_running_episode_reward_100,
             start_last_episode_steps, start_episode_reward, start_episode_steps,
             start_timing, start_total_timing) = json.load(pickle_out)
        with open(self.folder + "rewards.pkl", "rb") as pickle_out:
            start_rewards = pickle.load(pickle_out)
        self.restore_model()
        self.logger.important("Load completed!")

    in_ts = time.time()
    # Start of the iteration on runs
    for i_run in range(start_run, num_run):
        # Break the loop if the "Save'n'Close" phase is triggered
        if self.env.is_save_and_close():
            break
        self.logger.important(f"START TRAINING RUN {i_run}")

        # Set seed for repeatability
        torch.manual_seed(self.seed + i_run)
        np.random.seed(self.seed + i_run)
        self.env.seed(self.seed + i_run)
        self.env.action_space.np_random.seed(self.seed + i_run)

        # Setup TensorboardX
        writer_train = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/train')
        writer_learn = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/learn')
        writer_test = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/test')

        # Setup Replay Memory: create a new memory only if this is not the restore case
        if not restore:
            memory = ReplayMemory(self.replay_size, self.seed)
        # Create a backup memory for the Forget Phase
        backup_memory = copy.deepcopy(memory)

        # TRAINING LOOP
        # All these variables must be backed up and restored
        updates = start_updates
        total_numsteps = start_total_numsteps
        running_episode_reward = start_running_episode_reward
        running_episode_reward_100 = start_running_episode_reward_100
        rewards = start_rewards
        i_episode = start_episode
        last_episode_steps = start_last_episode_steps
        episode_reward = start_episode_reward
        episode_steps = start_episode_steps
        timing = start_timing
        total_timing = start_total_timing
        updates_episode = 0
        episode_images = list()

        # LOOP: Episode
        while True:
            # Stop the robot
            self.env.stop_all_motors()
            # Wait for the human to leave the command
            while self.env.is_human_controlled():
                pass

            # Let's forget (if it is the case)
            if self.env.is_forget_enabled():
                # print('forget')
                i_episode -= 1
                print(len(memory))
                # Restore nets
                self.restore_model()
                self.env.reset_forget()
                # Restore memory
                memory = copy.deepcopy(backup_memory)
                print(len(memory))
                # memory.forget_last(last_episode_steps)
                self.logger.info("Last Episode Forgotten")
            elif i_episode != start_episode:
                # LEARNING AND PRINTING PHASE
                ep_print = i_episode - 1
                last_episode_steps = episode_steps
                if self.pics:
                    for i, image in enumerate(episode_images):
                        writer_train.add_image('episode_{}'.format(str(ep_print)),
                                               image.unsqueeze(0), i)
                if len(memory) > self.min_replay_size and ep_print > self.warm_up_episodes:
                    updates = self.learning_phase((last_episode_steps // 10) * 10 + 10,
                                                  memory, updates, writer_learn)
                self.print_nets(writer_train, ep_print)
                rewards.append(episode_reward)
                running_episode_reward += (episode_reward - running_episode_reward) / (ep_print + 1)
                if len(rewards) < 100:
                    running_episode_reward_100 = running_episode_reward
                else:
                    last_100 = rewards[-100:]
                    running_episode_reward_100 = np.array(last_100).mean()
                writer_train.add_scalar('reward/train', episode_reward, ep_print)
                writer_train.add_scalar('reward/steps', last_episode_steps, ep_print)
                writer_train.add_scalar('reward/running_mean', running_episode_reward, ep_print)
                writer_train.add_scalar('reward/running_mean_last_100', running_episode_reward_100, ep_print)
                self.logger.info("Ep. {}/{}, t {}, r_t {}, 100_mean {}, time_spent {}s | {}s "
                                 .format(ep_print, self.num_episode, episode_steps,
                                         round(episode_reward, 2),
                                         round(running_episode_reward_100, 2),
                                         round(timing, 2),
                                         str(datetime.timedelta(seconds=total_timing))))

            # Security wall, useful for longer training phases
            while self.env.is_human_controlled():
                pass

            # Let's test (if it is the case)
            if i_episode % self.eval_every == 0 and self.eval and i_episode != 0 and not restore:
                # print('test')
                self.test_phase(writer_test, i_run, updates)
                # Wait for the human to leave the command
                while self.env.is_human_controlled():
                    pass

            # TODO: HP Checkpoint and check correctness of checkpoint restoring
            if i_episode % self.eval_every == 0 and i_episode != 0 and not restore:
                self.logger.important("Saving context...")
                self.logger.info("To restart from here set this flag: --restore " + self.folder)
                # Save replay memory, net weights, hyperparameters, i_episode and i_run
                with open(self.folder + "memory.pkl", "wb") as pickle_out:
                    memory.dump(pickle_out)
                with open(self.folder + "context.json", "w+") as pickle_out:
                    json.dump((i_episode, i_run, updates, total_numsteps,
                               running_episode_reward, running_episode_reward_100,
                               last_episode_steps, episode_reward, episode_steps,
                               timing, total_timing), pickle_out)
                with open(self.folder + "rewards.pkl", "wb") as pickle_out:
                    pickle.dump(rewards, pickle_out)
                self.backup_model()
                if os.path.exists(self.folder[:-1] + "_bak" + self.folder[-1:]):
                    shutil.rmtree(self.folder[:-1] + "_bak" + self.folder[-1:])
                print(self.folder[:-1] + "_bak" + self.folder[-1:])
                shutil.copytree(self.folder, self.folder[:-1] + "_bak" + self.folder[-1:])
                self.logger.important("Save completed!")

            # Episode limit per run reached: let's start a new RUN
            if i_episode > self.num_episode:
                break

            # Backup NNs and memory (useful in case of a Forget Phase)
            self.backup_model()
            backup_memory = copy.deepcopy(memory)

            # Setup the episode
            self.logger.important(f"START EPISODE {i_episode}")
            ts = time.time()
            episode_reward = episode_steps = 0
            done = False
            info = {'undo': False}
            state = self.env.reset()
            state_buffer = None
            # If you use CNNs, the use of StateBuffer is enabled (see doc).
            if self.pics:
                state_buffer = StateBuffer(self.state_buffer_size, state)
                state = state_buffer.get_state()
            episode_images = list()
            updates_episode = 0

            # Start of the episode
            while not done:
                if self.pics:
                    episode_images.append(state_buffer.get_tensor()[0])
                if i_episode < self.warm_up_episodes or len(memory) < self.min_replay_size:
                    # Warm-up phase -> completely random choice of an action
                    action = self.env.action_space.sample()
                else:
                    # Training phase -> action sampled from the policy
                    action = self.select_action(state)
                assert action is not None
                assert action.shape == self.env.action_space.shape
                writer_train.add_histogram('action_speed/episode_{}'.format(str(i_episode)),
                                           torch.tensor(action[0]), episode_steps)
                writer_train.add_histogram('action_turn/episode_{}'.format(str(i_episode)),
                                           torch.tensor(action[1]), episode_steps)

                # Make the action
                next_state, reward, done, info = self.env.step(action)

                # Save the step
                if self.pics:
                    state_buffer.push(next_state)
                    next_state = state_buffer.get_state()
                episode_steps += 1
                total_numsteps += 1
                episode_reward += reward
                mask = 1 if done else float(not done)
                # Push the transition into the memory only if the step count is greater than 5
                # print('push')
                if episode_steps > 5:
                    memory.push(state, action, reward, next_state, mask)
                state = next_state

            print("Memory {}/{}".format(len(memory), self.replay_size))
            timing = time.time() - ts
            total_timing = time.time() - in_ts
            start_episode = 0
            i_episode += 1

        # Disable the restore phase after the restored run
        restore = False

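# Minimal sketch of the ReplayMemory interface exercised by train() above
# (push, sample, forget_last, dump/load, __len__). This is an assumption for
# illustration only; the repository's actual class may store and serialize
# transitions differently.
import pickle
import random


class ReplayMemory:
    """Fixed-capacity buffer of (state, action, reward, next_state, mask) tuples."""

    def __init__(self, capacity, seed=0):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        self.rng = random.Random(seed)

    def push(self, state, action, reward, next_state, mask):
        # Overwrite the oldest transition once the buffer is full.
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, mask)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # Uniform random minibatch, returned as one tuple per field.
        batch = self.rng.sample(self.buffer, batch_size)
        return tuple(zip(*batch))

    def forget_last(self, n):
        # Drop the n most recently pushed transitions (Forget Phase).
        if n > 0:
            del self.buffer[-n:]
        self.position = len(self.buffer) % self.capacity

    def dump(self, file_handle):
        # Serialize the stored transitions to an open binary file.
        pickle.dump(self.buffer, file_handle)

    def load(self, file_handle):
        # Restore transitions from a file produced by dump().
        self.buffer = pickle.load(file_handle)
        self.position = len(self.buffer) % self.capacity

    def __len__(self):
        return len(self.buffer)
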
def train(self, num_run=1):
    in_ts = time.time()
    for i_run in range(num_run):
        self.logger.important(f"START TRAINING RUN {i_run}")

        # Set seed for repeatability
        torch.manual_seed(self.seed + i_run)
        np.random.seed(self.seed + i_run)
        self.env.seed(self.seed + i_run)
        self.env.action_space.np_random.seed(self.seed + i_run)

        # Setup TensorboardX
        writer_train = SummaryWriter(log_dir='runs/' + self.folder + 'run_' + str(i_run) + '/train')
        writer_test = SummaryWriter(log_dir='runs/' + self.folder + 'run_' + str(i_run) + '/test')

        # Setup Replay Memory
        memory = ReplayMemory(self.replay_size)

        # TRAINING LOOP
        total_numsteps = updates = running_episode_reward = running_episode_reward_100 = 0
        rewards = []
        i_episode = 0
        last_episode_steps = 0
        while True:
            # Stop the robot and wait for the human to release control
            self.env.stop_all_motors()
            while self.env.is_human_controlled():
                continue

            # Forget Phase: restore the nets and drop the last episode's transitions
            if self.env.is_forget_enabled():
                self.restore_model()
                memory.forget_last(last_episode_steps)
                i_episode -= 1
                self.logger.info("Last Episode Forgotten")

            # Test Phase
            if self.env.is_test_phase():
                self.test_phase(i_run, i_episode, writer_test)
                continue

            # Episode limit per run reached
            if i_episode > self.num_episode:
                break

            self.backup_model()
            self.logger.important(f"START EPISODE {i_episode}")
            ts = time.time()
            episode_reward = episode_steps = 0
            done = False
            info = {'undo': False}
            state = self.env.reset()
            state_buffer = None
            if self.pics:
                state_buffer = StateBuffer(self.state_buffer_size, state)
                state = state_buffer.get_state()
            critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0

            while not done:
                if self.pics:
                    writer_train.add_image('episode_{}'.format(str(i_episode)),
                                           state_buffer.get_tensor(), episode_steps)
                if len(memory) < self.warm_up_steps:
                    action = self.env.action_space.sample()
                else:
                    action = self.select_action(state)  # Sample action from policy

                if len(memory) > self.batch_size:
                    # Number of updates per step in environment
                    for i in range(self.updates_per_step):
                        # Update parameters of all the networks
                        critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = \
                            self.update_parameters(memory, self.batch_size, updates)
                        critic_1_loss_acc += critic_1_loss
                        critic_2_loss_acc += critic_2_loss
                        policy_loss_acc += policy_loss
                        ent_loss_acc += ent_loss
                        alpha_acc += alpha
                        updates += 1

                next_state, reward, done, info = self.env.step(action)  # Step
                if self.pics:
                    state_buffer.push(next_state)
                    next_state = state_buffer.get_state()
                episode_steps += 1
                total_numsteps += 1
                episode_reward += reward
                mask = 1 if done else float(not done)
                memory.push(state, action, reward, next_state, mask)  # Append transition to memory
                state = next_state

            last_episode_steps = episode_steps
            i_episode += 1
            rewards.append(episode_reward)
            running_episode_reward += (episode_reward - running_episode_reward) / i_episode
            if len(rewards) < 100:
                running_episode_reward_100 = running_episode_reward
            else:
                last_100 = rewards[-100:]
                running_episode_reward_100 = np.array(last_100).mean()
            writer_train.add_scalar('loss/critic_1', critic_1_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('loss/critic_2', critic_2_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('loss/policy', policy_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('loss/entropy_loss', ent_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('entropy_temperature/alpha', alpha_acc / episode_steps, i_episode)
            writer_train.add_scalar('reward/train', episode_reward, i_episode)
            writer_train.add_scalar('reward/running_mean', running_episode_reward, i_episode)
            writer_train.add_scalar('reward/running_mean_last_100', running_episode_reward_100, i_episode)
            self.logger.info("Ep. {}/{}, t {}, r_t {}, 100_mean {}, time_spent {}s | {}s "
                             .format(i_episode, self.num_episode, episode_steps,
                                     round(episode_reward, 2),
                                     round(running_episode_reward_100, 2),
                                     round(time.time() - ts, 2),
                                     str(datetime.timedelta(seconds=time.time() - in_ts))))
    self.env.close()

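# Hypothetical sketch of the select_action(state, eval=...) call used in both
# train() variants above. Assumption: the agent follows the common SAC pattern
# of sampling a stochastic action during training and taking the deterministic
# mean action for evaluation; the `policy` object and its sample() return
# signature (action, log_prob, mean) are placeholders, not confirmed by this code.
import torch


def select_action_sketch(policy, state, device="cpu", eval=False):
    state_t = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    with torch.no_grad():
        if eval:
            # Evaluation: take the mean of the policy distribution.
            _, _, action = policy.sample(state_t)
        else:
            # Training: sample from the policy distribution for exploration.
            action, _, _ = policy.sample(state_t)
    return action.detach().cpu().numpy()[0]
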
# Setup Replay Memory
memory = ReplayMemory(args.replay_size)

# TRAINING LOOP
total_numsteps = updates = running_episode_reward = running_episode_reward_100 = 0
rewards = []

for i_episode in itertools.count(1):
    print(updates)
    ts = time.time()
    episode_reward = episode_steps = 0
    done = False
    state = env.reset()
    if cnn:
        state_buffer = StateBuffer(args.state_buffer_size, state)
        state = state_buffer.get_state()
    critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0

    while not done:
        # if cnn:
        #     writer_train.add_images('episode_{}'.format(str(i_episode)), state_buffer.get_tensor(), episode_steps)
        if i_episode < args.warm_up_episode:
            action = env.action_space.sample()  # Sample random action
        else:
            action = agent.select_action(state)  # Sample action from policy

        next_state, reward, done, _ = env.step(action)  # Step
        env.render()
        if cnn: