    def trainer(self, policy, fifos, shared_buffer, slopes, start_timestep,
                num_timesteps, logdir):
        """Training loop for the curriculum (multi-runner) setting: gathers
        trajectories from all runner FIFOs, estimates a learning-curve slope
        per task, trains the policy on the combined batch and publishes the
        new weights through the shared buffer."""
        proc_name = multiprocessing.current_process().name
        logger.info("Trainer %s started" % proc_name)

        # TensorFlow must be imported inside the process, otherwise it sometimes conflicts with multiprocessing
        from common.tensorboard_utils import create_summary_writer, add_summary
        writer = create_summary_writer(logdir)

        timestep = start_timestep
        total_episodes = 0
        total_timesteps = 0
        total_updates = 0
        total_rewards = []
        episode_rewards = []
        episode_lengths = []
        task_rewards = [[] for _ in range(len(slopes))]
        task_steps = [[] for _ in range(len(slopes))]
        task_scores = [[] for _ in range(len(slopes))]
        stats_start = time.time()
        stats_timesteps = 0
        stats_updates = 0

        while timestep < num_timesteps:
            batch_observations = []
            batch_preds = []
            batch_rewards = []
            batch_terminals = []
            batch_timesteps = 0
            mean_infos = defaultdict(list)
            queue_sizes = []

            # loop over fifos from all runners
            for fifo in fifos:
                try:
                    # Queue.qsize() is not implemented on Mac; ignore, it is used only for diagnostics
                    try:
                        queue_sizes.append(fifo.qsize())
                    except NotImplementedError:
                        pass

                    # wait for a new trajectory and statistics
                    observations, preds, rewards, terminals, episode_reward, episode_length, \
                        episode_tasks, episode_steps, mean_info = \
                        fifo.get(timeout=self.args.queue_timeout)

                    # add to batch
                    batch_observations.append(observations)
                    batch_preds.append(preds)
                    batch_rewards.append(rewards)
                    batch_terminals.append(terminals)

                    # log statistics
                    total_rewards += episode_reward
                    episode_rewards += episode_reward
                    episode_lengths += episode_length
                    batch_timesteps += len(observations)

                    for task_id, step, reward in zip(episode_tasks, episode_steps, episode_reward):
                        task_rewards[task_id].append(reward)
                        task_steps[task_id].append(step)
                        task_scores[task_id].append(reward)

                    for key, val in mean_info.items():
                        mean_infos[key].append(val)
                except Empty:
                    # just ignore empty fifos, the batch will be smaller
                    pass

            # estimate the learning-curve slope for each task
            for task_id, (scores, steps) in enumerate(zip(task_scores, task_steps)):
                if len(scores) > 1:
                    # use only episodes from the last curriculum_steps to estimate the slope
                    idx = np.where(np.array(steps) > (steps[-1] - self.args.curriculum_steps))[0]
                    scores = np.array(scores)
                    steps = np.array(steps)
                    # if fewer than 2 episodes fall into the window, add back the episode before the first
                    if len(idx) == 1:
                        idx = np.concatenate([[idx[0] - 1], idx])
                        logger.debug("Task %d: added one earlier episode for slope estimation" % task_id)
                    scores = scores[idx]
                    steps = steps[idx]
                    slope = estimate_slope(steps, scores)
                    if self.args.curriculum_abs:
                        slope = np.abs(slope)
                    logger.debug("Task %d slope: %f" % (task_id, slope))
                    slopes[task_id] = slope

            # if any of the runners produced trajectories
            if len(batch_observations) > 0:
                timestep += batch_timesteps

                # reorder dimensions for preds: from per-runner lists of
                # per-timestep tuples to per-head lists of per-runner sequences
                batch_preds = [list(zip(*p)) for p in batch_preds]
                batch_preds = list(zip(*batch_preds))

                # train model
                policy.train(batch_observations, batch_preds, batch_rewards,
                             batch_terminals, timestep, writer)

                # share model parameters
                shared_buffer.raw = pickle.dumps(policy.get_weights(), pickle.HIGHEST_PROTOCOL)

                total_timesteps += batch_timesteps
                total_updates += self.args.repeat_updates
                stats_timesteps += batch_timesteps
                stats_updates += self.args.repeat_updates

                for key, val in mean_infos.items():
                    add_summary(writer, "diagnostics/" + key, np.mean(val), timestep)

                if timestep % self.args.stats_interval == 0:
                    total_episodes += len(episode_rewards)
                    stats_time = time.time() - stats_start

                    add_summary(writer, "game_stats/episodes", len(episode_rewards), timestep)
                    add_summary(writer, "game_stats/episode_reward_mean", np.mean(episode_rewards), timestep)
                    add_summary(writer, "game_stats/episode_length_mean", np.mean(episode_lengths), timestep)
                    add_summary(writer, "game_stats/total_episodes", total_episodes, timestep)
                    add_summary(writer, "game_stats/total_timesteps", total_timesteps, timestep)
                    add_summary(writer, "game_stats/total_updates", total_updates, timestep)
                    add_summary(writer, "performance/updates_per_second", stats_updates / stats_time, timestep)
                    add_summary(writer, "performance/timesteps_per_second", stats_timesteps / stats_time, timestep)
                    add_summary(writer, "performance/estimated_runner_fps",
                                stats_timesteps / self.args.num_runners / stats_time, timestep)
                    add_summary(writer, "performance/mean_queue_length", np.mean(queue_sizes), timestep)

                    for i, rewards in enumerate(task_rewards):
                        add_summary(writer, "curriculum_rewards/task%d_reward_mean" % i, np.mean(rewards), timestep)
                        add_summary(writer, "curriculum_episodes/task%d_episodes" % i, len(rewards), timestep)

                    for i, slope in enumerate(slopes):
                        add_summary(writer, "curriculum_slopes/task%d_slope" % i, slope, timestep)

                    logger.info("Step %d/%d: episodes %d, mean episode reward %.2f, "
                                "mean episode length %.2f, timesteps/sec %.2f." %
                                (timestep, num_timesteps, len(episode_rewards),
                                 np.mean(episode_rewards), np.mean(episode_lengths),
                                 stats_timesteps / stats_time))

                    episode_rewards = []
                    episode_lengths = []
                    task_rewards = [[] for _ in range(len(slopes))]
                    stats_start = time.time()
                    stats_timesteps = 0
                    stats_updates = 0

                if timestep % self.args.save_interval == 0:
                    policy.save_weights(os.path.join(logdir, "weights_%d.hdf5" % timestep))
            #else:
                #logger.warn("Empty batch, runners are falling behind!")

        # save final weights
        policy.save_weights(os.path.join(logdir, "weights_%d.hdf5" % timestep))

        if self.args.csv_file:
            # save command-line parameters and the most important performance metrics to file
            data = vars(self.args)
            data['episode_reward_mean'] = np.mean(total_rewards)
            data['total_episodes'] = total_episodes
            data['total_timesteps'] = total_timesteps
            data['total_updates'] = total_updates
            header = sorted(data.keys())

            # write the CSV file one directory above the experiment directory
            csv_file = os.path.join(os.path.dirname(logdir), self.args.csv_file)
            file_exists = os.path.isfile(csv_file)
            with open(csv_file, 'a') as file:
                writer = csv.DictWriter(file, delimiter=',', fieldnames=header)
                if not file_exists:
                    writer.writeheader()
                writer.writerow(data)

        # collect child processes
        while len(multiprocessing.active_children()) > 0:
            for fifo in fifos:
                # empty fifos just in case runners are blocked on a put()
                try:
                    fifo.get(timeout=1)
                except Empty:
                    pass

        logger.info("Trainer %s finished" % proc_name)
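    # `estimate_slope()` is defined elsewhere in the repository; the trainer above
    # only assumes it returns the trend of recent episode scores over timesteps.
    # A minimal sketch of such a helper, assuming a simple least-squares line fit
    # (the actual implementation may differ):
    #
    #     def estimate_slope(x, y):
    #         # fit y = slope * x + intercept; the slope is the learning-curve trend
    #         slope, _intercept = np.polyfit(x, y, 1)
    #         return slope
    #
    # With this reading, a task whose recent scores are still improving gets a
    # large positive slope and, under Teacher-Student-style curriculum sampling,
    # more training time.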
    def trainer(self, policy, fifo, shared_buffer, start_timestep, num_timesteps, logdir):
        """Training loop for the single-runner setting: blocks on one FIFO for
        ready batches, trains the policy and publishes the new weights through
        the shared buffer."""
        proc_name = multiprocessing.current_process().name
        logger.info("Trainer %s started" % proc_name)

        # TensorFlow must be imported inside the process, otherwise it sometimes conflicts with multiprocessing
        from common.tensorboard_utils import create_summary_writer, add_summary
        writer = create_summary_writer(logdir)

        timestep = start_timestep
        total_episodes = 0
        total_timesteps = 0
        total_updates = 0
        total_rewards = []
        episode_rewards = []
        episode_lengths = []
        stats_start = time.time()
        stats_timesteps = 0
        stats_updates = 0
        queue_sizes = []

        while timestep < num_timesteps:
            mean_infos = defaultdict(list)

            # Queue.qsize() is not implemented on Mac; ignore, it is used only for diagnostics
            try:
                queue_sizes.append(fifo.qsize())
            except NotImplementedError:
                pass

            # wait for a new trajectory and statistics
            batch_observations, batch_preds, batch_rewards, batch_terminals, \
                episode_reward, episode_length, mean_info = fifo.get()

            # log statistics
            total_rewards += episode_reward
            episode_rewards += episode_reward
            episode_lengths += episode_length
            # observations are shaped (batch, time, ...), so this counts all timesteps
            batch_timesteps = np.prod(batch_observations.shape[:2])

            for key, val in mean_info.items():
                mean_infos[key].append(val)

            timestep += batch_timesteps

            # train model
            policy.train(batch_observations, batch_preds, batch_rewards,
                         batch_terminals, timestep, writer)

            # share model parameters
            shared_buffer.raw = pickle.dumps(policy.get_weights(), pickle.HIGHEST_PROTOCOL)

            total_timesteps += batch_timesteps
            total_updates += self.args.repeat_updates
            stats_timesteps += batch_timesteps
            stats_updates += self.args.repeat_updates

            for key, val in mean_infos.items():
                add_summary(writer, "diagnostics/" + key, np.mean(val), timestep)

            if timestep % self.args.stats_interval == 0:
                total_episodes += len(episode_rewards)
                stats_time = time.time() - stats_start

                add_summary(writer, "game_stats/episodes", len(episode_rewards), timestep)
                add_summary(writer, "game_stats/episode_reward_mean", np.mean(episode_rewards), timestep)
                add_summary(writer, "game_stats/episode_length_mean", np.mean(episode_lengths), timestep)
                add_summary(writer, "game_stats/total_episodes", total_episodes, timestep)
                add_summary(writer, "game_stats/total_timesteps", int(total_timesteps), timestep)
                add_summary(writer, "game_stats/total_updates", total_updates, timestep)
                add_summary(writer, "performance/updates_per_second", stats_updates / stats_time, timestep)
                add_summary(writer, "performance/timesteps_per_second", stats_timesteps / stats_time, timestep)
                add_summary(writer, "performance/estimated_runner_fps",
                            stats_timesteps / self.args.num_runners / stats_time, timestep)
                add_summary(writer, "performance/mean_queue_length", np.mean(queue_sizes), timestep)

                logger.info("Step %d/%d: episodes %d, mean episode reward %.2f, "
                            "mean episode length %.2f, timesteps/sec %.2f." %
                            (timestep, num_timesteps, len(episode_rewards),
                             np.mean(episode_rewards), np.mean(episode_lengths),
                             stats_timesteps / stats_time))

                episode_rewards = []
                episode_lengths = []
                stats_start = time.time()
                stats_timesteps = 0
                stats_updates = 0
                queue_sizes = []

            if timestep % self.args.save_interval == 0:
                policy.save_weights(os.path.join(logdir, "weights_%d.hdf5" % timestep))

        # save final weights
        policy.save_weights(os.path.join(logdir, "weights_%d.hdf5" % timestep))

        if self.args.csv_file:
            # save command-line parameters and the most important performance metrics to file
            data = vars(self.args)
            data['episode_reward_mean'] = np.mean(total_rewards)
            data['total_episodes'] = total_episodes
            data['total_timesteps'] = total_timesteps
            data['total_updates'] = total_updates
            header = sorted(data.keys())

            # write the CSV file one directory above the experiment directory
            csv_file = os.path.join(os.path.dirname(logdir), self.args.csv_file)
            file_exists = os.path.isfile(csv_file)
            with open(csv_file, 'a') as file:
                writer = csv.DictWriter(file, delimiter=',', fieldnames=header)
                if not file_exists:
                    writer.writeheader()
                writer.writerow(data)

        # collect child processes
        while len(multiprocessing.active_children()) > 0:
            # empty the fifo just in case the runner is blocked on a put()
            try:
                fifo.get(timeout=1)
            except Empty:
                pass

        logger.info("Trainer %s finished" % proc_name)
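    # The runner side of this exchange lives elsewhere in the repository. For
    # reference, a minimal sketch of the handshake both trainers above rely on
    # (names like `runner_sketch` and `collect_trajectory` are illustrative,
    # not the actual runner implementation): each runner periodically picks up
    # the weights the trainer published to `shared_buffer` and pushes finished
    # trajectories into its FIFO.
    #
    #     def runner_sketch(policy, fifo, shared_buffer):
    #         while True:
    #             # load the latest weights published by the trainer
    #             policy.set_weights(pickle.loads(shared_buffer.raw))
    #             # roll out the policy and hand the result to the trainer
    #             trajectory = collect_trajectory(policy)  # hypothetical helper
    #             fifo.put(trajectory)
    #
    # Blocking `fifo.get()` / `fifo.put()` calls are what make the final
    # "collect child processes" loops necessary: the trainer keeps draining the
    # queues so no runner stays blocked on a full FIFO at shutdown.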