def __init__(self, params):

    #############
    ## INIT
    #############

    # Get params, create logger, create TF session
    self.params = params
    self.logger = Logger(self.params['logdir'])
    self.sess = create_tf_session(self.params['use_gpu'],
                                  which_gpu=self.params['which_gpu'])

    # Set random seeds
    seed = self.params['seed']
    tf.set_random_seed(seed)
    np.random.seed(seed)

    #############
    ## ENV
    #############

    # Make the gym environment
    self.env = gym.make(self.params['env_name'])
    self.env.seed(seed)

    # Maximum length for episodes
    self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
    MAX_VIDEO_LEN = self.params['ep_len']

    # Is this env continuous or discrete?
    discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
    self.params['agent_params']['discrete'] = discrete

    # Observation and action sizes
    ob_dim = self.env.observation_space.shape[0]
    ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
    self.params['agent_params']['ac_dim'] = ac_dim
    self.params['agent_params']['ob_dim'] = ob_dim

    # simulation timestep, will be used for video saving
    if 'model' in dir(self.env):
        self.fps = 1 / self.env.model.opt.timestep
    else:
        self.fps = self.env.env.metadata['video.frames_per_second']

    #############
    ## AGENT
    #############

    agent_class = self.params['agent_class']
    self.agent = agent_class(self.sess, self.env, self.params['agent_params'])

    #############
    ## INIT VARS
    #############

    tf.global_variables_initializer().run(session=self.sess)
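# For reference, a minimal sketch of what the `create_tf_session` helper used
# above might look like. This assumes the TF1-style API (`tf.ConfigProto`,
# `tf.Session`) and is illustrative only, not the actual course implementation.
import os
import tensorflow as tf

def create_tf_session(use_gpu, which_gpu=0):
    if use_gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)  # pin one GPU
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True  # allocate GPU memory lazily
    else:
        config = tf.ConfigProto(device_count={'GPU': 0})  # CPU only
    return tf.Session(config=config)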
def __init__(self, params):

    #############
    # INIT
    #############

    # Get params, create logger
    self.params = params
    self.logger = Logger(self.params['logdir'])

    # Set random seeds
    seed = self.params['seed']
    np.random.seed(seed)
    torch.manual_seed(seed)
    ptu.init_gpu(
        use_gpu=not self.params['no_gpu'],
        gpu_id=self.params['which_gpu']
    )

    self.total_env_steps = 0
    self.start_time = None
    self.log_video = False
    self.log_metrics = False
    self.initial_return = None

    #############
    # ENV
    #############

    # Make the gym environment
    self.env = gym.make(self.params['env_name'])
    self.env.seed(seed)

    # Maximum length for episodes
    self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
    MAX_VIDEO_LEN = self.params['ep_len']

    # Is this env continuous or discrete?
    discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
    self.params['agent_params']['discrete'] = discrete

    # Observation and action sizes
    ob_dim = self.env.observation_space.shape[0]
    ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
    self.params['agent_params']['ac_dim'] = ac_dim
    self.params['agent_params']['ob_dim'] = ob_dim

    # simulation timestep, will be used for video saving
    if 'model' in dir(self.env):
        self.fps = 1 / self.env.model.opt.timestep
    else:
        self.fps = self.env.env.metadata['video.frames_per_second']

    #############
    # AGENT
    #############

    agent_class = self.params['agent_class']
    self.agent = agent_class(self.env, self.params['agent_params'])
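# A minimal sketch of the `ptu` (pytorch utils) module assumed above: `init_gpu`
# selects a global torch device that the rest of the code can route tensors to.
# Illustrative only; the real module may carry more helpers (e.g. from_numpy).
import torch

device = torch.device('cpu')

def init_gpu(use_gpu=True, gpu_id=0):
    global device
    if use_gpu and torch.cuda.is_available():
        device = torch.device('cuda:' + str(gpu_id))
    else:
        device = torch.device('cpu')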
def __init__(self, params):

    #############
    ## INIT
    #############

    # Get params, create logger
    self.params = params
    self.logger = Logger(self.params["logdir"])

    # Set random seeds
    seed = self.params["seed"]
    np.random.seed(seed)
    torch.manual_seed(seed)
    ptu.init_gpu(use_gpu=not self.params["no_gpu"], gpu_id=self.params["which_gpu"])

    #############
    ## ENV
    #############

    # Make the gym environment
    self.env = gym.make(self.params["env_name"])
    self.env.seed(seed)

    # Maximum length for episodes
    self.params["ep_len"] = self.params["ep_len"] or self.env.spec.max_episode_steps
    MAX_VIDEO_LEN = self.params["ep_len"] if "Humanoid" not in self.params["env_name"] else 1000

    # Is this env continuous or discrete?
    discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
    self.params["agent_params"]["discrete"] = discrete

    # Observation and action sizes
    ob_dim = self.env.observation_space.shape[0]
    ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
    self.params["agent_params"]["ac_dim"] = ac_dim
    self.params["agent_params"]["ob_dim"] = ob_dim

    # simulation timestep, will be used for video saving
    if "model" in dir(self.env):
        self.fps = 1 / self.env.model.opt.timestep
    else:
        self.fps = self.env.env.metadata["video.frames_per_second"]

    #############
    ## AGENT
    #############

    agent_class = self.params["agent_class"]
    self.agent = agent_class(self.env, self.params["agent_params"])
def __init__(self, params):

    #############
    ## INIT
    #############

    # Get params, create logger
    self.params = params
    self.logger = Logger(self.params['logdir'])

    # Set random seeds
    seed = self.params['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)

    #############
    ## ENV
    #############

    # Make the gym environment
    self.env = gym.make(self.params['env_name'])
    self.env.seed(seed)

    # Maximum length for episodes
    self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps

    # Is this env continuous or discrete?
    discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
    self.params['agent_params']['discrete'] = discrete

    # Observation and action sizes
    ob_dim = self.env.observation_space.shape[0]
    ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
    self.params['agent_params']['ac_dim'] = ac_dim
    self.params['agent_params']['ob_dim'] = ob_dim

    # include correct device
    self.params['agent_params']['device'] = self.params['device']

    # simulation timestep, will be used for video saving
    if 'model' in dir(self.env):
        self.fps = 1 / self.env.model.opt.timestep
    else:
        self.fps = self.env.env.metadata['video.frames_per_second']

    #############
    ## AGENT
    #############

    agent_class = self.params['agent_class']
    self.agent = agent_class(self.env, self.params['agent_params'])
def __init__(self, params):

    #############
    ## INIT
    #############

    # Get params, create logger
    self.params = params
    self.logger = Logger(self.params['logdir'])

    # Set random seeds
    seed = self.params['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)

    #############
    ## ENV
    #############

    # Make the gym environment
    if self.params['env_name'] == 'PointMass-v0':
        from cs285.envs.pointmass import PointMass
        self.env = PointMass()
    else:
        self.env = gym.make(self.params['env_name'])
    self.env.seed(seed)
    self.params['agent_params']['env_name'] = self.params['env_name']

    self.max_path_length = self.params['max_path_length'] or self.env.spec.max_episode_steps

    # Maximum length for episodes
    self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps

    # Is this env continuous or discrete?
    self.params['agent_params']['discrete'] = isinstance(
        self.env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    self.params['agent_params']['ob_dim'] = self.env.observation_space.shape[0]
    self.params['agent_params']['ac_dim'] = (
        self.env.action_space.n if self.params['agent_params']['discrete']
        else self.env.action_space.shape[0])

    #############
    ## AGENT
    #############

    agent_class = self.params['agent_class']
    self.agent = agent_class(self.env, self.params['agent_params'])
def __init__(self, params):

    #############
    ## INIT
    #############

    # Get params, create logger
    self.params = params
    self.logger = Logger(self.params['logdir'])

    #############
    ## ENV
    #############

    self.env = OwnEnv()
    self.env.reset()

    self.mean_episode_reward = -float('nan')
    self.best_mean_episode_reward = -float('inf')

    # Is this env continuous or discrete?
    discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
    # Are the observations images?
    img = len(self.env.observation_space.shape) > 2

    # Observation and action sizes
    ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0]
    ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
    print("ob_dim: ", ob_dim)
    self.params['agent_params']['ac_dim'] = ac_dim
    self.params['agent_params']['ob_dim'] = ob_dim

    #############
    ## AGENT
    #############

    agent_class = self.params['agent_class']
    self.agent = agent_class(self.env, self.params['agent_params'])
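# Concrete illustration of the space-probing pattern shared by all these
# trainers, using a standard env (values shown are for gym's CartPole-v1):
import gym

env = gym.make('CartPole-v1')
discrete = isinstance(env.action_space, gym.spaces.Discrete)             # True
ob_dim = env.observation_space.shape[0]                                  # 4
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]   # 2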
def __init__(self, params):

    ## INIT
    self.params = params
    self.logger = Logger(self.params['logdir'])  # TODO LOGGER
    seed = self.params['seed']
    np.random.seed(seed)

    ## ENV
    self.env = gym.make(self.params['env_name'])
    self.env.seed(seed)

    # max length of episodes
    self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
    MAX_VIDEO_LEN = self.params['ep_len']

    # check whether the action space is discrete or continuous
    discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
    self.params['agent_params']['discrete'] = discrete

    ob_dim = self.env.observation_space.shape[0]
    ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
    self.params['agent_params']['ac_dim'] = ac_dim
    self.params['agent_params']['ob_dim'] = ob_dim

    # frame rate for video saving: MuJoCo envs expose `model`;
    # otherwise fall back to the env's metadata
    if 'model' in dir(self.env):
        self.fps = 1 / self.env.model.opt.timestep
    else:
        self.fps = self.env.env.metadata['video.frames_per_second']

    ## AGENT
    agent_class = self.params['agent_class']
    self.agent = agent_class(self.env, self.params['agent_params'])
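# A hypothetical driver showing how these trainers are typically constructed;
# the key names follow the snippets above, and `MyAgent` plus the logdir are
# stand-ins, not names from the original code.
params = {
    'logdir': '/tmp/rl_run',
    'seed': 1,
    'env_name': 'CartPole-v1',
    'ep_len': None,          # None falls back to env.spec.max_episode_steps
    'agent_class': MyAgent,  # hypothetical agent class
    'agent_params': {},      # __init__ fills in discrete / ob_dim / ac_dim
}
trainer = RL_Trainer(params)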
class RL_Trainer(object):

    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger
        self.params = params
        self.logger = Logger(self.params['logdir'])

        # Set random seeds
        seed = self.params['seed']
        torch.manual_seed(seed)
        np.random.seed(seed)

        #############
        ## ENV
        #############

        # Make the gym environment
        if self.params['env_name'] == 'PointMass-v0':
            from cs285.envs.pointmass import PointMass
            self.env = PointMass()
        else:
            self.env = gym.make(self.params['env_name'])
        self.env.seed(seed)
        self.params['agent_params']['env_name'] = self.params['env_name']

        self.max_path_length = self.params['max_path_length'] or self.env.spec.max_episode_steps

        # Maximum length for episodes
        self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps

        # Is this env continuous or discrete?
        self.params['agent_params']['discrete'] = isinstance(
            self.env.action_space, gym.spaces.Discrete)

        # Observation and action sizes
        self.params['agent_params']['ob_dim'] = self.env.observation_space.shape[0]
        self.params['agent_params']['ac_dim'] = (
            self.env.action_space.n if self.params['agent_params']['discrete']
            else self.env.action_space.shape[0])

        #############
        ## AGENT
        #############

        agent_class = self.params['agent_class']
        self.agent = agent_class(self.env, self.params['agent_params'])

    def run_training_loop(self, n_iter, policy):

        # init vars at beginning of training
        self.total_envsteps = 0
        self.start_time = time.time()

        for itr in range(n_iter):
            print("\n\n********** Iteration %i ************" % itr)

            # decide if metrics should be logged
            if self.params['scalar_log_freq'] == -1:
                self.logmetrics = False
            elif itr % self.params['scalar_log_freq'] == 0:
                self.logmetrics = True
            else:
                self.logmetrics = False

            # collect trajectories, to be used for training
            paths, envsteps_this_batch = self.collect_training_trajectories(
                itr, policy, self.params['batch_size'])
            self.total_envsteps += envsteps_this_batch

            # add collected data to replay buffer
            self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            loss, ex2_vars = self.train_agent()

            # log/save
            if self.logmetrics:
                # perform logging
                print('\nBeginning logging procedure...')
                self.perform_logging(itr, paths, policy, loss, ex2_vars)

    ####################################
    ####################################

    def collect_training_trajectories(self, itr, policy, batch_size):
        print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = sample_trajectories(
            self.env, policy, batch_size, self.max_path_length,
            self.params['render'], itr)
        return paths, envsteps_this_batch

    def train_agent(self):
        # print('\nTraining agent using sampled data from replay buffer...')
        for train_step in range(self.params['num_agent_train_steps_per_iter']):
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample(
                self.params['batch_size'])
            loss, ex2_vars = self.agent.train(ob_batch, ac_batch, re_batch,
                                              next_ob_batch, terminal_batch)
        return loss, ex2_vars

    ####################################

    def perform_logging(self, itr, paths, eval_policy, loss, ex2_vars):

        if self.logmetrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]

            # decide what to log
            logs = OrderedDict()
            if ex2_vars is not None:
                logs["Log_Likelihood_Average"] = np.mean(ex2_vars[0])
                logs["KL_Divergence_Average"] = np.mean(ex2_vars[1])
                logs["ELBO_Average"] = np.mean(ex2_vars[2])
            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time

            if isinstance(loss, dict):
                logs.update(loss)
            else:
                logs["Training loss"] = loss

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print('{} : {}'.format(key, value))
                self.logger.log_scalar(value, key, itr)
            print('Done logging...\n\n')

            self.logger.flush()
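# A minimal sketch of the `sample_trajectories` utility these trainers call,
# assuming a `sample_trajectory(env, policy, max_path_length, render)` helper
# that returns a path dict with a "reward" array. Illustrative only: the
# variants above pass slightly different argument lists to it.
def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length,
                        render=False):
    paths, timesteps_this_batch = [], 0
    # keep rolling out full trajectories until the step budget is met
    while timesteps_this_batch < min_timesteps_per_batch:
        path = sample_trajectory(env, policy, max_path_length, render)
        paths.append(path)
        timesteps_this_batch += len(path["reward"])
    return paths, timesteps_this_batch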
class RL_Trainer(object):

    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger
        self.params = params
        self.logger = Logger(self.params['logdir'])

        # Set random seeds
        seed = self.params['seed']
        np.random.seed(seed)
        torch.manual_seed(seed)
        ptu.init_gpu(
            use_gpu=not self.params['no_gpu'],
            gpu_id=self.params['which_gpu']
        )

        #############
        ## ENV
        #############

        # Make the gym environment
        self.env = gym.make(self.params['env_name'])
        if 'env_wrappers' in self.params:
            # These operations are currently only for Atari envs
            self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), force=True)
            self.env = params['env_wrappers'](self.env)
            self.mean_episode_reward = -float('nan')
            self.best_mean_episode_reward = -float('inf')
        if 'non_atari_colab_env' in self.params and self.params['video_log_freq'] > 0:
            self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), force=True)
            self.mean_episode_reward = -float('nan')
            self.best_mean_episode_reward = -float('inf')
        self.env.seed(seed)

        # import plotting (locally if 'obstacles' env)
        if not (self.params['env_name'] == 'obstacles-cs285-v0'):
            import matplotlib
            matplotlib.use('Agg')

        # Maximum length for episodes
        self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
        global MAX_VIDEO_LEN
        MAX_VIDEO_LEN = self.params['ep_len']

        # Is this env continuous or discrete?
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        # Are the observations images?
        img = len(self.env.observation_space.shape) > 2
        self.params['agent_params']['discrete'] = discrete

        # Observation and action sizes
        ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0]
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params['agent_params']['ac_dim'] = ac_dim
        self.params['agent_params']['ob_dim'] = ob_dim

        # simulation timestep, will be used for video saving
        if 'model' in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        elif 'env_wrappers' in self.params:
            self.fps = 30  # This is not actually used when using the Monitor wrapper
        elif 'video.frames_per_second' in self.env.env.metadata.keys():
            self.fps = self.env.env.metadata['video.frames_per_second']
        else:
            self.fps = 10

        #############
        ## AGENT
        #############

        agent_class = self.params['agent_class']
        self.agent = agent_class(self.env, self.params['agent_params'])

    def run_training_loop(self, n_iter, collect_policy, eval_policy,
                          initial_expertdata=None):
        """
        :param n_iter: number of (dagger) iterations
        :param collect_policy:
        :param eval_policy:
        :param initial_expertdata:
        """

        # init vars at beginning of training
        self.total_envsteps = 0
        self.start_time = time.time()

        print_period = 1

        for itr in range(n_iter):
            if itr % print_period == 0:
                print("\n\n********** Iteration %i ************" % itr)

            # decide if videos should be rendered/logged at this iteration
            if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1:
                self.logvideo = True
            else:
                self.logvideo = False

            # decide if metrics should be logged
            if self.params['scalar_log_freq'] == -1:
                self.logmetrics = False
            elif itr % self.params['scalar_log_freq'] == 0:
                self.logmetrics = True
            else:
                self.logmetrics = False

            use_batchsize = self.params['batch_size']
            if itr == 0:
                use_batchsize = self.params['batch_size_initial']
            paths, envsteps_this_batch, train_video_paths = (
                self.collect_training_trajectories(
                    itr, initial_expertdata, collect_policy, use_batchsize)
            )
            self.total_envsteps += envsteps_this_batch

            # add collected data to replay buffer
            if isinstance(self.agent, MBAgent):
                self.agent.add_to_replay_buffer(paths, self.params['add_sl_noise'])
            else:
                self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            if itr % print_period == 0:
                print("\nTraining agent...")
            all_logs = self.train_agent()

            # if there is a model, log model predictions
            if isinstance(self.agent, MBAgent) and itr == 0:
                self.log_model_predictions(itr, all_logs)

            # log/save
            if self.logvideo or self.logmetrics:
                # perform logging
                print('\nBeginning logging procedure...')
                self.perform_logging(itr, paths, eval_policy, train_video_paths, all_logs)

                if self.params['save_params']:
                    self.agent.save('{}/agent_itr_{}.pt'.format(self.params['logdir'], itr))

    ####################################
    ####################################

    def collect_training_trajectories(
            self,
            itr: int,
            initial_expertdata: str,
            collect_policy: BasePolicy,
            num_transitions_to_sample: int,
            save_expert_data_to_disk: bool = False,
    ) -> Tuple[List[PathDict], int, Optional[List[PathDict]]]:
        """
        :param itr:
        :param initial_expertdata: path to expert data pkl file
        :param collect_policy: the current policy using which we collect data
        :param num_transitions_to_sample: the number of transitions we collect
        :return:
            paths: a list of trajectories
            envsteps_this_batch: the sum over the numbers of environment steps in paths
            train_video_paths: paths which also contain videos for visualization purposes
        """
        paths: List[PathDict]
        if itr == 0:
            if initial_expertdata is not None:
                paths = pickle.load(open(self.params['expert_data'], 'rb'))
                return paths, 0, None
            if save_expert_data_to_disk:
                num_transitions_to_sample = self.params['batch_size_initial']

        # collect data to be used for training
        print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = utils.sample_trajectories(
            self.env, collect_policy, num_transitions_to_sample, self.params['ep_len'])

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        train_video_paths = None
        if self.logvideo:
            print('\nCollecting train rollouts to be used for saving videos...')
            train_video_paths = utils.sample_n_trajectories(
                self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

        if save_expert_data_to_disk and itr == 0:
            with open('expert_data_{}.pkl'.format(self.params['env_name']), 'wb') as file:
                pickle.dump(paths, file)

        return paths, envsteps_this_batch, train_video_paths

    def train_agent(self):
        all_logs = []
        for train_step in range(self.params['num_agent_train_steps_per_iter']):
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample(
                self.params['train_batch_size'])
            train_log = self.agent.train(ob_batch, ac_batch, re_batch,
                                         next_ob_batch, terminal_batch)
            all_logs.append(train_log)
        return all_logs

    ####################################
    ####################################

    def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs):

        last_log = all_logs[-1]

        #######################

        # collect eval trajectories, for logging
        print("\nCollecting data for eval...")
        eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(
            self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len'])

        # save eval rollouts as videos in tensorboard event file
        if self.logvideo and train_video_paths is not None:
            print('\nCollecting video rollouts eval')
            eval_video_paths = utils.sample_n_trajectories(
                self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

            # save train/eval videos
            print('\nSaving train rollouts as videos...')
            self.logger.log_paths_as_videos(train_video_paths, itr,
                                            fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='train_rollouts')
            self.logger.log_paths_as_videos(eval_video_paths, itr,
                                            fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='eval_rollouts')

        #######################

        # save eval metrics
        if self.logmetrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]
            eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]
            eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

            # decide what to log
            logs = OrderedDict()
            logs["Eval_AverageReturn"] = np.mean(eval_returns)
            logs["Eval_StdReturn"] = np.std(eval_returns)
            logs["Eval_MaxReturn"] = np.max(eval_returns)
            logs["Eval_MinReturn"] = np.min(eval_returns)
            logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time
            logs.update(last_log)

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print('{} : {}'.format(key, value))
                self.logger.log_scalar(value, key, itr)
            print('Done logging...\n\n')

            self.logger.flush()

    def log_model_predictions(self, itr, all_logs):
        # model predictions
        import matplotlib.pyplot as plt
        self.fig = plt.figure()

        # sample actions
        action_sequence = self.agent.actor.sample_action_sequences(num_sequences=1, horizon=10)  # 20 reacher
        action_sequence = action_sequence[0]

        # calculate and log model prediction error
        mpe, true_states, pred_states = utils.calculate_mean_prediction_error(
            self.env, action_sequence, self.agent.dyn_models, self.agent.actor.data_statistics)
        assert self.params['agent_params']['ob_dim'] == true_states.shape[1] == pred_states.shape[1]
        ob_dim = self.params['agent_params']['ob_dim']
        ob_dim = 2 * int(ob_dim / 2.0)  # skip last state for plotting when state dim is odd

        # plot the predictions
        self.fig.clf()
        for i in range(ob_dim):
            plt.subplot(ob_dim // 2, 2, i + 1)  # subplot indices must be integers
            plt.plot(true_states[:, i], 'g')
            plt.plot(pred_states[:, i], 'r')
        self.fig.suptitle('MPE: ' + str(mpe))
        self.fig.savefig(self.params['logdir'] + '/itr_' + str(itr) + '_predictions.png',
                         dpi=200, bbox_inches='tight')

        # plot all intermediate losses during this iteration
        all_losses = np.array([log['Training Loss'] for log in all_logs])
        np.save(self.params['logdir'] + '/itr_' + str(itr) + '_losses.npy', all_losses)
        self.fig.clf()
        plt.plot(all_losses)
        self.fig.savefig(self.params['logdir'] + '/itr_' + str(itr) + '_losses.png',
                         dpi=200, bbox_inches='tight')
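# Companion sketch for `sample_n_trajectories`: unlike `sample_trajectories`,
# which fills a step budget, this collects a fixed number of rollouts (used
# above for the MAX_NVIDEO video rollouts). Same assumptions as the earlier
# sketch of `sample_trajectories`, i.e. a `sample_trajectory` helper exists.
def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False):
    return [sample_trajectory(env, policy, max_path_length, render)
            for _ in range(ntraj)]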
class RL_Trainer(object):

    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger, create TF session
        self.params = params
        self.logger = Logger(self.params["logdir"])
        self.sess = create_tf_session(self.params["use_gpu"],
                                      which_gpu=self.params["which_gpu"])

        # Set random seeds
        seed = self.params["seed"]
        tf.set_random_seed(seed)
        np.random.seed(seed)

        #############
        ## ENV
        #############

        # Make the gym environment
        self.env = gym.make(self.params["env_name"])
        self.env.seed(seed)

        # Maximum length for episodes
        self.params["ep_len"] = self.params["ep_len"] or self.env.spec.max_episode_steps
        MAX_VIDEO_LEN = self.params["ep_len"]

        # Is this env continuous or discrete?
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.params["agent_params"]["discrete"] = discrete

        # Observation and action sizes
        ob_dim = self.env.observation_space.shape[0]
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params["agent_params"]["ac_dim"] = ac_dim
        self.params["agent_params"]["ob_dim"] = ob_dim

        # simulation timestep, will be used for video saving
        if "model" in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        else:
            self.fps = self.env.env.metadata["video.frames_per_second"]

        #############
        ## AGENT
        #############

        agent_class = self.params["agent_class"]
        self.agent = agent_class(self.sess, self.env, self.params["agent_params"])

        #############
        ## INIT VARS
        #############

        tf.global_variables_initializer().run(session=self.sess)

    def run_training_loop(
            self,
            n_iter,
            collect_policy,
            eval_policy,
            initial_expertdata=None,
            relabel_with_expert=False,
            start_relabel_with_expert=1,
            expert_policy=None,
    ):
        """
        :param n_iter: number of (dagger) iterations
        :param collect_policy:
        :param eval_policy:
        :param initial_expertdata:
        :param relabel_with_expert: whether to perform dagger
        :param start_relabel_with_expert: iteration at which to start relabel with expert
        :param expert_policy:
        """

        # init vars at beginning of training
        self.total_envsteps = 0
        self.start_time = time.time()

        for itr in range(n_iter):
            print("\n\n********** Iteration %i ************" % itr)

            # decide if videos should be rendered/logged at this iteration
            if (itr % self.params["video_log_freq"] == 0
                    and self.params["video_log_freq"] != -1):
                self.log_video = True
            else:
                self.log_video = False

            # decide if metrics should be logged
            if itr % self.params["scalar_log_freq"] == 0:
                self.log_metrics = True
            else:
                self.log_metrics = False

            # collect trajectories, to be used for training
            training_returns = self.collect_training_trajectories(
                itr, initial_expertdata, collect_policy, self.params["batch_size"])
            paths, envsteps_this_batch, train_video_paths = training_returns
            self.total_envsteps += envsteps_this_batch

            # relabel the collected obs with actions from a provided expert policy
            if relabel_with_expert and itr >= start_relabel_with_expert:
                paths = self.do_relabel_with_expert(expert_policy, paths)

            # add collected data to replay buffer
            self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            self.train_agent()

            # log/save
            if self.log_video or self.log_metrics:
                # perform logging
                print("\nBeginning logging procedure...")
                self.perform_logging(itr, paths, eval_policy, train_video_paths)

                if self.params["save_params"]:
                    # save policy
                    print("\nSaving agent's actor...")
                    self.agent.actor.save(self.params["logdir"] + "/policy_itr_" + str(itr))

    ####################################
    ####################################

    def collect_training_trajectories(self, itr, load_initial_expertdata,
                                      collect_policy, batch_size):
        # TODO decide whether to load training data or collect new data
        # HINT: depending on if it's the first iteration or not, decide whether to either
        #   load the data -- in this case you can directly
        #   ``` return loaded_paths, 0, None ```
        #   or collect data; batch_size is the number of transitions you want to collect.
        if itr == 0 and load_initial_expertdata:
            with open(load_initial_expertdata, "rb") as f:
                loaded_paths = pickle.load(f)
            return loaded_paths, 0, None

        # TODO collect data to be used for training
        # HINT1: use sample_trajectories from utils
        # HINT2: you want each of these collected rollouts to be of length self.params['ep_len']
        print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = sample_trajectories(
            self.env, collect_policy, batch_size, self.params["ep_len"])

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
        train_video_paths = None
        if self.log_video:
            print("\nCollecting train rollouts to be used for saving videos...")
            ## TODO look in utils and implement sample_n_trajectories
            train_video_paths = sample_n_trajectories(self.env, collect_policy,
                                                      MAX_NVIDEO, MAX_VIDEO_LEN, True)

        return paths, envsteps_this_batch, train_video_paths

    def train_agent(self):
        print("\nTraining agent using sampled data from replay buffer...")
        for train_step in range(self.params["num_agent_train_steps_per_iter"]):
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample(
                self.params["train_batch_size"])
            self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch)

    def do_relabel_with_expert(self, expert_policy, paths):
        print("\nRelabelling collected observations with labels from an expert policy...")

        # relabel collected observations (from our policy) with labels from an expert policy
        for i in range(len(paths)):
            paths[i]["action"] = expert_policy.get_action(paths[i]["observation"])
        return paths

    ####################################
    ####################################

    def perform_logging(self, itr, paths, eval_policy, train_video_paths):

        # collect eval trajectories, for logging
        print("\nCollecting data for eval...")
        eval_paths, eval_envsteps_this_batch = sample_trajectories(
            self.env, eval_policy, self.params["eval_batch_size"], self.params["ep_len"])

        # save eval rollouts as videos in tensorboard event file
        if self.log_video and train_video_paths is not None:
            print("\nCollecting video rollouts eval")
            eval_video_paths = sample_n_trajectories(self.env, eval_policy,
                                                     MAX_NVIDEO, MAX_VIDEO_LEN, True)

            # save train/eval videos
            print("\nSaving train rollouts as videos...")
            self.logger.log_paths_as_videos(
                train_video_paths,
                itr,
                fps=self.fps,
                max_videos_to_save=MAX_NVIDEO,
                video_title="train_rollouts",
            )
            self.logger.log_paths_as_videos(
                eval_video_paths,
                itr,
                fps=self.fps,
                max_videos_to_save=MAX_NVIDEO,
                video_title="eval_rollouts",
            )

        # save eval metrics
        if self.log_metrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]
            eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]
            eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

            # decide what to log
            logs = OrderedDict()
            logs["Eval_AverageReturn"] = np.mean(eval_returns)
            logs["Eval_StdReturn"] = np.std(eval_returns)
            logs["Eval_MaxReturn"] = np.max(eval_returns)
            logs["Eval_MinReturn"] = np.min(eval_returns)
            logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print("{} : {}".format(key, value))
                self.logger.log_scalar(value, key, itr)
            print("Done logging...\n\n")

            self.logger.flush()
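# The relabeling step above is the heart of DAgger: states come from the
# learner's own rollouts, while action labels come from the expert. A toy
# check of those semantics, with a hypothetical stand-in expert:
import numpy as np

class ZeroExpert:
    def get_action(self, obs):
        # vectorized over a batch of observations, like the expert policies above
        return np.zeros((obs.shape[0], 2))

paths = [{"observation": np.random.randn(5, 4),
          "action": np.random.randn(5, 2)}]
expert = ZeroExpert()
for path in paths:
    path["action"] = expert.get_action(path["observation"])
assert (paths[0]["action"] == 0).all()  # actions now carry the expert's labels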
class RL_Trainer(object):

    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger, create TF session
        self.params = params
        self.logger = Logger(self.params['logdir'])
        self.sess = create_tf_session(self.params['use_gpu'],
                                      which_gpu=self.params['which_gpu'])

        # Set random seeds
        seed = self.params['seed']
        tf.set_random_seed(seed)
        np.random.seed(seed)

        #############
        ## ENV
        #############

        # Make the gym environment
        self.env = gym.make(self.params['env_name'])
        self.env.seed(seed)

        # Maximum length for episodes
        # (fall back to the env's default if not specified)
        self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps

        # Is this env continuous or discrete? (continuous in this case)
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.params['agent_params']['discrete'] = discrete

        # Observation and action sizes
        ob_dim = self.env.observation_space.shape[0]
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params['agent_params']['ac_dim'] = ac_dim
        self.params['agent_params']['ob_dim'] = ob_dim

        # simulation timestep, will be used for video saving
        if 'model' in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        else:
            self.fps = self.env.env.metadata['video.frames_per_second']

        #############
        ## AGENT
        #############

        agent_class = self.params['agent_class']
        # the agent's TF graph is built here
        self.agent = agent_class(self.sess, self.env, self.params['agent_params'])

        self.learning_curve = []

        #############
        ## INIT VARS
        #############

        ## TODO initialize all of the TF variables (that were created by agent, etc.)
        ## HINT: use global_variables_initializer
        # each variable specifies its own initializer
        self.sess.run(tf.global_variables_initializer())

    def run_training_loop(self, n_iter, collect_policy, eval_policy,
                          initial_expertdata=None, relabel_with_expert=False,
                          start_relabel_with_expert=1, expert_policy=None):
        """
        :param n_iter: number of (dagger) iterations
        :param collect_policy:
        :param eval_policy:
        :param initial_expertdata:
        :param relabel_with_expert: whether to perform dagger
        :param start_relabel_with_expert: iteration at which to start relabel with expert
        :param expert_policy:
        """

        # init vars at beginning of training
        self.total_envsteps = 0
        self.start_time = time.time()

        for itr in range(n_iter):  # dagger iteration
            print("\n\n********** Iteration %i ************" % itr)

            # decide if videos should be rendered/logged at this iteration
            if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1:
                self.log_video = True
            else:
                self.log_video = False

            # decide if metrics should be logged
            if itr % self.params['scalar_log_freq'] == 0:
                self.log_metrics = True
            else:
                self.log_metrics = False

            # collect trajectories, to be used for training
            training_returns = self.collect_training_trajectories(
                itr, initial_expertdata, collect_policy,
                self.params['batch_size'])  ## TODO implement this function below
            paths, envsteps_this_batch, train_video_paths = training_returns
            self.total_envsteps += envsteps_this_batch

            # relabel the collected obs with actions from a provided expert policy
            # (this is for dagger)
            if relabel_with_expert and itr >= start_relabel_with_expert:
                paths = self.do_relabel_with_expert(expert_policy, paths)  ## TODO implement this function below

            # add collected data to replay buffer
            self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            self.train_agent()  ## TODO implement this function below

            # log/save
            if self.log_video or self.log_metrics:
                # perform logging
                print('\nBeginning logging procedure...')
                self.perform_logging(itr, paths, eval_policy, train_video_paths)

                # save policy
                print('\nSaving agent\'s actor...')
                self.agent.actor.save(self.params['logdir'] + '/policy_itr_' + str(itr))

                np.save(os.path.join(self.params['logdir'], 'learning_curve.npy'),
                        np.array(self.learning_curve))

    ####################################
    ####################################

    def collect_training_trajectories(self, itr, load_initial_expertdata,
                                      collect_policy, batch_size):
        """
        :param itr:
        :param load_initial_expertdata: path to expert data pkl file
        :param collect_policy: the current policy using which we collect data
        :param batch_size: the number of transitions we collect
        :return:
            paths: a list of trajectories
            envsteps_this_batch: the sum over the numbers of environment steps in paths
            train_video_paths: paths which also contain videos for visualization purposes
        """
        # TODO decide whether to load training data or collect new data
        # HINT: depending on if it's the first iteration or not, decide whether to either
        #   load the data -- in this case you can directly
        #   ``` return loaded_paths, 0, None ```
        #   or collect data; batch_size is the number of transitions you want to collect.

        # iteration 0: plain supervised learning on the expert data, no dagger yet
        if (itr == 0) and load_initial_expertdata:
            print("\nCollecting expert data from {}...".format(load_initial_expertdata))
            with open(load_initial_expertdata, 'rb') as f:
                loaded_paths = pickle.loads(f.read())
            return loaded_paths, 0, None

        # TODO collect data to be used for training
        # HINT1: use sample_trajectories from utils
        # HINT2: you want each of these collected rollouts to be of length self.params['ep_len']
        # Two distinct quantities here:
        #   batch_size: total number of env steps summed over all trajectories
        #   self.params['ep_len']: step cap for a single trajectory
        print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = sample_trajectories(
            self.env, collect_policy, batch_size, self.params['ep_len'])

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
        train_video_paths = None
        if self.log_video:
            # these rollouts are not the training rollouts themselves, but they come
            # from the same policy, so they are representative of its behavior
            print('\nCollecting train rollouts to be used for saving videos...')
            ## TODO look in utils and implement sample_n_trajectories
            train_video_paths = sample_n_trajectories(self.env, collect_policy,
                                                      MAX_NVIDEO, MAX_VIDEO_LEN, True)

        return paths, envsteps_this_batch, train_video_paths

    def train_agent(self):
        print('\nTraining agent using sampled data from replay buffer...')
        for train_step in range(self.params['num_agent_train_steps_per_iter']):
            # TODO sample some data from the data buffer
            # HINT1: use the agent's sample function
            # HINT2: how much data = self.params['train_batch_size']
            # (train_batch_size is the per-step training batch;
            #  batch_size above controls data collection per iteration)
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample(
                self.params['train_batch_size'])

            # TODO use the sampled data for training
            # HINT: use the agent's train function
            # HINT: print or plot the loss for debugging!
            self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch)
            print('\nBatch {}: loss: {}'.format(train_step, self.agent.actor.batch_loss))

    def do_relabel_with_expert(self, expert_policy, paths):
        print("\nRelabelling collected observations with labels from an expert policy...")

        # TODO relabel collected observations (from our policy) with labels from an expert policy
        # HINT: query the policy (using the get_action function) with paths[i]["observation"]
        # and replace paths[i]["action"] with these expert labels
        for i in range(len(paths)):
            paths[i]['action'] = expert_policy.get_action(paths[i]['observation'])
        return paths

    ####################################
    ####################################

    def perform_logging(self, itr, paths, eval_policy, train_video_paths):

        # collect eval trajectories, for logging
        print("\nCollecting data for eval...")
        eval_paths, eval_envsteps_this_batch = sample_trajectories(
            self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len'])

        # save eval rollouts as videos in tensorboard event file
        if self.log_video and train_video_paths is not None:
            print('\nCollecting video rollouts eval')
            eval_video_paths = sample_n_trajectories(self.env, eval_policy,
                                                     MAX_NVIDEO, MAX_VIDEO_LEN, True)

            # save train/eval videos
            # (train videos come from before this round of training,
            #  eval videos from after it)
            print('\nSaving train rollouts as videos...')
            self.logger.log_paths_as_videos(train_video_paths, itr,
                                            fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='train_rollouts')
            self.logger.log_paths_as_videos(eval_video_paths, itr,
                                            fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='eval_rollouts')

        # save eval metrics
        if self.log_metrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]
            eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]
            eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

            # decide what to log
            logs = OrderedDict()
            logs["Eval_AverageReturn"] = np.mean(eval_returns)
            logs["Eval_StdReturn"] = np.std(eval_returns)
            logs["Eval_MaxReturn"] = np.max(eval_returns)
            logs["Eval_MinReturn"] = np.min(eval_returns)
            logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)
            self.learning_curve.append(logs["Eval_AverageReturn"])

            # for BC, this is the performance of the expert
            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print('{} : {}'.format(key, value))
                self.logger.log_scalar(value, key, itr)
            print('Performance: {}'.format(np.mean(eval_returns) / self.initial_return))
            print('Done logging...\n\n')

            self.logger.flush()
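# A minimal sketch of the `Logger` interface every trainer above relies on
# (`log_scalar`, `flush`, plus `log_paths_as_videos` for video rollouts),
# backed here by tensorboardX; the course's actual Logger is richer, so treat
# this as an illustrative shim rather than the real implementation.
from tensorboardX import SummaryWriter

class Logger:
    def __init__(self, log_dir):
        self._writer = SummaryWriter(log_dir)

    def log_scalar(self, scalar, name, step):
        # note the (value, name, step) argument order used by the callers above
        self._writer.add_scalar(name, scalar, step)

    def flush(self):
        self._writer.flush()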
class RL_Trainer(object):

    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger
        self.params = params
        self.logger = Logger(self.params['logdir'])

        # Set random seeds
        seed = self.params['seed']
        np.random.seed(seed)
        tf.random.set_seed(seed)

        #############
        ## ENV
        #############

        # Make the gym environment
        register_custom_envs()
        self.env = gym.make(self.params['env_name'])
        self.eval_env = gym.make(self.params['env_name'])
        if not ('pointmass' in self.params['env_name']):
            import matplotlib
            matplotlib.use('Agg')
            self.env.set_logdir(self.params['logdir'] + '/expl_')
            self.eval_env.set_logdir(self.params['logdir'] + '/eval_')

        if 'env_wrappers' in self.params:
            # These operations are currently only for Atari envs
            self.env = wrappers.Monitor(self.env,
                                        os.path.join(self.params['logdir'], "gym"),
                                        force=True)
            self.eval_env = wrappers.Monitor(self.eval_env,
                                             os.path.join(self.params['logdir'], "gym"),
                                             force=True)
            self.env = params['env_wrappers'](self.env)
            self.eval_env = params['env_wrappers'](self.eval_env)
            self.mean_episode_reward = -float('nan')
            self.best_mean_episode_reward = -float('inf')
        if 'non_atari_colab_env' in self.params and self.params['video_log_freq'] > 0:
            self.env = wrappers.Monitor(self.env,
                                        os.path.join(self.params['logdir'], "gym"),
                                        write_upon_reset=True)  # , force=True)
            self.eval_env = wrappers.Monitor(self.eval_env,
                                             os.path.join(self.params['logdir'], "gym"),
                                             write_upon_reset=True)
            self.mean_episode_reward = -float('nan')
            self.best_mean_episode_reward = -float('inf')
        self.env.seed(seed)
        self.eval_env.seed(seed)

        # Maximum length for episodes
        self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
        global MAX_VIDEO_LEN
        MAX_VIDEO_LEN = self.params['ep_len']

        # Is this env continuous or discrete?
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        # Are the observations images?
        img = len(self.env.observation_space.shape) > 2
        self.params['agent_params']['discrete'] = discrete

        # Observation and action sizes
        ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0]
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params['agent_params']['ac_dim'] = ac_dim
        self.params['agent_params']['ob_dim'] = ob_dim

        # simulation timestep, will be used for video saving
        if 'model' in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        elif 'env_wrappers' in self.params:
            self.fps = 30  # This is not actually used when using the Monitor wrapper
        elif 'video.frames_per_second' in self.env.env.metadata.keys():
            self.fps = self.env.env.metadata['video.frames_per_second']
        else:
            self.fps = 10

        #############
        ## AGENT
        #############

        agent_class = self.params['agent_class']
        self.agent = agent_class(self.env, self.params['agent_params'])

    def run_training_loop(self, n_iter, collect_policy, eval_policy,
                          buffer_name=None, initial_expertdata=None,
                          relabel_with_expert=False, start_relabel_with_expert=1,
                          expert_policy=None):
        """
        :param n_iter: number of (dagger) iterations
        :param collect_policy:
        :param eval_policy:
        :param initial_expertdata:
        :param relabel_with_expert: whether to perform dagger
        :param start_relabel_with_expert: iteration at which to start relabel with expert
        :param expert_policy:
        """

        # init vars at beginning of training
        self.total_envsteps = 0
        self.start_time = time.time()

        print_period = 1000 if isinstance(self.agent, ExplorationOrExploitationAgent) else 1

        for itr in range(n_iter):
            if itr % print_period == 0:
                print("\n\n********** Iteration %i ************" % itr)

            # decide if videos should be rendered/logged at this iteration
            if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1:
                self.logvideo = True
            else:
                self.logvideo = False

            # decide if metrics should be logged
            if self.params['scalar_log_freq'] == -1:
                self.logmetrics = False
            elif itr % self.params['scalar_log_freq'] == 0:
                self.logmetrics = True
            else:
                self.logmetrics = False

            # collect trajectories, to be used for training
            if isinstance(self.agent, ExplorationOrExploitationAgent):
                self.agent.step_env()
                envsteps_this_batch = 1
                train_video_paths = None
                paths = None
            else:
                use_batchsize = self.params['batch_size']
                if itr == 0:
                    use_batchsize = self.params['batch_size_initial']
                paths, envsteps_this_batch, train_video_paths = (
                    self.collect_training_trajectories(itr, initial_expertdata,
                                                       collect_policy, use_batchsize))

            if (not self.agent.offline_exploitation) or (
                    self.agent.t <= self.agent.num_exploration_steps):
                self.total_envsteps += envsteps_this_batch

            # relabel the collected obs with actions from a provided expert policy
            if relabel_with_expert and itr >= start_relabel_with_expert:
                paths = self.do_relabel_with_expert(expert_policy, paths)

            # add collected data to replay buffer
            if isinstance(self.agent, ExplorationOrExploitationAgent):
                if (not self.agent.offline_exploitation) or (
                        self.agent.t <= self.agent.num_exploration_steps):
                    self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            if itr % print_period == 0:
                print("\nTraining agent...")
            all_logs = self.train_agent()

            # Log densities and output trajectories
            if isinstance(self.agent, ExplorationOrExploitationAgent) and (
                    itr % print_period == 0):
                self.dump_density_graphs(itr)

            # log/save
            if self.logvideo or self.logmetrics:
                # perform logging
                print('\nBeginning logging procedure...')
                if isinstance(self.agent, ExplorationOrExploitationAgent):
                    self.perform_dqn_logging(all_logs)
                else:
                    self.perform_logging(itr, paths, eval_policy, train_video_paths, all_logs)

                if self.params['save_params']:
                    self.agent.save('{}/agent_itr_{}.pt'.format(self.params['logdir'], itr))

    ####################################
    ####################################

    def collect_training_trajectories(self, itr, initial_expertdata, collect_policy,
                                      num_transitions_to_sample,
                                      save_expert_data_to_disk=False):
        """
        :param itr:
        :param initial_expertdata: path to expert data pkl file
        :param collect_policy: the current policy using which we collect data
        :param num_transitions_to_sample: the number of transitions we collect
        :return:
            paths: a list of trajectories
            envsteps_this_batch: the sum over the numbers of environment steps in paths
            train_video_paths: paths which also contain videos for visualization purposes
        """
        # TODO: get this from hw1 or hw2
        print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = utils.sample_trajectories(
            env=self.env,
            policy=collect_policy,
            min_timesteps_per_batch=num_transitions_to_sample,
            max_path_length=self.params['ep_len'])

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
        train_video_paths = None
        if self.logvideo:
            print('\nCollecting train rollouts to be used for saving videos...')
            ## TODO look in utils and implement sample_n_trajectories
            train_video_paths = utils.sample_n_trajectories(
                self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

        return paths, envsteps_this_batch, train_video_paths

    ####################################
    ####################################

    def train_agent(self):
        # TODO: get this from Piazza
        print('\nTraining agent using sampled data from replay buffer...')
        all_logs = []
        for train_step in range(self.params['num_agent_train_steps_per_iter']):
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample(
                self.params['train_batch_size'])
            # import ipdb; ipdb.set_trace()
            train_log = self.agent.train(ob_batch, ac_batch, re_batch,
                                         next_ob_batch, terminal_batch)
            all_logs.append(train_log)
        return all_logs

    ####################################
    ####################################

    def do_relabel_with_expert(self, expert_policy, paths):
        raise NotImplementedError
        # get this from hw1 or hw2, or ignore it b/c it's not used for this hw

    ####################################
    ####################################

    def perform_dqn_logging(self, all_logs):
        last_log = all_logs[-1]

        episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            self.mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            self.best_mean_episode_reward = max(self.best_mean_episode_reward,
                                                self.mean_episode_reward)

        logs = OrderedDict()

        logs["Train_EnvstepsSoFar"] = self.agent.t
        print("Timestep %d" % (self.agent.t,))
        if self.mean_episode_reward > -5000:
            logs["Train_AverageReturn"] = np.mean(self.mean_episode_reward)
        print("mean reward (100 episodes) %f" % self.mean_episode_reward)
        if self.best_mean_episode_reward > -5000:
            logs["Train_BestReturn"] = np.mean(self.best_mean_episode_reward)
        print("best mean reward %f" % self.best_mean_episode_reward)

        if self.start_time is not None:
            time_since_start = (time.time() - self.start_time)
            print("running time %f" % time_since_start)
            logs["TimeSinceStart"] = time_since_start

        logs.update(last_log)

        eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(
            self.eval_env, self.agent.eval_policy,
            self.params['eval_batch_size'], self.params['ep_len'])

        eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]
        eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

        logs["Eval_AverageReturn"] = np.mean(eval_returns)
        logs["Eval_StdReturn"] = np.std(eval_returns)
        logs["Eval_MaxReturn"] = np.max(eval_returns)
        logs["Eval_MinReturn"] = np.min(eval_returns)
        logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)
        logs['Buffer size'] = self.agent.replay_buffer.num_in_buffer

        sys.stdout.flush()

        for key, value in logs.items():
            print('{} : {}'.format(key, value))
            self.logger.log_scalar(value, key, self.agent.t)
        print('Done logging...\n\n')

        self.logger.flush()

    def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs):

        last_log = all_logs[-1]

        #######################

        # collect eval trajectories, for logging
        print("\nCollecting data for eval...")
        eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(
            self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len'])

        # save eval rollouts as videos in tensorboard event file
        if self.logvideo and train_video_paths is not None:
            print('\nCollecting video rollouts eval')
            eval_video_paths = utils.sample_n_trajectories(
                self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

            # save train/eval videos
            print('\nSaving train rollouts as videos...')
            self.logger.log_paths_as_videos(train_video_paths, itr,
                                            fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='train_rollouts')
            self.logger.log_paths_as_videos(eval_video_paths, itr,
                                            fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='eval_rollouts')

        #######################

        # save eval metrics
        if self.logmetrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]
            eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]
            eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

            # decide what to log
            logs = OrderedDict()
            logs["Eval_AverageReturn"] = np.mean(eval_returns)
            logs["Eval_StdReturn"] = np.std(eval_returns)
            logs["Eval_MaxReturn"] = np.max(eval_returns)
            logs["Eval_MinReturn"] = np.min(eval_returns)
            logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time
            logs.update(last_log)

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print('{} : {}'.format(key, value))
                try:
                    self.logger.log_scalar(value, key, itr)
                except:
                    pdb.set_trace()
            print('Done logging...\n\n')

            self.logger.flush()

    def dump_density_graphs(self, itr):
        import matplotlib.pyplot as plt
        self.fig = plt.figure()
        filepath = lambda name: self.params['logdir'] + '/curr_{}.png'.format(name)

        num_states = self.agent.replay_buffer.num_in_buffer - 2
        states = self.agent.replay_buffer.obs[:num_states]
        if num_states <= 0:
            return

        H, xedges, yedges = np.histogram2d(states[:, 0], states[:, 1],
                                           range=[[0., 1.], [0., 1.]], density=True)
        plt.imshow(np.rot90(H), interpolation='bicubic')
        plt.colorbar()
        plt.title('State Density')
        self.fig.savefig(filepath('state_density'), bbox_inches='tight')

        plt.clf()
        ii, jj = np.meshgrid(np.linspace(0, 1), np.linspace(0, 1))
        obs = np.stack([ii.flatten(), jj.flatten()], axis=1)
        density = self.agent.exploration_model.forward_np(obs)
        density = density.reshape(ii.shape)
        plt.imshow(density[::-1])
        plt.colorbar()
        plt.title('RND Value')
        self.fig.savefig(filepath('rnd_value'), bbox_inches='tight')

        plt.clf()
        exploitation_values = self.agent.exploitation_critic.qa_values(obs).mean(-1)
        exploitation_values = exploitation_values.reshape(ii.shape)
        plt.imshow(exploitation_values[::-1])
        plt.colorbar()
        plt.title('Predicted Exploitation Value')
        self.fig.savefig(filepath('exploitation_value'), bbox_inches='tight')

        plt.clf()
        exploration_values = self.agent.exploration_critic.qa_values(obs).mean(-1)
        exploration_values = exploration_values.reshape(ii.shape)
        plt.imshow(exploration_values[::-1])
        plt.colorbar()
        plt.title('Predicted Exploration Value')
        self.fig.savefig(filepath('exploration_value'), bbox_inches='tight')
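# Note on the state-density plot above: np.histogram2d returns H indexed as
# H[x_bin, y_bin], while plt.imshow treats the first axis as image rows (y),
# so np.rot90(H) is what puts x on the horizontal axis with y increasing
# upward. A quick self-contained check of the shapes involved:
import numpy as np

pts = np.random.rand(1000, 2)
H, xedges, yedges = np.histogram2d(pts[:, 0], pts[:, 1],
                                   range=[[0., 1.], [0., 1.]], density=True)
assert H.shape == (10, 10)    # 10 bins per axis by default
assert xedges.shape == (11,)  # bin edges are one longer than bin counts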
class RL_Trainer(object): def __init__(self, params): ############# # INIT ############# # Get params, create logger, create TF session self.params = params self.logger = Logger(self.params['logdir']) self.sess = create_tf_session(self.params['use_gpu'], which_gpu=self.params['which_gpu']) # Set random seeds seed = self.params['seed'] tf.set_random_seed(seed) np.random.seed(seed) ############# # ENV ############# # Make the gym environment self.env = gym.make(self.params['env_name']) if 'env_wrappers' in self.params: # These operations are currently only for Atari envs self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), force=True) self.env = params['env_wrappers'](self.env) self.mean_episode_reward = -float('nan') self.best_mean_episode_reward = -float('inf') self.env.seed(seed) # Maximum length for episodes self.params['ep_len'] = self.params[ 'ep_len'] or self.env.spec.max_episode_steps MAX_VIDEO_LEN = self.params['ep_len'] # Is this env continuous, or self.discrete? discrete = isinstance(self.env.action_space, gym.spaces.Discrete) # Are the observations images? img = len(self.env.observation_space.shape) > 2 self.params['agent_params']['discrete'] = discrete # Observation and action sizes ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[ 0] ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[ 0] self.params['agent_params']['ac_dim'] = ac_dim self.params['agent_params']['ob_dim'] = ob_dim # simulation timestep, will be used for video saving if 'model' in dir(self.env): self.fps = 1 / self.env.model.opt.timestep elif 'env_wrappers' in self.params: self.fps = 30 # This is not actually used when using the Monitor wrapper else: self.fps = self.env.env.metadata['video.frames_per_second'] ############# # AGENT ############# agent_class = self.params['agent_class'] self.agent = agent_class(self.sess, self.env, self.params['agent_params']) ############# # INIT VARS ############# tf.global_variables_initializer().run(session=self.sess) def run_training_loop(self, n_iter, collect_policy, eval_policy, initial_expertdata=None, relabel_with_expert=False, start_relabel_with_expert=1, expert_policy=None): """ :param n_iter: number of (dagger) iterations :param collect_policy: :param eval_policy: :param initial_expertdata: :param relabel_with_expert: whether to perform dagger :param start_relabel_with_expert: iteration at which to start relabel with expert :param expert_policy: """ # init vars at beginning of training self.total_envsteps = 0 self.start_time = time.time() for itr in range(n_iter): # print("\n\n********** Iteration %i ************" % itr) # decide if videos should be rendered/logged at this iteration if itr % self.params['video_log_freq'] == 0 and self.params[ 'video_log_freq'] != -1: self.log_video = True else: self.log_video = False # decide if metrics should be logged if self.params['scalar_log_freq'] == -1: self.logmetrics = False elif itr % self.params['scalar_log_freq'] == 0: self.logmetrics = True else: self.logmetrics = False # collect trajectories, to be used for training if isinstance(self.agent, DQNAgent): # only perform an env step and add to replay buffer for DQN self.agent.step_env() envsteps_this_batch = 1 train_video_paths = None paths = None else: paths, envsteps_this_batch, train_video_paths = self.collect_training_trajectories( itr, initial_expertdata, collect_policy, self.params['batch_size']) self.total_envsteps += envsteps_this_batch # relabel the collected obs with actions 
from a provided expert policy
            if relabel_with_expert and itr >= start_relabel_with_expert:
                paths = self.do_relabel_with_expert(expert_policy, paths)

            # add collected data to replay buffer
            self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            loss = self.train_agent()

            # log/save
            if self.log_video or self.logmetrics:
                # perform logging
                print('\nBeginning logging procedure...')
                if isinstance(self.agent, DQNAgent):
                    self.perform_dqn_logging()
                else:
                    self.perform_logging(itr, paths, eval_policy,
                                         train_video_paths, loss)

                # save policy
                if self.params['save_params']:
                    print('\nSaving agent\'s actor...')
                    self.agent.actor.save(self.params['logdir'] +
                                          '/policy_itr_' + str(itr))
                    self.agent.critic.save(self.params['logdir'] +
                                           '/critic_itr_' + str(itr))

    ####################################
    ####################################

    def collect_training_trajectories(self, itr, load_initial_expertdata,
                                      collect_policy, batch_size):
        # TODO: GETTHIS from HW1
        """
        :param itr:
        :param load_initial_expertdata: path to expert data pkl file
        :param collect_policy: the current policy using which we collect data
        :param batch_size: the number of transitions we collect
        :return:
            paths: a list of trajectories
            envsteps_this_batch: the sum over the numbers of environment steps in paths
            train_video_paths: paths which also contain videos for visualization purposes
        """
        # TODO decide whether to load training data or use the current policy
        # to collect more data
        # HINT: depending on if it's the first iteration or not, decide whether to either
        # load the data. In this case you can directly return as follows
        # ``` return loaded_paths, 0, None ```
        # collect data, batch_size is the number of transitions you want to collect.
        if not itr and load_initial_expertdata:
            with open(load_initial_expertdata, 'rb') as f:
                initial_expert_data = pickle.load(f)
            return initial_expert_data, 0, None

        # TODO collect data to be used for training
        # HINT1: use sample_trajectories from utils
        # HINT2: you want each of these collected rollouts to be of length self.params['ep_len']
        # print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = sample_trajectories(
            self.env, collect_policy, batch_size,
            max_path_length=self.params['ep_len'])

        # TODO
        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
        train_video_paths = None
        if self.log_video:
            print('\nCollecting train rollouts to be used for saving videos...')
            # TODO look in utils and implement sample_n_trajectories
            train_video_paths = sample_n_trajectories(self.env, collect_policy,
                                                      MAX_NVIDEO,
                                                      MAX_VIDEO_LEN, True)

        return paths, envsteps_this_batch, train_video_paths

    def train_agent(self):
        # TODO: GETTHIS from HW1
        # print('\nTraining agent using sampled data from replay buffer...')
        for train_step in range(self.params['num_agent_train_steps_per_iter']):
            # TODO sample some data from the data buffer
            # HINT1: use the agent's sample function
            # HINT2: how much data = self.params['train_batch_size']
            # ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = None #TODO
            sampled_data = self.agent.sample(self.params['train_batch_size'])

            # TODO use the sampled data for training
            # HINT: use the agent's train function
            # HINT: print or plot the loss for debugging!
            loss = self.agent.train(*sampled_data)
            # self.training_loss += [loss]
            # print(f'loss {loss}')
        return loss

    def do_relabel_with_expert(self, expert_policy, paths):
        # TODO: GETTHIS from HW1 (although you don't actually need it for this homework)
        print("\nRelabelling collected observations with labels from an expert policy...")

        # TODO relabel collected observations (from our policy) with labels from an expert policy
        # HINT: query the policy (using the get_action function) with paths[i]["observation"]
        # and replace paths[i]["action"] with these expert labels
        for path in paths:
            path['action'] = expert_policy.get_action(path['observation'])
        return paths

    ####################################
    ####################################

    def perform_dqn_logging(self):
        episode_rewards = get_wrapper_by_name(self.env,
                                              "Monitor").get_episode_rewards()
        episode_rewards_len = len(episode_rewards)
        if episode_rewards_len > 0:
            self.mean_episode_reward = np.mean(episode_rewards[-100:])
        if episode_rewards_len > 100:
            self.best_mean_episode_reward = max(self.best_mean_episode_reward,
                                                self.mean_episode_reward)

        logs = OrderedDict()
        logs["Train_EnvstepsSoFar"] = self.agent.t
        print("Timestep %d" % (self.agent.t, ))
        if self.mean_episode_reward > -5000:
            logs["Train_AverageReturn"] = np.mean(self.mean_episode_reward)
        print("mean reward (100 episodes) %f" % self.mean_episode_reward)
        if self.best_mean_episode_reward > -5000:
            logs["Train_BestReturn"] = np.mean(self.best_mean_episode_reward)
        print("best mean reward %f" % self.best_mean_episode_reward)
        print(f'episode len: {episode_rewards_len}')

        if self.start_time is not None:
            time_since_start = (time.time() - self.start_time)
            print("running time %f" % time_since_start)
            logs["TimeSinceStart"] = time_since_start

        sys.stdout.flush()
        for key, value in logs.items():
            print('{} : {}'.format(key, value))
            self.logger.log_scalar(value, key, self.agent.t)
        print('Done logging...\n\n')
        self.logger.flush()

    def perform_logging(self, itr, paths, eval_policy, train_video_paths, loss):

        # collect eval trajectories, for logging
        print("\nCollecting data for eval...")
        eval_paths, eval_envsteps_this_batch = sample_trajectories(
            self.env, eval_policy, self.params['eval_batch_size'],
            self.params['ep_len'])

        # save eval rollouts as videos in tensorboard event file
        if self.log_video and train_video_paths is not None:
            print('\nCollecting video rollouts eval')
            eval_video_paths = sample_n_trajectories(self.env, eval_policy,
                                                     MAX_NVIDEO,
                                                     MAX_VIDEO_LEN, True)

            # save train/eval videos
            print('\nSaving train rollouts as videos...')
            self.logger.log_paths_as_videos(train_video_paths, itr,
                                            fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='train_rollouts')
            self.logger.log_paths_as_videos(eval_video_paths, itr,
                                            fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='eval_rollouts')

        # save eval metrics
        if self.logmetrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]
            eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]
            eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

            # decide what to log
            logs = OrderedDict()
            logs["Eval_AverageReturn"] = np.mean(eval_returns)
            logs["Eval_StdReturn"] = np.std(eval_returns)
            logs["Eval_MaxReturn"] = np.max(eval_returns)
            logs["Eval_MinReturn"] = np.min(eval_returns)
            logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time
            if isinstance(loss, dict):
                logs.update(loss)
            else:
                logs["Training loss"] = loss

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print('{} : {}'.format(key, value))
                self.logger.log_scalar(value, key, itr)
            print('Done logging...\n\n')

            self.logger.flush()
class RL_Trainer(object): def __init__(self, params): ############# ## INIT ############# # Get params, create logger self.params = params self.logger = Logger(self.params["logdir"]) # Set random seeds seed = self.params["seed"] np.random.seed(seed) torch.manual_seed(seed) ptu.init_gpu(use_gpu=not self.params["no_gpu"], gpu_id=self.params["which_gpu"]) ############# ## ENV ############# # Make the gym environment self.env = gym.make(self.params["env_name"]) self.env.seed(seed) # import plotting (locally if 'obstacles' env) if not (self.params["env_name"] == "obstacles-cs285-v0"): import matplotlib matplotlib.use("Agg") # Maximum length for episodes self.params["ep_len"] = self.params[ "ep_len"] or self.env.spec.max_episode_steps global MAX_VIDEO_LEN MAX_VIDEO_LEN = self.params["ep_len"] # Is this env continuous, or self.discrete? discrete = isinstance(self.env.action_space, gym.spaces.Discrete) # Are the observations images? img = len(self.env.observation_space.shape) > 2 self.params["agent_params"]["discrete"] = discrete # Observation and action sizes ob_dim = (self.env.observation_space.shape if img else self.env.observation_space.shape[0]) ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[ 0] self.params["agent_params"]["ac_dim"] = ac_dim self.params["agent_params"]["ob_dim"] = ob_dim # simulation timestep, will be used for video saving if "model" in dir(self.env): self.fps = 1 / self.env.model.opt.timestep elif "env_wrappers" in self.params: self.fps = 30 # This is not actually used when using the Monitor wrapper elif "video.frames_per_second" in self.env.env.metadata.keys(): self.fps = self.env.env.metadata["video.frames_per_second"] else: self.fps = 10 ############# ## AGENT ############# agent_class = self.params["agent_class"] self.agent = agent_class(self.env, self.params["agent_params"]) def run_training_loop( self, n_iter, collect_policy, eval_policy, initial_expertdata=None, relabel_with_expert=False, start_relabel_with_expert=1, expert_policy=None, ): """ :param n_iter: number of (dagger) iterations :param collect_policy: :param eval_policy: :param initial_expertdata: :param relabel_with_expert: whether to perform dagger :param start_relabel_with_expert: iteration at which to start relabel with expert :param expert_policy: """ # init vars at beginning of training self.total_envsteps = 0 self.start_time = time.time() for itr in trange(n_iter): print("\n\n********** Iteration %i ************" % itr) # decide if videos should be rendered/logged at this iteration if (itr % self.params["video_log_freq"] == 0 and self.params["video_log_freq"] != -1): self.logvideo = True else: self.logvideo = False self.log_video = self.logvideo # decide if metrics should be logged if self.params["scalar_log_freq"] == -1: self.logmetrics = False elif itr % self.params["scalar_log_freq"] == 0: self.logmetrics = True else: self.logmetrics = False # collect trajectories, to be used for training training_returns = self.collect_training_trajectories( itr, initial_expertdata, collect_policy, self.params["batch_size"]) paths, envsteps_this_batch, train_video_paths = training_returns self.total_envsteps += envsteps_this_batch # add collected data to replay buffer self.agent.add_to_replay_buffer(paths) # train agent (using sampled data from replay buffer) train_logs = self.train_agent() # log/save if self.logvideo or self.logmetrics: # perform logging print("\nBeginning logging procedure...") self.perform_logging(itr, paths, eval_policy, train_video_paths, train_logs) if 
self.params["save_params"]: self.agent.save("{}/agent_itr_{}.pt".format( self.params["logdir"], itr)) #################################### #################################### def collect_training_trajectories(self, itr, load_initial_expertdata, collect_policy, batch_size): # TODO: get this from hw1 # if your load_initial_expertdata is None, then you need to collect new trajectories at *every* iteration """ :param itr: :param load_initial_expertdata: path to expert data pkl file :param collect_policy: the current policy using which we collect data :param batch_size: the number of transitions we collect :return: paths: a list trajectories envsteps_this_batch: the sum over the numbers of environment steps in paths train_video_paths: paths which also contain videos for visualization purposes """ # TODO decide whether to load training data or use the current policy to collect more data # HINT: depending on if it's the first iteration or not, decide whether to either # (1) load the data. In this case you can directly return as follows # ``` return loaded_paths, 0, None ``` # (2) collect `self.params['batch_size']` transitions if load_initial_expertdata is not None and itr == 0: with open(load_initial_expertdata, "rb") as f: loaded_paths = pickle.loads(f.read()) return loaded_paths, 0, None # TODO collect `batch_size` samples to be used for training # HINT1: use sample_trajectories from utils # HINT2: you want each of these collected rollouts to be of length self.params['ep_len'] print("\nCollecting data to be used for training...") paths, envsteps_this_batch = utils.sample_trajectories( self.env, collect_policy, batch_size, self.params["ep_len"]) # collect more rollouts with the same policy, to be saved as videos in tensorboard # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN train_video_paths = None if self.log_video: print( "\nCollecting train rollouts to be used for saving videos...") ## TODO look in utils and implement sample_n_trajectories train_video_paths = utils.sample_n_trajectories( self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) return paths, envsteps_this_batch, train_video_paths def train_agent(self): # TODO: get this from hw1 print("\nTraining agent using sampled data from replay buffer...") all_logs = [] for train_step in range(self.params["num_agent_train_steps_per_iter"]): # TODO sample some data from the data buffer # HINT1: use the agent's sample function # HINT2: how much data = self.params['train_batch_size'] ( ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch, ) = self.agent.sample(self.params["train_batch_size"]) # TODO use the sampled data to train an agent # HINT: use the agent's train function # HINT: keep the agent's training log for debugging train_log = self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch) all_logs.append(train_log) return all_logs #################################### #################################### def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs): last_log = all_logs[-1] ####################### # collect eval trajectories, for logging print("\nCollecting data for eval...") eval_paths, eval_envsteps_this_batch = utils.sample_trajectories( self.env, eval_policy, self.params["eval_batch_size"], self.params["ep_len"]) # save eval rollouts as videos in tensorboard event file if self.logvideo and train_video_paths != None: print("\nCollecting video rollouts eval") eval_video_paths = utils.sample_n_trajectories( self.env, eval_policy, MAX_NVIDEO, 
MAX_VIDEO_LEN, True) # save train/eval videos print("\nSaving train rollouts as videos...") self.logger.log_paths_as_videos( train_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, video_title="train_rollouts", ) self.logger.log_paths_as_videos( eval_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, video_title="eval_rollouts", ) ####################### # save eval metrics if self.logmetrics: # returns, for logging train_returns = [path["reward"].sum() for path in paths] eval_returns = [ eval_path["reward"].sum() for eval_path in eval_paths ] # episode lengths, for logging train_ep_lens = [len(path["reward"]) for path in paths] eval_ep_lens = [ len(eval_path["reward"]) for eval_path in eval_paths ] # decide what to log logs = OrderedDict() logs["Eval_AverageReturn"] = np.mean(eval_returns) logs["Eval_StdReturn"] = np.std(eval_returns) logs["Eval_MaxReturn"] = np.max(eval_returns) logs["Eval_MinReturn"] = np.min(eval_returns) logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens) logs["Train_AverageReturn"] = np.mean(train_returns) logs["Train_StdReturn"] = np.std(train_returns) logs["Train_MaxReturn"] = np.max(train_returns) logs["Train_MinReturn"] = np.min(train_returns) logs["Train_AverageEpLen"] = np.mean(train_ep_lens) logs["Train_EnvstepsSoFar"] = self.total_envsteps logs["TimeSinceStart"] = time.time() - self.start_time logs.update(last_log) if itr == 0: self.initial_return = np.mean(train_returns) logs["Initial_DataCollection_AverageReturn"] = self.initial_return # perform the logging for key, value in logs.items(): print("{} : {}".format(key, value)) self.logger.log_scalar(value, key, itr) print("Done logging...\n\n") self.logger.flush()
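# ---------------------------------------------------------------------------
# Hypothetical usage of the PyTorch trainer above. The dictionary keys are the
# ones the class actually reads; the values, and the PGAgent name, are
# illustrative placeholders (assuming the agent exposes an `actor` policy, as
# the sibling trainers in this file do).
# ---------------------------------------------------------------------------
#
# params = {
#     'logdir': 'data/pg_cartpole', 'seed': 1, 'no_gpu': True, 'which_gpu': 0,
#     'env_name': 'CartPole-v0', 'ep_len': None,
#     'video_log_freq': -1, 'scalar_log_freq': 1,
#     'batch_size': 1000, 'eval_batch_size': 400, 'train_batch_size': 1000,
#     'num_agent_train_steps_per_iter': 1, 'save_params': False,
#     'agent_class': PGAgent, 'agent_params': {},
# }
# trainer = RL_Trainer(params)
# trainer.run_training_loop(n_iter=100,
#                           collect_policy=trainer.agent.actor,
#                           eval_policy=trainer.agent.actor)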
def __init__(self, params): ############# ## INIT ############# # Get params, create logger self.params = params self.logger = Logger(self.params['logdir']) # Set random seeds seed = self.params['seed'] np.random.seed(seed) torch.manual_seed(seed) ptu.init_gpu(use_gpu=not self.params['no_gpu'], gpu_id=self.params['which_gpu']) ############# ## ENV ############# # Make the gym environment #register_custom_envs() self.env = City((self.params['width'], self.params['height']), self.params['n_drivers'], self.params['n_restaurants']) """ if 'env_wrappers' in self.params: # These operations are currently only for Atari envs self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), force=True) self.env = params['env_wrappers'](self.env) self.mean_episode_reward = -float('nan') self.best_mean_episode_reward = -float('inf') if 'non_atari_colab_env' in self.params and self.params['video_log_freq'] > 0: self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), force=True) self.mean_episode_reward = -float('nan') self.best_mean_episode_reward = -float('inf') """ self.env.seed(seed) # import plotting (locally if 'obstacles' env) if not (self.params['env_name'] == 'obstacles-cs285-v0'): import matplotlib matplotlib.use('Agg') # Maximum length for episodes self.params['ep_len'] = self.params[ 'ep_len'] or self.env.spec.max_episode_steps global MAX_VIDEO_LEN MAX_VIDEO_LEN = self.params['ep_len'] # Is this env multi binary, or self.discrete? #multi_bi = isinstance(self.env.action_space, gym.spaces.MultiBinary) is_city = True # Are the observations images? img = False self.params['agent_params']['is_city'] = is_city # Observation and action sizes #ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0] #ac_dim = self.env.action_space.n if multi_bi else self.env.action_space.shape[0] #ob_dim = self.env.observation_space.shape[0] #ac_dim = self.env.action_space.shape[0] self.params['agent_params']['n_drivers'] = self.params['n_drivers'] self.params['agent_params']['ac_dim'] = self.params['n_drivers'] self.params['agent_params']['ob_dim'] = (self.params['n_drivers'], (3 + 2 * MAX_CAP + 5 + 5 * MAX_CAND_NUM)) self.params['agent_params']['shared_exp'] = self.params['shared_exp'] self.params['agent_params']['shared_exp_lambda'] = self.params[ 'shared_exp_lambda'] self.params['agent_params']['size_ac'] = self.params['size_ac'] self.params['agent_params']['size_cr'] = self.params['size_cr'] # simulation timestep, will be used for video saving #if 'model' in dir(self.env): # self.fps = 1/self.env.model.opt.timestep #elif 'env_wrappers' in self.params: # self.fps = 30 # This is not actually used when using the Monitor wrapper #elif 'video.frames_per_second' in self.env.env.metadata.keys(): # self.fps = self.env.env.metadata['video.frames_per_second'] #else: # self.fps = 10 ############# ## AGENT ############# agent_class = self.params['agent_class'] self.agent = agent_class(self.env, self.params['agent_params'])
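# ---------------------------------------------------------------------------
# Hypothetical construction of the City-based trainer above. Every key below
# is one this __init__ actually reads; the values and the DispatchAgent name
# are illustrative only (the City env and agent classes are not shown here).
# ---------------------------------------------------------------------------
#
# params = {
#     'logdir': 'data/city_dispatch', 'seed': 1, 'no_gpu': True, 'which_gpu': 0,
#     'env_name': 'city-v0', 'ep_len': 100,
#     'width': 10, 'height': 10, 'n_drivers': 4, 'n_restaurants': 6,
#     'shared_exp': True, 'shared_exp_lambda': 0.5,
#     'size_ac': 64, 'size_cr': 64,
#     'agent_class': DispatchAgent, 'agent_params': {},
# }
# trainer = RL_Trainer(params)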
def main(): import argparse parser = argparse.ArgumentParser() parser.add_argument( '--expert_policy_file', '-epf', type=str, required=True) # relative to where you're running this script from parser.add_argument( '--expert_data', '-ed', type=str, required=True) #relative to where you're running this script from parser.add_argument( '--env_name', '-env', type=str, help= 'choices: Ant-v2, Humanoid-v2, Walker-v2, HalfCheetah-v2, Hopper-v2', required=True) parser.add_argument('--exp_name', '-exp', type=str, default='pick an experiment name', required=True) parser.add_argument('--do_dagger', action='store_true') parser.add_argument('--ep_len', type=int) parser.add_argument( '--num_agent_train_steps_per_iter', type=int, default=1000 ) # number of gradient steps for training policy (per iter in n_iter) parser.add_argument('--n_iter', '-n', type=int, default=1) parser.add_argument( '--batch_size', type=int, default=1000 ) # training data collected (in the env) during each iteration parser.add_argument( '--eval_batch_size', type=int, default=200) # eval data collected (in the env) for logging metrics parser.add_argument( '--train_batch_size', type=int, default=100 ) # number of sampled data points to be used per gradient/train step parser.add_argument('--n_layers', type=int, default=2) # depth, of policy to be learned parser.add_argument( '--size', type=int, default=64) # width of each layer, of policy to be learned parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) # LR for supervised learning parser.add_argument('--video_log_freq', type=int, default=5) parser.add_argument('--scalar_log_freq', type=int, default=1) parser.add_argument('--use_gpu', action='store_true') parser.add_argument('--which_gpu', type=int, default=0) parser.add_argument('--max_replay_buffer_size', type=int, default=1000000) parser.add_argument('--seed', type=int, default=1) args = parser.parse_args() # convert args to dictionary params = vars(args) ################################## ### CREATE DIRECTORY FOR LOGGING ################################## logdir_prefix = 'bc_' if args.do_dagger: logdir_prefix = 'dagger_' assert args.n_iter > 1, ( 'DAGGER needs more than 1 iteration (n_iter>1) of training, to iteratively query the expert and train (after 1st warmstarting from behavior cloning).' ) else: assert args.n_iter == 1, ( 'Vanilla behavior cloning collects expert data just once (n_iter=1)' ) ## directory for logging data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') if not (os.path.exists(data_path)): os.makedirs(data_path) logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_section_2' logdir = os.path.join(data_path, logdir) params['logdir'] = logdir if not (os.path.exists(logdir)): os.makedirs(logdir) ################### ### RUN TRAINING ################### logger = Logger(params['logdir']) returns = [] trainer = BC_Trainer(params, logger) log = trainer.run_training_loop() trainer.run_logging_loop(10) logger.flush()
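# ---------------------------------------------------------------------------
# Example invocations implied by the argument parser and the n_iter asserts
# above. The script filename is not shown in this excerpt, so run_hw1.py and
# all file paths are placeholders.
# ---------------------------------------------------------------------------
#
# Vanilla behavior cloning trains on the expert data exactly once (n_iter=1):
#   python run_hw1.py --expert_policy_file experts/Ant.pkl \
#       --expert_data expert_data/Ant-v2.pkl --env_name Ant-v2 \
#       --exp_name bc_ant --n_iter 1
#
# DAgger relabels newly collected states with the expert each round (n_iter>1):
#   python run_hw1.py --expert_policy_file experts/Ant.pkl \
#       --expert_data expert_data/Ant-v2.pkl --env_name Ant-v2 \
#       --exp_name dagger_ant --do_dagger --n_iter 10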
class RL_Trainer(object): def __init__(self, params): ##INIT self.params = params self.logger = Logger(self.params['logdir']) #TODO LOGGER seed = self.params['seed'] np.random.seed(seed) ##ENV self.env = gym.make(self.params['env_name']) self.env.seed(seed) #max length of episodes self.params['ep_len'] = self.params[ 'ep_len'] or self.env.spec.max_episode_steps MAX_VIDEO_LEN = self.params['ep_len'] #Check discrete or continuous discrete = isinstance(self.env.action_space, gym.spaces.Discrete) self.params['agent_params']['discrete'] = discrete ob_dim = self.env.observation_space.shape[0] ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[ 0] self.params['agent_params']['ac_dim'] = ac_dim self.params['agent_params']['ob_dim'] = ob_dim #video save if 'model' in dir(self.env): self.fps = 1 / self.env.model.opt.timestep #what is model #else mostly I guess else: self.fps = self.env.env.metadata['video.frames_per_second'] ##AGENT agent_class = self.params['agent_class'] self.agent = agent_class(self.env, self.params['agent_params']) def run_training_loop(self, n_iter, collect_policy, eval_policy): self.total_envsteps = 0 self.start_time = time.time() for itr in range(n_iter): print("\n\n********** Iteration %i ************" % itr) if itr % self.params['video_log_freq'] == 0 and self.params[ 'video_log_freq'] != -1: self.log_video = True else: self.log_video = False # decide if metrics should be logged if itr % self.params['scalar_log_freq'] == 0: self.log_metrics = True else: self.log_metrics = False training_returns = self.collect_training_trajectories( itr, collect_policy, self.params['batch_size']) paths, envsteps_this_batch, train_video_paths = training_returns self.total_envsteps += envsteps_this_batch self.agent.add_to_replay_buffer(paths) self.train_agent() if self.log_video or self.log_metrics: # perform logging print('\nBeginning logging procedure...') self.perform_logging(itr, paths, eval_policy, train_video_paths) if self.params['save_params']: torch.save( { 'epoch': itr, 'model_state_dict': self.agent.actor.pgpolicy.state_dict(), 'optimizer_state_dict': self.agent.actor.optimizer.state_dict(), 'loss': self.agent.actor.loss }, self.params['logdir'] + '/policy_itr_' + str(itr)) torch.save( { 'epoch': itr, 'model_state_dict': self.agent.actor.nnpolicy.state_dict(), 'optimizer_state_dict': self.agent.actor.nnoptimizer.state_dict(), 'loss': self.agent.actor.baseline_loss }, self.params['logdir'] + '/nnpolicy_itr_' + str(itr)) def collect_training_trajectories(self, itr, collect_policy, batch_size): print("\nCollecting data to be used for training...") paths, envsteps_this_batch = sample_trajectories( self.env, collect_policy, batch_size * self.params['ep_len'], self.params['ep_len']) train_video_paths = None if self.log_video: print( '\nCollecting train rollouts to be used for saving videos...') ## TODO look in utils and implement sample_n_trajectories train_video_paths = sample_n_trajectories(self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) return paths, envsteps_this_batch, train_video_paths def train_agent(self): print('\nTraining agent using sampled data from replay buffer...') for train_step in range(self.params['num_agent_train_steps_per_iter']): ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample( self.params['train_batch_size']) print("obs shape:{0}".format(ob_batch.shape)) print("action shape:{0}".format(ac_batch.shape)) self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch) if train_step % 100 
== 0: print('\n Print loss for train steps:{0} is {1}'.format( train_step, self.agent.actor.loss)) def perform_logging(self, itr, paths, eval_policy, train_video_paths): # collect eval trajectories, for logging print("\nCollecting data for eval...") eval_paths, eval_envsteps_this_batch = sample_trajectories( self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len']) # save eval rollouts as videos in tensorboard event file if self.log_video and train_video_paths != None: print('\nCollecting video rollouts eval') eval_video_paths = sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) #save train/eval videos print('\nSaving train rollouts as videos...') self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, video_title='train_rollouts') self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, video_title='eval_rollouts') # save eval metrics if self.log_metrics: # returns, for logging train_returns = [path["reward"].sum() for path in paths] eval_returns = [ eval_path["reward"].sum() for eval_path in eval_paths ] # episode lengths, for logging train_ep_lens = [len(path["reward"]) for path in paths] eval_ep_lens = [ len(eval_path["reward"]) for eval_path in eval_paths ] # decide what to log logs = OrderedDict() logs["Eval_AverageReturn"] = np.mean(eval_returns) logs["Eval_StdReturn"] = np.std(eval_returns) logs["Eval_MaxReturn"] = np.max(eval_returns) logs["Eval_MinReturn"] = np.min(eval_returns) logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens) logs["Train_AverageReturn"] = np.mean(train_returns) logs["Train_StdReturn"] = np.std(train_returns) logs["Train_MaxReturn"] = np.max(train_returns) logs["Train_MinReturn"] = np.min(train_returns) logs["Train_AverageEpLen"] = np.mean(train_ep_lens) logs["Train_EnvstepsSoFar"] = self.total_envsteps logs["TimeSinceStart"] = time.time() - self.start_time if itr == 0: self.initial_return = np.mean(train_returns) logs["Initial_DataCollection_AverageReturn"] = self.initial_return # perform the logging for key, value in logs.items(): print('{} : {}'.format(key, value)) self.logger.log_scalar(value, key, itr) print('Done logging...\n\n') self.logger.flush()
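# ---------------------------------------------------------------------------
# The `Logger` used throughout is imported from the course infrastructure and
# is not shown in this file. A minimal stand-in for its scalar interface,
# assuming a TensorBoard backend (the real class also implements
# log_paths_as_videos for the video logging calls above):
# ---------------------------------------------------------------------------

from torch.utils.tensorboard import SummaryWriter


class MinimalLogger:
    def __init__(self, log_dir):
        self._writer = SummaryWriter(log_dir)

    def log_scalar(self, scalar, name, step):
        # note the (value, key, step) argument order used by the trainers
        self._writer.add_scalar(name, scalar, step)

    def flush(self):
        self._writer.flush()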
class RL_Trainer(object): def __init__(self, params): ############# ## INIT ############# # Get params, create logger, create TF session self.params = params self.logger = Logger(self.params['logdir']) self.sess = create_tf_session(self.params['use_gpu'], which_gpu=self.params['which_gpu']) # Set random seeds seed = self.params['seed'] tf.set_random_seed(seed) np.random.seed(seed) ############# ## ENV ############# # Make the gym environment self.env = gym.make(self.params['env_name']) if 'env_wrappers' in self.params: # These operations are currently only for Atari envs self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), force=True) self.env = params['env_wrappers'](self.env) self.mean_episode_reward = -float('nan') self.best_mean_episode_reward = -float('inf') self.env.seed(seed) # import plotting (locally if 'obstacles' env) if not (self.params['env_name'] == 'obstacles-cs285-v0'): import matplotlib matplotlib.use('Agg') # Maximum length for episodes self.params['ep_len'] = self.params[ 'ep_len'] or self.env.spec.max_episode_steps global MAX_VIDEO_LEN MAX_VIDEO_LEN = self.params['ep_len'] # Is this env continuous, or self.discrete? discrete = isinstance(self.env.action_space, gym.spaces.Discrete) # Are the observations images? img = len(self.env.observation_space.shape) > 2 self.params['agent_params']['discrete'] = discrete # Observation and action sizes ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[ 0] ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[ 0] self.params['agent_params']['ac_dim'] = ac_dim self.params['agent_params']['ob_dim'] = ob_dim # simulation timestep, will be used for video saving if 'model' in dir(self.env): self.fps = 1 / self.env.model.opt.timestep elif 'env_wrappers' in self.params: self.fps = 30 # This is not actually used when using the Monitor wrapper elif 'video.frames_per_second' in self.env.env.metadata.keys(): self.fps = self.env.env.metadata['video.frames_per_second'] else: self.fps = 10 ############# ## AGENT ############# agent_class = self.params['agent_class'] self.agent = agent_class(self.sess, self.env, self.params['agent_params']) ############# ## INIT VARS ############# tf.global_variables_initializer().run(session=self.sess) def run_training_loop(self, n_iter, collect_policy, eval_policy, initial_expertdata=None, relabel_with_expert=False, start_relabel_with_expert=1, expert_policy=None): """ :param n_iter: number of (dagger) iterations :param collect_policy: :param eval_policy: :param initial_expertdata: :param relabel_with_expert: whether to perform dagger :param start_relabel_with_expert: iteration at which to start relabel with expert :param expert_policy: """ # init vars at beginning of training self.total_envsteps = 0 self.start_time = time.time() for itr in range(n_iter): print("\n\n********** Iteration %i ************" % itr) # decide if videos should be rendered/logged at this iteration if itr % self.params['video_log_freq'] == 0 and self.params[ 'video_log_freq'] != -1: self.logvideo = True else: self.logvideo = False # decide if metrics should be logged if self.params['scalar_log_freq'] == -1: self.logmetrics = False elif itr % self.params['scalar_log_freq'] == 0: self.logmetrics = True else: self.logmetrics = False # collect trajectories, to be used for training if isinstance(self.agent, DQNAgent): # only perform an env step and add to replay buffer for DQN self.agent.step_env() envsteps_this_batch = 1 train_video_paths = None paths = 
None else: use_batchsize = self.params['batch_size'] if itr == 0: use_batchsize = self.params['batch_size_initial'] paths, envsteps_this_batch, train_video_paths = self.collect_training_trajectories( itr, initial_expertdata, collect_policy, use_batchsize) self.total_envsteps += envsteps_this_batch # relabel the collected obs with actions from a provided expert policy if relabel_with_expert and itr >= start_relabel_with_expert: paths = self.do_relabel_with_expert(expert_policy, paths) # add collected data to replay buffer self.agent.add_to_replay_buffer(paths, self.params['add_sl_noise']) # train agent (using sampled data from replay buffer) all_losses = self.train_agent() if self.params['logdir'].split('/')[-1][:2] == 'mb' and itr == 0: self.log_model_predictions(itr, all_losses) # log/save if self.logvideo or self.logmetrics: # perform logging print('\nBeginning logging procedure...') if isinstance(self.agent, DQNAgent): self.perform_dqn_logging() else: self.perform_logging(itr, paths, eval_policy, train_video_paths, all_losses) # save policy if self.params['save_params']: print('\nSaving agent\'s actor...') if 'actor' in dir(self.agent): self.agent.actor.save(self.params['logdir'] + '/policy_itr_' + str(itr)) if 'critic' in dir(self.agent): self.agent.critic.save(self.params['logdir'] + '/critic_itr_' + str(itr)) #################################### #################################### def collect_training_trajectories(self, itr, load_initial_expertdata, collect_policy, batch_size): # TODO decide whether to load training data or use # HINT: depending on if it's the first iteration or not, # decide whether to either # load the data. In this case you can directly return as follows # ``` return loaded_paths, 0, None ``` # collect data, batch_size is the number of transitions you want to collect. if itr == 0 and load_initial_expertdata is not None: print(load_initial_expertdata) with open(load_initial_expertdata, "rb") as f: loaded_paths = pickle.load(f) return loaded_paths, 0, None # TODO collect data to be used for training # HINT1: use sample_trajectories from utils # HINT2: you want each of these collected rollouts to be of length self.params['ep_len'] print("\nCollecting data to be used for training...") paths, envsteps_this_batch = sample_trajectories( self.env, collect_policy, batch_size, self.params['ep_len']) # collect more rollouts with the same policy, to be saved as videos in tensorboard # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN train_video_paths = None if self.logvideo: print( '\nCollecting train rollouts to be used for saving videos...') ## TODO look in utils and implement sample_n_trajectories train_video_paths = sample_n_trajectories(self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) return paths, envsteps_this_batch, train_video_paths def train_agent(self): print('\nTraining agent using sampled data from replay buffer...') all_losses = [] for train_step in range(self.params['num_agent_train_steps_per_iter']): # TODO sample some data from the data buffer # HINT1: use the agent's sample function # HINT2: how much data = self.params['train_batch_size'] ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample( self.params['train_batch_size']) # TODO use the sampled data for training # HINT: use the agent's train function # HINT: print or plot the loss for debugging! 
all_losses.append( self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch)) return all_losses def do_relabel_with_expert(self, expert_policy, paths): # TODO: GETTHIS from HW1 (although you don't actually need it for this homework) pass #################################### #################################### def perform_dqn_logging(self): episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: self.mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward) logs = OrderedDict() logs["Train_EnvstepsSoFar"] = self.agent.t print("Timestep %d" % (self.agent.t, )) if self.mean_episode_reward > -5000: logs["Train_AverageReturn"] = np.mean(self.mean_episode_reward) print("mean reward (100 episodes) %f" % self.mean_episode_reward) if self.best_mean_episode_reward > -5000: logs["Train_BestReturn"] = np.mean(self.best_mean_episode_reward) print("best mean reward %f" % self.best_mean_episode_reward) if self.start_time is not None: time_since_start = (time.time() - self.start_time) print("running time %f" % time_since_start) logs["TimeSinceStart"] = time_since_start sys.stdout.flush() for key, value in logs.items(): print('{} : {}'.format(key, value)) self.logger.log_scalar(value, key, self.agent.t) print('Done logging...\n\n') self.logger.flush() def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_losses): loss = all_losses[-1] # collect eval trajectories, for logging print("\nCollecting data for eval...") eval_paths, eval_envsteps_this_batch = sample_trajectories( self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len']) # save eval rollouts as videos in tensorboard event file if self.logvideo and train_video_paths != None: print('\nCollecting video rollouts eval') eval_video_paths = sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) #save train/eval videos print('\nSaving train rollouts as videos...') self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, video_title='train_rollouts') self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, video_title='eval_rollouts') # save eval metrics if self.logmetrics: # returns, for logging train_returns = [path["reward"].sum() for path in paths] eval_returns = [ eval_path["reward"].sum() for eval_path in eval_paths ] # episode lengths, for logging train_ep_lens = [len(path["reward"]) for path in paths] eval_ep_lens = [ len(eval_path["reward"]) for eval_path in eval_paths ] # decide what to log logs = OrderedDict() logs["Eval_AverageReturn"] = np.mean(eval_returns) logs["Eval_StdReturn"] = np.std(eval_returns) logs["Eval_MaxReturn"] = np.max(eval_returns) logs["Eval_MinReturn"] = np.min(eval_returns) logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens) logs["Train_AverageReturn"] = np.mean(train_returns) logs["Train_StdReturn"] = np.std(train_returns) logs["Train_MaxReturn"] = np.max(train_returns) logs["Train_MinReturn"] = np.min(train_returns) logs["Train_AverageEpLen"] = np.mean(train_ep_lens) logs["Train_EnvstepsSoFar"] = self.total_envsteps logs["TimeSinceStart"] = time.time() - self.start_time if isinstance(loss, dict): logs.update(loss) else: logs["Training loss"] = loss if itr == 0: self.initial_return = np.mean(train_returns) logs["Initial_DataCollection_AverageReturn"] = self.initial_return # perform the 
logging
            for key, value in logs.items():
                print('{} : {}'.format(key, value))
                self.logger.log_scalar(value, key, itr)
            print('Done logging...\n\n')
            self.logger.flush()

    def log_model_predictions(self, itr, all_losses):
        # model predictions
        import matplotlib.pyplot as plt
        self.fig = plt.figure()

        # sample actions
        action_sequence = self.agent.actor.sample_action_sequences(
            num_sequences=1, horizon=10)  # 20 for reacher
        action_sequence = action_sequence[0]

        # calculate and log model prediction error
        mpe, true_states, pred_states = calculate_mean_prediction_error(
            self.env, action_sequence, self.agent.dyn_models,
            self.agent.actor.data_statistics)
        assert self.params['agent_params']['ob_dim'] == true_states.shape[1] == pred_states.shape[1]
        ob_dim = self.params['agent_params']['ob_dim']

        # skip last state for plotting when state dim is odd
        if ob_dim % 2 == 1:
            ob_dim -= 1

        # plot the predictions (integer division: subplot counts must be ints)
        self.fig.clf()
        for i in range(ob_dim):
            plt.subplot(ob_dim // 2, 2, i + 1)
            plt.plot(true_states[:, i], 'g')
            plt.plot(pred_states[:, i], 'r')
        self.fig.suptitle('MPE: ' + str(mpe))
        self.fig.savefig(self.params['logdir'] + '/itr_' + str(itr) +
                         '_predictions.png', dpi=200, bbox_inches='tight')

        # plot all intermediate losses during this iteration
        np.save(self.params['logdir'] + '/itr_' + str(itr) + '_losses.npy',
                all_losses)
        self.fig.clf()
        plt.plot(all_losses)
        self.fig.savefig(self.params['logdir'] + '/itr_' + str(itr) +
                         '_losses.png', dpi=200, bbox_inches='tight')
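# ---------------------------------------------------------------------------
# calculate_mean_prediction_error, used by log_model_predictions above, comes
# from the homework utils and is not shown. A sketch of the usual structure,
# assuming each learned dynamics model exposes
# get_prediction(obs, acs, data_statistics); the real helper may differ.
# ---------------------------------------------------------------------------

import numpy as np


def calculate_mean_prediction_error(env, action_sequence, models,
                                    data_statistics):
    model = models[0]

    # roll the action sequence out in the real env to get the true states
    true_states = []
    ob = env.reset()
    for ac in action_sequence:
        true_states.append(ob)
        ob, _, _, _ = env.step(ac)
    true_states = np.array(true_states)

    # roll the same actions through the learned model, open loop
    pred_states = []
    ob = true_states[0]
    for ac in action_sequence:
        pred_states.append(ob)
        ob = model.get_prediction(ob[None], ac[None], data_statistics)[0]
    pred_states = np.array(pred_states)

    # mean squared error between predicted and true state trajectories
    mpe = np.mean((pred_states - true_states) ** 2)
    return mpe, true_states, pred_states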
class RL_Trainer(object): def __init__(self, params): ############# ## INIT ############# # Get params, create logger self.params = params self.logger = Logger(self.params['logdir']) # Set random seeds seed = self.params['seed'] np.random.seed(seed) torch.manual_seed(seed) ptu.init_gpu( use_gpu=not self.params['no_gpu'], gpu_id=self.params['which_gpu'] ) ############# ## ENV ############# # Make the gym environment register_custom_envs() self.env = gym.make(self.params['env_name']) if 'env_wrappers' in self.params: # These operations are currently only for Atari envs self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), force=True) self.env = params['env_wrappers'](self.env) self.mean_episode_reward = -float('nan') self.best_mean_episode_reward = -float('inf') if 'non_atari_colab_env' in self.params and self.params['video_log_freq'] > 0: self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), force=True) self.mean_episode_reward = -float('nan') self.best_mean_episode_reward = -float('inf') self.env.seed(seed) # import plotting (locally if 'obstacles' env) if not(self.params['env_name']=='obstacles-cs285-v0'): import matplotlib matplotlib.use('Agg') # Maximum length for episodes self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps global MAX_VIDEO_LEN MAX_VIDEO_LEN = self.params['ep_len'] # Is this env continuous, or self.discrete? discrete = isinstance(self.env.action_space, gym.spaces.Discrete) # Are the observations images? img = len(self.env.observation_space.shape) > 2 self.params['agent_params']['discrete'] = discrete # Observation and action sizes ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0] ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0] self.params['agent_params']['ac_dim'] = ac_dim self.params['agent_params']['ob_dim'] = ob_dim # simulation timestep, will be used for video saving if 'model' in dir(self.env): self.fps = 1/self.env.model.opt.timestep elif 'env_wrappers' in self.params: self.fps = 30 # This is not actually used when using the Monitor wrapper elif 'video.frames_per_second' in self.env.env.metadata.keys(): self.fps = self.env.env.metadata['video.frames_per_second'] else: self.fps = 10 ############# ## AGENT ############# agent_class = self.params['agent_class'] self.agent = agent_class(self.env, self.params['agent_params']) def run_training_loop(self, n_iter, collect_policy, eval_policy, initial_expertdata=None, relabel_with_expert=False, start_relabel_with_expert=1, expert_policy=None): """ :param n_iter: number of (dagger) iterations :param collect_policy: :param eval_policy: :param initial_expertdata: :param relabel_with_expert: whether to perform dagger :param start_relabel_with_expert: iteration at which to start relabel with expert :param expert_policy: """ # init vars at beginning of training self.total_envsteps = 0 self.start_time = time.time() print_period = 1000 if isinstance(self.agent, DQNAgent) else 1 for itr in range(n_iter): if itr % print_period == 0: print("\n\n********** Iteration %i ************"%itr) # decide if videos should be rendered/logged at this iteration if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1: self.logvideo = True else: self.logvideo = False # decide if metrics should be logged if self.params['scalar_log_freq'] == -1: self.logmetrics = False elif itr % self.params['scalar_log_freq'] == 0: self.logmetrics = True else: self.logmetrics = False 
# collect trajectories, to be used for training if isinstance(self.agent, DQNAgent): # only perform an env step and add to replay buffer for DQN self.agent.step_env() envsteps_this_batch = 1 train_video_paths = None paths = None else: use_batchsize = self.params['batch_size'] if itr==0: use_batchsize = self.params['batch_size_initial'] paths, envsteps_this_batch, train_video_paths = ( self.collect_training_trajectories( itr, initial_expertdata, collect_policy, use_batchsize) ) self.total_envsteps += envsteps_this_batch # relabel the collected obs with actions from a provided expert policy # if relabel_with_expert and itr>=start_relabel_with_expert: # paths = self.do_relabel_with_expert(expert_policy, paths) # add collected data to replay buffer self.agent.add_to_replay_buffer(paths) # train agent (using sampled data from replay buffer) if itr % print_period == 0: print("\nTraining agent...") all_logs = self.train_agent() # log/save if self.logvideo or self.logmetrics: # perform logging print('\nBeginning logging procedure...') if isinstance(self.agent, DQNAgent): self.perform_dqn_logging(all_logs) else: self.perform_logging(itr, paths, eval_policy, train_video_paths, all_logs) if self.params['save_params']: self.agent.save('{}/agent_itr_{}.pt'.format(self.params['logdir'], itr)) #################################### #################################### def collect_training_trajectories(self, itr, initial_expertdata, collect_policy, num_transitions_to_sample, save_expert_data_to_disk=False): """ :param itr: :param load_initial_expertdata: path to expert data pkl file :param collect_policy: the current policy using which we collect data :param num_transitions_to_sample: the number of transitions we collect :return: paths: a list trajectories envsteps_this_batch: the sum over the numbers of environment steps in paths train_video_paths: paths which also contain videos for visualization purposes """ # TODO: get this from hw1 or hw2 ------------------ # decide how much training data to collect + which policy to use to collect it if itr == 0: if initial_expertdata is not None: paths = pickle.load(open(self.params['expert_data'], 'rb')) return paths, 0, None if save_expert_data_to_disk: num_transitions_to_sample = self.params['batch_size_initial'] # collect data to be used for training print("\nCollecting data to be used for training...") paths, envsteps_this_batch = utils.sample_trajectories(self.env, collect_policy, num_transitions_to_sample, self.params['ep_len']) # collect more rollouts with the same policy, to be saved as videos in tensorboard train_video_paths = None if self.logvideo: print('\nCollecting train rollouts to be used for saving videos...') train_video_paths = utils.sample_n_trajectories(self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) if save_expert_data_to_disk and itr == 0: with open('expert_data_{}.pkl'.format(self.params['env_name']), 'wb') as file: pickle.dump(paths, file) return paths, envsteps_this_batch, train_video_paths def train_agent(self): # TODO: get this from hw1 or hw2 -------------------- all_logs = [] for train_step in range(self.params['num_agent_train_steps_per_iter']): ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample( self.params['train_batch_size']) train_log = self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch) all_logs.append(train_log) return all_logs #################################### #################################### def perform_dqn_logging(self, all_logs): last_log = all_logs[-1] 
episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: self.mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward) logs = OrderedDict() logs["Train_EnvstepsSoFar"] = self.agent.t print("Timestep %d" % (self.agent.t,)) if self.mean_episode_reward > -5000: logs["Train_AverageReturn"] = np.mean(self.mean_episode_reward) print("mean reward (100 episodes) %f" % self.mean_episode_reward) if self.best_mean_episode_reward > -5000: logs["Train_BestReturn"] = np.mean(self.best_mean_episode_reward) print("best mean reward %f" % self.best_mean_episode_reward) if self.start_time is not None: time_since_start = (time.time() - self.start_time) print("running time %f" % time_since_start) logs["TimeSinceStart"] = time_since_start logs.update(last_log) sys.stdout.flush() for key, value in logs.items(): print('{} : {}'.format(key, value)) self.logger.log_scalar(value, key, self.agent.t) print('Done logging...\n\n') self.logger.flush() def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs): last_log = all_logs[-1] ####################### # collect eval trajectories, for logging print("\nCollecting data for eval...") eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len']) # save eval rollouts as videos in tensorboard event file if self.logvideo and train_video_paths != None: print('\nCollecting video rollouts eval') eval_video_paths = utils.sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) #save train/eval videos print('\nSaving train rollouts as videos...') self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, video_title='train_rollouts') self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps,max_videos_to_save=MAX_NVIDEO, video_title='eval_rollouts') ####################### # save eval metrics if self.logmetrics: # returns, for logging train_returns = [path["reward"].sum() for path in paths] eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths] # episode lengths, for logging train_ep_lens = [len(path["reward"]) for path in paths] eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths] # decide what to log logs = OrderedDict() logs["Eval_AverageReturn"] = np.mean(eval_returns) logs["Eval_StdReturn"] = np.std(eval_returns) logs["Eval_MaxReturn"] = np.max(eval_returns) logs["Eval_MinReturn"] = np.min(eval_returns) logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens) logs["Train_AverageReturn"] = np.mean(train_returns) logs["Train_StdReturn"] = np.std(train_returns) logs["Train_MaxReturn"] = np.max(train_returns) logs["Train_MinReturn"] = np.min(train_returns) logs["Train_AverageEpLen"] = np.mean(train_ep_lens) logs["Train_EnvstepsSoFar"] = self.total_envsteps logs["TimeSinceStart"] = time.time() - self.start_time logs.update(last_log) if itr == 0: self.initial_return = np.mean(train_returns) logs["Initial_DataCollection_AverageReturn"] = self.initial_return # perform the logging for key, value in logs.items(): print('{} : {}'.format(key, value)) self.logger.log_scalar(value, key, itr) print('Done logging...\n\n') self.logger.flush()
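# ---------------------------------------------------------------------------
# get_wrapper_by_name, used by perform_dqn_logging, walks the gym wrapper
# stack until it finds a wrapper whose class name contains the given string
# (here "Monitor"). A minimal sketch of that helper:
# ---------------------------------------------------------------------------

import gym


def get_wrapper_by_name(env, classname):
    currentenv = env
    while True:
        if classname in currentenv.__class__.__name__:
            return currentenv
        elif isinstance(currentenv, gym.Wrapper):
            # peel off one wrapper layer and keep looking
            currentenv = currentenv.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)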
def __init__(self, params): ############# ## INIT ############# # Get params, create logger self.params = params self.logger = Logger(self.params["logdir"]) # Set random seeds seed = self.params["seed"] np.random.seed(seed) torch.manual_seed(seed) ptu.init_gpu(use_gpu=not self.params["no_gpu"], gpu_id=self.params["which_gpu"]) ############# ## ENV ############# # Make the gym environment register_custom_envs() self.env = gym.make(self.params["env_name"]) self.eval_env = gym.make(self.params["env_name"]) if not ("pointmass" in self.params["env_name"]): import matplotlib matplotlib.use("Agg") self.env.set_logdir(self.params["logdir"] + "/expl_") self.eval_env.set_logdir(self.params["logdir"] + "/eval_") if "env_wrappers" in self.params: # These operations are currently only for Atari envs self.env = wrappers.Monitor(self.env, os.path.join(self.params["logdir"], "gym"), force=True) self.eval_env = wrappers.Monitor(self.eval_env, os.path.join( self.params["logdir"], "gym"), force=True) self.env = params["env_wrappers"](self.env) self.eval_env = params["env_wrappers"](self.eval_env) self.mean_episode_reward = -float("nan") self.best_mean_episode_reward = -float("inf") if "non_atari_colab_env" in self.params and self.params[ "video_log_freq"] > 0: self.env = wrappers.Monitor( self.env, os.path.join(self.params["logdir"], "gym"), write_upon_reset=True, ) # , force=True) self.eval_env = wrappers.Monitor( self.eval_env, os.path.join(self.params["logdir"], "gym"), write_upon_reset=True, ) self.mean_episode_reward = -float("nan") self.best_mean_episode_reward = -float("inf") self.env.seed(seed) self.eval_env.seed(seed) # Maximum length for episodes self.params["ep_len"] = self.params[ "ep_len"] or self.env.spec.max_episode_steps global MAX_VIDEO_LEN MAX_VIDEO_LEN = self.params["ep_len"] # Is this env continuous, or self.discrete? discrete = isinstance(self.env.action_space, gym.spaces.Discrete) # Are the observations images? img = len(self.env.observation_space.shape) > 2 self.params["agent_params"]["discrete"] = discrete # Observation and action sizes ob_dim = (self.env.observation_space.shape if img else self.env.observation_space.shape[0]) ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[ 0] self.params["agent_params"]["ac_dim"] = ac_dim self.params["agent_params"]["ob_dim"] = ob_dim # simulation timestep, will be used for video saving if "model" in dir(self.env): self.fps = 1 / self.env.model.opt.timestep elif "env_wrappers" in self.params: self.fps = 30 # This is not actually used when using the Monitor wrapper elif "video.frames_per_second" in self.env.env.metadata.keys(): self.fps = self.env.env.metadata["video.frames_per_second"] else: self.fps = 10 ############# ## AGENT ############# agent_class = self.params["agent_class"] self.agent = agent_class(self.env, self.params["agent_params"])
class RL_Trainer(object):

    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger
        self.params = params
        self.logger = Logger(self.params['logdir'])

        # Set random seeds
        seed = self.params['seed']
        torch.manual_seed(seed)
        np.random.seed(seed)

        #############
        ## ENV
        #############

        # Make the gym environment
        self.env = gym.make(self.params['env_name'])
        self.env.seed(seed)

        # Maximum length for episodes
        self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
        MAX_VIDEO_LEN = self.params['ep_len']

        # Is this env continuous, or self.discrete?
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.params['agent_params']['discrete'] = discrete

        # Observation and action sizes
        ob_dim = self.env.observation_space.shape[0]
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params['agent_params']['ac_dim'] = ac_dim
        self.params['agent_params']['ob_dim'] = ob_dim

        # simulation timestep, will be used for video saving
        if 'model' in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        else:
            self.fps = self.env.env.metadata['video.frames_per_second']

        #############
        ## AGENT
        #############

        agent_class = self.params['agent_class']
        self.agent = agent_class(self.env, self.params['agent_params'])

    def run_training_loop(self, n_iter, collect_policy, eval_policy,
                          initial_expertdata=None, relabel_with_expert=False,
                          start_relabel_with_expert=1, expert_policy=None):
        """
        :param n_iter: number of (dagger) iterations
        :param collect_policy:
        :param eval_policy:
        :param initial_expertdata:
        :param relabel_with_expert: whether to perform dagger
        :param start_relabel_with_expert: iteration at which to start relabel with expert
        :param expert_policy:
        """

        # init vars at beginning of training
        self.total_envsteps = 0
        self.start_time = time.time()

        for itr in range(n_iter):
            print("\n\n********** Iteration %i ************" % itr)

            # decide if videos should be rendered/logged at this iteration
            if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1:
                self.log_video = True
            else:
                self.log_video = False

            # decide if metrics should be logged
            if itr % self.params['scalar_log_freq'] == 0:
                self.log_metrics = True
            else:
                self.log_metrics = False

            # collect trajectories, to be used for training
            with torch.no_grad():
                training_returns = self.collect_training_trajectories(
                    itr, initial_expertdata, collect_policy, self.params['batch_size'])
            paths, envsteps_this_batch, train_video_paths = training_returns
            self.total_envsteps += envsteps_this_batch

            # add collected data to replay buffer
            self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            loss = self.train_agent()

            # log/save
            if self.log_video or self.log_metrics:
                # perform logging
                print('\nBeginning logging procedure...')
                self.perform_logging(itr, paths, eval_policy, train_video_paths, loss)

                if self.params['save_params']:
                    # save policy
                    print('\nSaving agent\'s actor...')
                    self.agent.actor.save(self.params['logdir'] + '/policy_itr_' + str(itr))

    ####################################
    ####################################

    def collect_training_trajectories(self, itr, load_initial_expertdata,
                                      collect_policy, batch_size):
        # on the first iteration, load expert demonstrations if a path was given
        if itr == 0 and load_initial_expertdata:
            with open(load_initial_expertdata, "rb") as f:
                loaded_paths = pickle.load(f)
            return loaded_paths, 0, None

        print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = sample_trajectories(
            self.env, collect_policy, batch_size, self.params['ep_len'])

        train_video_paths = None
        if self.log_video:
            print('\nCollecting train rollouts to be used for saving videos...')
            train_video_paths = sample_n_trajectories(
                self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

        return paths, envsteps_this_batch, train_video_paths

    def train_agent(self):
        print('\nTraining agent using sampled data from replay buffer...')
        for train_step in range(self.params['num_agent_train_steps_per_iter']):
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = \
                self.agent.sample(self.params['train_batch_size'])
            loss = self.agent.train(ob_batch, ac_batch, re_batch,
                                    next_ob_batch, terminal_batch)
        return loss

    ####################################
    ####################################

    def perform_logging(self, itr, paths, eval_policy, train_video_paths, loss):

        # collect eval trajectories, for logging
        print("\nCollecting data for eval...")
        eval_paths, eval_envsteps_this_batch = sample_trajectories(
            self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len'])

        # save eval rollouts as videos in tensorboard event file
        if self.log_video and train_video_paths is not None:
            print('\nCollecting video rollouts eval')
            eval_video_paths = sample_n_trajectories(
                self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

            # save train/eval videos
            print('\nSaving train rollouts as videos...')
            self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='train_rollouts')
            self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='eval_rollouts')

        # save eval metrics
        if self.log_metrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]
            eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]
            eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

            # decide what to log
            logs = OrderedDict()
            logs["Loss"] = loss.cpu().detach().numpy()
            logs["Eval_AverageReturn"] = np.mean(eval_returns)
            logs["Eval_StdReturn"] = np.std(eval_returns)
            logs["Eval_MaxReturn"] = np.max(eval_returns)
            logs["Eval_MinReturn"] = np.min(eval_returns)
            logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print('{} : {}'.format(key, value))
                self.logger.log_scalar(value, key, itr)
            print('Done logging...\n\n')

            self.logger.flush()
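# ----------------------------------------------------------------------------
# Illustrative usage sketch (an assumption about how a run script drives this
# class; `params` is a dict like the example shown earlier, extended with
# 'n_iter' and an optional 'expert_data' path):
# ----------------------------------------------------------------------------
def _example_training_run(params):
    trainer = RL_Trainer(params)
    trainer.run_training_loop(
        n_iter=params['n_iter'],
        initial_expertdata=params.get('expert_data'),
        collect_policy=trainer.agent.actor,  # collect with the current actor
        eval_policy=trainer.agent.actor,     # evaluate the same actor
    )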
def __init__(self, params):

    #############
    ## INIT
    #############

    # Get params, create logger, create TF session
    self.params = params
    self.logger = Logger(self.params['logdir'])
    self.sess = create_tf_session(self.params['use_gpu'],
                                  which_gpu=self.params['which_gpu'])

    # Set random seeds
    seed = self.params['seed']
    tf.set_random_seed(seed)
    np.random.seed(seed)

    #############
    ## ENV
    #############

    # Make the gym environment
    self.env = gym.make(self.params['env_name'])
    self.env.seed(seed)

    # Maximum length for episodes
    self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps

    # Is this env continuous, or self.discrete?
    discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
    self.params['agent_params']['discrete'] = discrete

    # Observation and action sizes
    ob_dim = self.env.observation_space.shape[0]
    ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
    self.params['agent_params']['ac_dim'] = ac_dim
    self.params['agent_params']['ob_dim'] = ob_dim

    print('\n ep_len: {0}'.format(self.params['ep_len']))
    print('\ndiscrete: {0}'.format(discrete))
    print('\nob_dim: {0}'.format(ob_dim))
    print('\nac_dim: {0}'.format(ac_dim))

    # simulation timestep, will be used for video saving
    if 'model' in dir(self.env):
        self.fps = 1 / self.env.model.opt.timestep
    else:
        self.fps = self.env.env.metadata['video.frames_per_second']

    #############
    ## AGENT
    #############

    agent_class = self.params['agent_class']
    self.agent = agent_class(self.sess, self.env, self.params['agent_params'])

    #############
    ## INIT VARS
    #############

    ## TODO initialize all of the TF variables (that were created by agent, etc.)
    ## HINT: use global_variables_initializer
    self.sess.run(tf.global_variables_initializer())

    #############
    ## INIT WANDB
    #############

    self.init_wandb()
def __init__(self, params):

    #############
    # INIT
    #############

    # Get params, create logger, create TF session
    self.params = params
    self.logger = Logger(self.params['logdir'])
    self.sess = create_tf_session(self.params['use_gpu'],
                                  which_gpu=self.params['which_gpu'])

    # Set random seeds
    seed = self.params['seed']
    tf.set_random_seed(seed)
    np.random.seed(seed)

    #############
    # ENV
    #############

    # Make the gym environment
    self.env = gym.make(self.params['env_name'])
    if 'env_wrappers' in self.params:
        # These operations are currently only for Atari envs
        self.env = wrappers.Monitor(self.env,
                                    os.path.join(self.params['logdir'], "gym"),
                                    force=True)
        self.env = params['env_wrappers'](self.env)
        self.mean_episode_reward = -float('nan')
        self.best_mean_episode_reward = -float('inf')
    self.env.seed(seed)

    # Maximum length for episodes
    self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
    MAX_VIDEO_LEN = self.params['ep_len']

    # Is this env continuous, or self.discrete?
    discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
    # Are the observations images?
    img = len(self.env.observation_space.shape) > 2
    self.params['agent_params']['discrete'] = discrete

    # Observation and action sizes
    ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0]
    ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
    self.params['agent_params']['ac_dim'] = ac_dim
    self.params['agent_params']['ob_dim'] = ob_dim

    # simulation timestep, will be used for video saving
    if 'model' in dir(self.env):
        self.fps = 1 / self.env.model.opt.timestep
    elif 'env_wrappers' in self.params:
        self.fps = 30  # This is not actually used when using the Monitor wrapper
    else:
        self.fps = self.env.env.metadata['video.frames_per_second']

    #############
    # AGENT
    #############

    agent_class = self.params['agent_class']
    self.agent = agent_class(self.sess, self.env, self.params['agent_params'])

    #############
    # INIT VARS
    #############

    tf.global_variables_initializer().run(session=self.sess)
class RL_Trainer(object):

    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger, create TF session
        self.params = params
        self.logger = Logger(self.params['logdir'])
        self.sess = create_tf_session(self.params['use_gpu'],
                                      which_gpu=self.params['which_gpu'])

        # Set random seeds
        seed = self.params['seed']
        tf.set_random_seed(seed)
        np.random.seed(seed)

        #############
        ## ENV
        #############

        # Make the gym environment
        self.env = gym.make(self.params['env_name'])
        self.env.seed(seed)

        # Maximum length for episodes
        self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps

        # Is this env continuous, or self.discrete?
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.params['agent_params']['discrete'] = discrete

        # Observation and action sizes
        ob_dim = self.env.observation_space.shape[0]
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params['agent_params']['ac_dim'] = ac_dim
        self.params['agent_params']['ob_dim'] = ob_dim

        # simulation timestep, will be used for video saving
        if 'model' in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        else:
            self.fps = self.env.env.metadata['video.frames_per_second']

        #############
        ## AGENT
        #############

        agent_class = self.params['agent_class']
        self.agent = agent_class(self.sess, self.env, self.params['agent_params'])

        #############
        ## INIT VARS
        #############

        self.sess.run(tf.global_variables_initializer())

    def run_training_loop(self, n_iter, collect_policy, eval_policy,
                          initial_expertdata=None, relabel_with_expert=False,
                          start_relabel_with_expert=1, expert_policy=None):
        """
        :param n_iter: number of (dagger) iterations
        :param collect_policy:
        :param eval_policy:
        :param initial_expertdata:
        :param relabel_with_expert: whether to perform dagger
        :param start_relabel_with_expert: iteration at which to start relabel with expert
        :param expert_policy:
        """

        # init vars at beginning of training
        self.total_envsteps = 0
        self.start_time = time.time()

        for itr in range(n_iter):
            print("\n\n********** Iteration %i ************" % itr)

            # decide if videos should be rendered/logged at this iteration
            if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1:
                self.log_video = True
            else:
                self.log_video = False

            # decide if metrics should be logged
            if itr % self.params['scalar_log_freq'] == 0:
                self.log_metrics = True
            else:
                self.log_metrics = False

            # collect trajectories, to be used for training
            training_returns = self.collect_training_trajectories(
                itr, initial_expertdata, collect_policy, self.params['batch_size'])
            paths, envsteps_this_batch, train_video_paths = training_returns
            self.total_envsteps += envsteps_this_batch

            # relabel the collected obs with actions from a provided expert policy
            if relabel_with_expert and itr >= start_relabel_with_expert:
                paths = self.do_relabel_with_expert(expert_policy, paths)

            # add collected data to replay buffer
            self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            self.train_agent()

            # log/save
            if self.log_video or self.log_metrics:
                # perform logging
                print('\nBeginning logging procedure...')
                self.perform_logging(itr, paths, eval_policy, train_video_paths)

                # save policy
                print('\nSaving agent\'s actor...')
                self.agent.actor.save(self.params['logdir'] + '/policy_itr_' + str(itr))

    ####################################
    ####################################

    def collect_training_trajectories(self, itr, load_initial_expertdata,
                                      collect_policy, batch_size):
        """
        :param itr:
        :param load_initial_expertdata: path to expert data pkl file
        :param collect_policy: the current policy using which we collect data
        :param batch_size: the number of transitions we collect
        :return:
            paths: a list of trajectories
            envsteps_this_batch: the sum over the numbers of environment steps in paths
            train_video_paths: paths which also contain videos for visualization purposes
        """

        # on the first iteration, load expert demonstrations if a path was given
        if itr == 0 and load_initial_expertdata:
            with open(load_initial_expertdata, "rb") as f:
                loaded_paths = pickle.load(f)
            return loaded_paths, 0, None

        print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = sample_trajectories(
            self.env,
            collect_policy,
            # This is confusing: we use batch_size to lower-bound the total timesteps
            # (counted over all trajs collected), and 'ep_len' to upper-bound the
            # timesteps in ONE traj
            min_timesteps_per_batch=batch_size,
            max_path_length=self.params['ep_len'])

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
        train_video_paths = None
        if self.log_video:
            print('\nCollecting train rollouts to be used for saving videos...')
            train_video_paths = sample_n_trajectories(
                self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

        return paths, envsteps_this_batch, train_video_paths

    def train_agent(self):
        print('\nTraining agent using sampled data from replay buffer...')
        for train_step in range(self.params['num_agent_train_steps_per_iter']):
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = \
                self.agent.sample(self.params['train_batch_size'])
            # HINT: print or plot the loss for debugging!
            self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch,
                             terminal_batch)

    def do_relabel_with_expert(self, expert_policy, paths):
        print("\nRelabelling collected observations with labels from an expert policy...")
        for i in range(len(paths)):
            paths[i]["action"] = expert_policy.get_action(paths[i]["observation"])
        return paths

    ####################################
    ####################################

    def perform_logging(self, itr, paths, eval_policy, train_video_paths):

        # collect eval trajectories, for logging
        print("\nCollecting data for eval...")
        eval_paths, eval_envsteps_this_batch = sample_trajectories(
            self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len'])

        # save eval rollouts as videos in tensorboard event file
        if self.log_video and train_video_paths is not None:
            print('\nCollecting video rollouts eval')
            eval_video_paths = sample_n_trajectories(
                self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

            # save train/eval videos
            print('\nSaving train rollouts as videos...')
            self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='train_rollouts')
            self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='eval_rollouts')

        # save eval metrics
        if self.log_metrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]
            eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]
            eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

            # decide what to log
            logs = OrderedDict()
            logs["Eval_AverageReturn"] = np.mean(eval_returns)
            logs["Eval_StdReturn"] = np.std(eval_returns)
            logs["Eval_MaxReturn"] = np.max(eval_returns)
            logs["Eval_MinReturn"] = np.min(eval_returns)
            logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print('{} : {}'.format(key, value))
                self.logger.log_scalar(value, key, itr)
            print('Done logging...\n\n')

            self.logger.flush()
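# ----------------------------------------------------------------------------
# For reference: the trajectory ("path") dicts consumed by perform_logging and
# do_relabel_with_expert above are assumed to map field names to per-timestep
# numpy arrays, roughly as sketched here (shapes and field set are illustrative):
# ----------------------------------------------------------------------------
def _example_path(T=100, ob_dim=4, ac_dim=2):
    return {
        "observation": np.zeros((T, ob_dim)),
        "action": np.zeros((T, ac_dim)),
        "reward": np.zeros(T),          # path["reward"].sum() is the episode return
        "next_observation": np.zeros((T, ob_dim)),
        "terminal": np.zeros(T),
    }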
def __init__(self, params):

    #############
    ## INIT
    #############

    # Get params, create logger
    self.params = params
    self.logger = Logger(self.params['logdir'])

    # Set random seeds
    seed = self.params['seed']
    np.random.seed(seed)
    tf.random.set_seed(seed)

    #############
    ## ENV
    #############

    # Make the gym environment
    register_custom_envs()
    self.env = gym.make(self.params['env_name'])
    self.eval_env = gym.make(self.params['env_name'])
    if not ('pointmass' in self.params['env_name']):
        import matplotlib
        matplotlib.use('Agg')
        self.env.set_logdir(self.params['logdir'] + '/expl_')
        self.eval_env.set_logdir(self.params['logdir'] + '/eval_')

    if 'env_wrappers' in self.params:
        # These operations are currently only for Atari envs
        self.env = wrappers.Monitor(
            self.env, os.path.join(self.params['logdir'], "gym"), force=True)
        self.eval_env = wrappers.Monitor(
            self.eval_env, os.path.join(self.params['logdir'], "gym"), force=True)
        self.env = params['env_wrappers'](self.env)
        self.eval_env = params['env_wrappers'](self.eval_env)
        self.mean_episode_reward = -float('nan')
        self.best_mean_episode_reward = -float('inf')
    if 'non_atari_colab_env' in self.params and self.params['video_log_freq'] > 0:
        self.env = wrappers.Monitor(
            self.env, os.path.join(self.params['logdir'], "gym"),
            write_upon_reset=True)  # , force=True)
        self.eval_env = wrappers.Monitor(
            self.eval_env, os.path.join(self.params['logdir'], "gym"),
            write_upon_reset=True)
        self.mean_episode_reward = -float('nan')
        self.best_mean_episode_reward = -float('inf')

    self.env.seed(seed)
    self.eval_env.seed(seed)

    # Maximum length for episodes
    self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
    global MAX_VIDEO_LEN
    MAX_VIDEO_LEN = self.params['ep_len']

    # Is this env continuous, or self.discrete?
    discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
    # Are the observations images?
    img = len(self.env.observation_space.shape) > 2
    self.params['agent_params']['discrete'] = discrete

    # Observation and action sizes
    ob_dim = (self.env.observation_space.shape
              if img else self.env.observation_space.shape[0])
    ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
    self.params['agent_params']['ac_dim'] = ac_dim
    self.params['agent_params']['ob_dim'] = ob_dim

    # simulation timestep, will be used for video saving
    if 'model' in dir(self.env):
        self.fps = 1 / self.env.model.opt.timestep
    elif 'env_wrappers' in self.params:
        self.fps = 30  # This is not actually used when using the Monitor wrapper
    elif 'video.frames_per_second' in self.env.env.metadata.keys():
        self.fps = self.env.env.metadata['video.frames_per_second']
    else:
        self.fps = 10

    #############
    ## AGENT
    #############

    agent_class = self.params['agent_class']
    self.agent = agent_class(self.env, self.params['agent_params'])
def __init__(self, params):

    #############
    ## INIT
    #############

    # Get params, create logger
    self.params = params
    self.logger = Logger(self.params['logdir'])

    # Set random seeds
    seed = self.params['seed']
    np.random.seed(seed)
    torch.manual_seed(seed)
    ptu.init_gpu(use_gpu=not self.params['no_gpu'], gpu_id=self.params['which_gpu'])

    #############
    ## ENV
    #############

    # Make the gym environment
    self.env = gym.make(self.params['env_name'])
    self.env.seed(seed)

    # import plotting (locally if 'obstacles' env)
    if not (self.params['env_name'] == 'obstacles-cs285-v0'):
        import matplotlib
        matplotlib.use('Agg')

    # Maximum length for episodes
    self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
    global MAX_VIDEO_LEN
    MAX_VIDEO_LEN = self.params['ep_len']

    # Is this env continuous, or self.discrete?
    discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
    # Are the observations images?
    img = len(self.env.observation_space.shape) > 2
    self.params['agent_params']['discrete'] = discrete

    # Observation and action sizes
    ob_dim = (self.env.observation_space.shape
              if img else self.env.observation_space.shape[0])
    ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
    self.params['agent_params']['ac_dim'] = ac_dim
    self.params['agent_params']['ob_dim'] = ob_dim

    # simulation timestep, will be used for video saving
    if 'model' in dir(self.env):
        self.fps = 1 / self.env.model.opt.timestep
    elif 'env_wrappers' in self.params:
        self.fps = 30  # This is not actually used when using the Monitor wrapper
    elif 'video.frames_per_second' in self.env.env.metadata.keys():
        self.fps = self.env.env.metadata['video.frames_per_second']
    else:
        self.fps = 10

    #############
    ## AGENT
    #############

    agent_class = self.params['agent_class']
    self.agent = agent_class(self.env, self.params['agent_params'])
class RL_Trainer(object):

    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger
        self.params = params
        self.logger = Logger(self.params['logdir'])

        # Set random seeds
        seed = self.params['seed']
        np.random.seed(seed)
        torch.manual_seed(seed)
        ptu.init_gpu(use_gpu=not self.params['no_gpu'],
                     gpu_id=self.params['which_gpu'])

        #############
        ## ENV
        #############

        # Make the gym environment
        self.env = gym.make(self.params['env_name'])
        self.env.seed(seed)

        # import plotting (locally if 'obstacles' env)
        if not (self.params['env_name'] == 'obstacles-cs285-v0'):
            import matplotlib
            matplotlib.use('Agg')

        # Maximum length for episodes
        self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
        global MAX_VIDEO_LEN
        MAX_VIDEO_LEN = self.params['ep_len']

        # Is this env continuous, or self.discrete?
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        # Are the observations images?
        img = len(self.env.observation_space.shape) > 2
        self.params['agent_params']['discrete'] = discrete

        # Observation and action sizes
        ob_dim = (self.env.observation_space.shape
                  if img else self.env.observation_space.shape[0])
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params['agent_params']['ac_dim'] = ac_dim
        self.params['agent_params']['ob_dim'] = ob_dim

        # simulation timestep, will be used for video saving
        if 'model' in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        elif 'env_wrappers' in self.params:
            self.fps = 30  # This is not actually used when using the Monitor wrapper
        elif 'video.frames_per_second' in self.env.env.metadata.keys():
            self.fps = self.env.env.metadata['video.frames_per_second']
        else:
            self.fps = 10

        #############
        ## AGENT
        #############

        agent_class = self.params['agent_class']
        self.agent = agent_class(self.env, self.params['agent_params'])

    def run_training_loop(self, n_iter, collect_policy, eval_policy,
                          initial_expertdata=None, relabel_with_expert=False,
                          start_relabel_with_expert=1, expert_policy=None):
        """
        :param n_iter: number of (dagger) iterations
        :param collect_policy:
        :param eval_policy:
        :param initial_expertdata:
        :param relabel_with_expert: whether to perform dagger
        :param start_relabel_with_expert: iteration at which to start relabel with expert
        :param expert_policy:
        """

        # init vars at beginning of training
        self.total_envsteps = 0
        self.start_time = time.time()

        for itr in range(n_iter):
            print("\n\n********** Iteration %i ************" % itr)

            # decide if videos should be rendered/logged at this iteration
            if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1:
                self.logvideo = True
            else:
                self.logvideo = False
            self.log_video = self.logvideo

            # decide if metrics should be logged
            if self.params['scalar_log_freq'] == -1:
                self.logmetrics = False
            elif itr % self.params['scalar_log_freq'] == 0:
                self.logmetrics = True
            else:
                self.logmetrics = False

            # collect trajectories, to be used for training
            training_returns = self.collect_training_trajectories(
                itr, initial_expertdata, collect_policy, self.params['batch_size'])
            paths, envsteps_this_batch, train_video_paths = training_returns
            self.total_envsteps += envsteps_this_batch

            # add collected data to replay buffer
            self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            train_logs = self.train_agent()

            # log/save
            if self.logvideo or self.logmetrics:
                # perform logging
                print('\nBeginning logging procedure...')
                self.perform_logging(itr, paths, eval_policy, train_video_paths,
                                     train_logs)
                if self.params['save_params']:
                    self.agent.save('{}/agent_itr_{}.pt'.format(
                        self.params['logdir'], itr))

    ####################################
    ####################################

    def collect_training_trajectories(self, itr, load_initial_expertdata,
                                      collect_policy, batch_size):
        # TODO: get this from hw1
        # if your load_initial_expertdata is None, then you need to collect
        # new trajectories at *every* iteration
        if itr == 0 and load_initial_expertdata is not None:
            with open(load_initial_expertdata, 'rb') as f:
                paths = pickle.load(f)
            envsteps_this_batch = 0
        else:
            paths, envsteps_this_batch = utils.sample_trajectories(
                self.env, collect_policy, batch_size, self.params['ep_len'])

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
        train_video_paths = None
        if self.log_video:
            print('\nCollecting train rollouts to be used for saving videos...')
            ## TODO look in utils and implement sample_n_trajectories
            train_video_paths = utils.sample_n_trajectories(
                self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

        return paths, envsteps_this_batch, train_video_paths

    def train_agent(self):
        # TODO: get this from hw1
        print('\nTraining agent using sampled data from replay buffer...')
        train_logs = []
        for train_step in range(self.params['num_agent_train_steps_per_iter']):
            # TODO sample some data from the data buffer
            # HINT1: use the agent's sample function
            # HINT2: how much data = self.params['train_batch_size']
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch, logprob_batch = \
                self.agent.sample(self.params['train_batch_size'])

            # TODO use the sampled data to train an agent
            # HINT: use the agent's train function
            # HINT: keep the agent's training log for debugging
            train_log = self.agent.train(ob_batch, ac_batch, re_batch,
                                         next_ob_batch, terminal_batch,
                                         logprob_batch)
            train_logs.append(train_log)
        return train_logs

    ####################################
    ####################################

    def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs):

        last_log = all_logs[-1]

        #######################

        # collect eval trajectories, for logging
        print("\nCollecting data for eval...")
        eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(
            self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len'])

        # save eval rollouts as videos in tensorboard event file
        if self.logvideo and train_video_paths is not None:
            print('\nCollecting video rollouts eval')
            eval_video_paths = utils.sample_n_trajectories(
                self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

            # save train/eval videos
            print('\nSaving train rollouts as videos...')
            self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='train_rollouts')
            self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='eval_rollouts')

        #######################

        # save eval metrics
        if self.logmetrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]
            eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]
            eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

            # decide what to log
            logs = OrderedDict()
            logs["Eval_AverageReturn"] = np.mean(eval_returns)
            logs["Eval_StdReturn"] = np.std(eval_returns)
            logs["Eval_MaxReturn"] = np.max(eval_returns)
            logs["Eval_MinReturn"] = np.min(eval_returns)
            logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time
            logs.update(last_log)

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print('{} : {}'.format(key, value))
                self.logger.log_scalar(value, key, itr)
            print('Done logging...\n\n')

            self.logger.flush()
class RL_Trainer(object):

    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger, create TF session
        self.params = params
        self.logger = Logger(self.params['logdir'])
        self.sess = create_tf_session(self.params['use_gpu'],
                                      which_gpu=self.params['which_gpu'])

        # Set random seeds
        seed = self.params['seed']
        tf.random.set_seed(seed)
        np.random.seed(seed)

        #############
        ## ENV
        #############

        # Make the gym environment
        self.env = gym.make(self.params['env_name'])
        if 'env_wrappers' in self.params:
            # These operations are currently only for Atari envs
            self.env = wrappers.Monitor(self.env,
                                        os.path.join(self.params['logdir'], "gym"),
                                        force=True)
            self.env = params['env_wrappers'](self.env)
            self.mean_episode_reward = -float('nan')
            self.best_mean_episode_reward = -float('inf')
        self.env.seed(seed)

        # Maximum length for episodes
        self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
        MAX_VIDEO_LEN = self.params['ep_len']

        # Is this env continuous, or self.discrete?
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        # Are the observations images?
        img = len(self.env.observation_space.shape) > 2
        self.params['agent_params']['discrete'] = discrete

        # Observation and action sizes
        ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0]
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params['agent_params']['ac_dim'] = ac_dim
        self.params['agent_params']['ob_dim'] = ob_dim

        # simulation timestep, will be used for video saving
        if 'model' in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        elif 'env_wrappers' in self.params:
            self.fps = 30  # This is not actually used when using the Monitor wrapper
        else:
            self.fps = self.env.env.metadata['video.frames_per_second']

        #############
        ## AGENT
        #############

        agent_class = self.params['agent_class']
        self.agent = agent_class(self.sess, self.env, self.params['agent_params'])

        #############
        ## INIT VARS
        #############

        tf.global_variables_initializer().run(session=self.sess)

    def run_training_loop(self, n_iter, collect_policy, eval_policy,
                          initial_expertdata=None, relabel_with_expert=False,
                          start_relabel_with_expert=1, expert_policy=None):
        """
        :param n_iter: number of (dagger) iterations
        :param collect_policy:
        :param eval_policy:
        :param initial_expertdata:
        :param relabel_with_expert: whether to perform dagger
        :param start_relabel_with_expert: iteration at which to start relabel with expert
        :param expert_policy:
        """

        # init vars at beginning of training
        self.total_envsteps = 0
        self.start_time = time.time()

        for itr in range(n_iter):
            # print("\n\n********** Iteration %i ************" % itr)

            # decide if videos should be rendered/logged at this iteration
            if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1:
                self.logvideo = True
            else:
                self.logvideo = False

            # decide if metrics should be logged
            if self.params['scalar_log_freq'] == -1:
                self.logmetrics = False
            elif itr % self.params['scalar_log_freq'] == 0:
                self.logmetrics = True
            else:
                self.logmetrics = False

            # collect trajectories, to be used for training
            if isinstance(self.agent, DQNAgent):
                # only perform an env step and add to replay buffer for DQN
                self.agent.step_env()
                envsteps_this_batch = 1
                train_video_paths = None
                paths = None
            else:
                paths, envsteps_this_batch, train_video_paths = \
                    self.collect_training_trajectories(
                        itr, initial_expertdata, collect_policy,
                        self.params['batch_size'])
            self.total_envsteps += envsteps_this_batch

            # relabel the collected obs with actions from a provided expert policy
            if relabel_with_expert and itr >= start_relabel_with_expert:
                paths = self.do_relabel_with_expert(expert_policy, paths)

            # add collected data to replay buffer
            self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            loss = self.train_agent()

            # log/save
            if self.logvideo or self.logmetrics:
                # perform logging
                print('\nBeginning logging procedure...')
                if isinstance(self.agent, DQNAgent):
                    self.perform_dqn_logging()
                else:
                    self.perform_logging(itr, paths, eval_policy,
                                         train_video_paths, loss)

                # save policy
                if self.params['save_params']:
                    print('\nSaving agent\'s actor...')
                    self.agent.actor.save(self.params['logdir'] + '/policy_itr_' + str(itr))
                    self.agent.critic.save(self.params['logdir'] + '/critic_itr_' + str(itr))

    ####################################
    ####################################

    def collect_training_trajectories(self, itr, load_initial_expertdata,
                                      collect_policy, batch_size):
        # TODO: GETTHIS from HW1
        raise NotImplementedError

    def train_agent(self):
        # TODO: GETTHIS from HW1
        raise NotImplementedError

    def do_relabel_with_expert(self, expert_policy, paths):
        # TODO: GETTHIS from HW1 (although you don't actually need it for this homework)
        raise NotImplementedError

    ####################################
    ####################################

    def perform_dqn_logging(self):
        episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            self.mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            self.best_mean_episode_reward = max(self.best_mean_episode_reward,
                                                self.mean_episode_reward)

        logs = OrderedDict()
        logs["Train_EnvstepsSoFar"] = self.agent.t
        print("Timestep %d" % (self.agent.t,))
        if self.mean_episode_reward > -5000:
            logs["Train_AverageReturn"] = np.mean(self.mean_episode_reward)
            print("mean reward (100 episodes) %f" % self.mean_episode_reward)
        if self.best_mean_episode_reward > -5000:
            logs["Train_BestReturn"] = np.mean(self.best_mean_episode_reward)
            print("best mean reward %f" % self.best_mean_episode_reward)

        if self.start_time is not None:
            time_since_start = (time.time() - self.start_time)
            print("running time %f" % time_since_start)
            logs["TimeSinceStart"] = time_since_start

        sys.stdout.flush()

        for key, value in logs.items():
            print('{} : {}'.format(key, value))
            self.logger.log_scalar(value, key, self.agent.t)
        print('Done logging...\n\n')

        self.logger.flush()

    def perform_logging(self, itr, paths, eval_policy, train_video_paths, loss):

        # collect eval trajectories, for logging
        print("\nCollecting data for eval...")
        eval_paths, eval_envsteps_this_batch = sample_trajectories(
            self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len'])

        # save eval rollouts as videos in tensorboard event file
        if self.logvideo and train_video_paths is not None:
            print('\nCollecting video rollouts eval')
            eval_video_paths = sample_n_trajectories(
                self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

            # save train/eval videos
            print('\nSaving train rollouts as videos...')
            self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='train_rollouts')
            self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='eval_rollouts')

        # save eval metrics
        if self.logmetrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]
            eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]
            eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

            # decide what to log
            logs = OrderedDict()
            logs["Eval_AverageReturn"] = np.mean(eval_returns)
            logs["Eval_StdReturn"] = np.std(eval_returns)
            logs["Eval_MaxReturn"] = np.max(eval_returns)
            logs["Eval_MinReturn"] = np.min(eval_returns)
            logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time
            if isinstance(loss, dict):
                logs.update(loss)
            else:
                logs["Training loss"] = loss

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print('{} : {}'.format(key, value))
                self.logger.log_scalar(value, key, itr)
            print('Done logging...\n\n')

            self.logger.flush()
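# ----------------------------------------------------------------------------
# `get_wrapper_by_name` above comes from the course's dqn_utils; a minimal
# sketch of the usual implementation (an assumption, not the repo's exact code):
# it walks the gym wrapper stack until a wrapper whose class name matches.
# ----------------------------------------------------------------------------
def get_wrapper_by_name(env, classname):
    currentenv = env
    while True:
        if classname in currentenv.__class__.__name__:
            return currentenv
        elif isinstance(currentenv, gym.Wrapper):
            currentenv = currentenv.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)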
class RL_Trainer(object):

    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger
        self.params = params
        self.logger = Logger(self.params['logdir'])

        # Set random seeds
        seed = self.params['seed']
        np.random.seed(seed)
        torch.manual_seed(seed)
        ptu.init_gpu(use_gpu=not self.params['no_gpu'],
                     gpu_id=self.params['which_gpu'])

        #############
        ## ENV
        #############

        # Make the gym environment
        self.env = gym.make(self.params['env_name'])
        self.env.seed(seed)

        # Maximum length for episodes
        self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
        MAX_VIDEO_LEN = self.params['ep_len']

        # Is this env continuous, or self.discrete?
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.params['agent_params']['discrete'] = discrete

        # Observation and action sizes
        ob_dim = self.env.observation_space.shape[0]
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params['agent_params']['ac_dim'] = ac_dim
        self.params['agent_params']['ob_dim'] = ob_dim

        # simulation timestep, will be used for video saving
        if 'model' in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        else:
            self.fps = self.env.env.metadata['video.frames_per_second']

        #############
        ## AGENT
        #############

        agent_class = self.params['agent_class']
        self.agent = agent_class(self.env, self.params['agent_params'])

    def run_training_loop(self, n_iter, collect_policy, eval_policy,
                          initial_expertdata=None, relabel_with_expert=False,
                          start_relabel_with_expert=1, expert_policy=None):
        """
        :param n_iter: number of (dagger) iterations
        :param collect_policy:
        :param eval_policy:
        :param initial_expertdata:
        :param relabel_with_expert: whether to perform dagger
        :param start_relabel_with_expert: iteration at which to start relabel with expert
        :param expert_policy:
        """

        # init vars at beginning of training
        self.total_envsteps = 0
        self.start_time = time.time()

        for itr in range(n_iter):
            print("\n\n********** Iteration %i ************" % itr)

            # decide if videos should be rendered/logged at this iteration
            if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1:
                self.log_video = True
            else:
                self.log_video = False

            # decide if metrics should be logged
            if itr % self.params['scalar_log_freq'] == 0:
                self.log_metrics = True
            else:
                self.log_metrics = False

            # collect trajectories, to be used for training
            training_returns = self.collect_training_trajectories(
                itr, initial_expertdata, collect_policy,
                self.params['batch_size'])  # HW1: implement this function below
            paths, envsteps_this_batch, train_video_paths = training_returns
            self.total_envsteps += envsteps_this_batch

            # relabel the collected obs with actions from a provided expert policy
            if relabel_with_expert and itr >= start_relabel_with_expert:
                paths = self.do_relabel_with_expert(
                    expert_policy, paths)  # HW1: implement this function below

            # add collected data to replay buffer
            self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            training_logs = self.train_agent()  # HW1: implement this function below

            # log/save
            if self.log_video or self.log_metrics:
                # perform logging
                print('\nBeginning logging procedure...')
                self.perform_logging(itr, paths, eval_policy, train_video_paths,
                                     training_logs)

                if self.params['save_params']:
                    print('\nSaving agent params')
                    self.agent.save('{}/policy_itr_{}.pt'.format(
                        self.params['logdir'], itr))

    ####################################
    ####################################

    def collect_training_trajectories(
            self,
            itr,
            load_initial_expertdata,
            collect_policy,
            batch_size,
    ):
        """
        :param itr:
        :param load_initial_expertdata: path to expert data pkl file
        :param collect_policy: the current policy using which we collect data
        :param batch_size: the number of transitions we collect
        :return:
            paths: a list of trajectories
            envsteps_this_batch: the sum over the numbers of environment steps in paths
            train_video_paths: paths which also contain videos for visualization purposes
        """

        # TODO decide whether to load training data or use the current policy to collect more data
        # HINT: depending on if it's the first iteration or not, decide whether to either
        # (1) load the data. In this case you can directly return as follows
        # ``` return loaded_paths, 0, None ```
        # if it's the first iteration and you aren't loading data, then
        # `self.params['batch_size_initial']` is the number of transitions you want to collect
        if itr == 0:
            if load_initial_expertdata:
                with open(load_initial_expertdata, 'rb') as f:
                    paths = pickle.load(f)
                return paths, 0, None
            else:
                batch_size = self.params['batch_size_initial']

        # TODO collect `batch_size` samples to be used for training
        # HINT1: use sample_trajectories from utils
        # HINT2: you want each of these collected rollouts to be of length self.params['ep_len']
        print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = utils.sample_trajectories(
            self.env, collect_policy, batch_size, self.params['ep_len'])

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
        train_video_paths = None
        if self.log_video:
            print('\nCollecting train rollouts to be used for saving videos...')
            ## TODO look in utils and implement sample_n_trajectories
            train_video_paths = utils.sample_n_trajectories(
                self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

        return paths, envsteps_this_batch, train_video_paths

    def train_agent(self):
        print('\nTraining agent using sampled data from replay buffer...')
        all_logs = []
        for train_step in range(self.params['num_agent_train_steps_per_iter']):
            # TODO sample some data from the data buffer
            # HINT1: use the agent's sample function
            # HINT2: how much data = self.params['train_batch_size']
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = \
                self.agent.sample(self.params['train_batch_size'])

            # TODO use the sampled data to train an agent
            # HINT: use the agent's train function
            # HINT: keep the agent's training log for debugging
            train_log = self.agent.train(ob_batch, ac_batch, re_batch,
                                         next_ob_batch, terminal_batch)
            all_logs.append(train_log)
        return all_logs

    def do_relabel_with_expert(self, expert_policy, paths):
        print("\nRelabelling collected observations with labels from an expert policy...")

        # TODO relabel collected observations (from our policy) with labels from an expert policy
        # HINT: query the policy (using the get_action function) with paths[i]["observation"]
        # and replace paths[i]["action"] with these expert labels
        for i in range(len(paths)):
            obs = paths[i]["observation"]
            paths[i]["action"] = expert_policy.get_action(obs)
        return paths

    ####################################
    ####################################

    def perform_logging(self, itr, paths, eval_policy, train_video_paths,
                        training_logs):

        # collect eval trajectories, for logging
        print("\nCollecting data for eval...")
        eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(
            self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len'])

        # save eval rollouts as videos in tensorboard event file
        if self.log_video and train_video_paths is not None:
            print('\nCollecting video rollouts eval')
            eval_video_paths = utils.sample_n_trajectories(
                self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

            # save train/eval videos
            print('\nSaving train rollouts as videos...')
            self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='train_rollouts')
            self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps,
                                            max_videos_to_save=MAX_NVIDEO,
                                            video_title='eval_rollouts')

        # save eval metrics
        if self.log_metrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]
            eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]
            eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

            # decide what to log
            logs = OrderedDict()
            logs["Eval_AverageReturn"] = np.mean(eval_returns)
            logs["Eval_StdReturn"] = np.std(eval_returns)
            logs["Eval_MaxReturn"] = np.max(eval_returns)
            logs["Eval_MinReturn"] = np.min(eval_returns)
            logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time
            last_log = training_logs[-1]  # Only use the last log for now
            logs.update(last_log)

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print('{} : {}'.format(key, value))
                self.logger.log_scalar(value, key, itr)
            print('Done logging...\n\n')

            self.logger.flush()
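# ----------------------------------------------------------------------------
# For reference: the imports these trainer variants rely on. The cs285.* module
# paths follow the course repo layout and should be treated as assumptions.
# ----------------------------------------------------------------------------
from collections import OrderedDict
import os
import pickle
import sys
import time

import gym
from gym import wrappers
import numpy as np
import torch

from cs285.infrastructure import pytorch_util as ptu
from cs285.infrastructure import utils
from cs285.infrastructure.logger import Logger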
class RL_Trainer(object):

    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger
        self.params = params
        self.logger = Logger(self.params["logdir"])

        # Set random seeds
        seed = self.params["seed"]
        np.random.seed(seed)
        torch.manual_seed(seed)
        ptu.init_gpu(use_gpu=not self.params["no_gpu"], gpu_id=self.params["which_gpu"])

        #############
        ## ENV
        #############

        # Make the gym environment
        self.env = gym.make(self.params["env_name"])
        self.env.seed(seed)

        # import plotting (locally if 'obstacles' env)
        if not (self.params["env_name"] == "obstacles-cs285-v0"):
            import matplotlib
            matplotlib.use("Agg")

        # Maximum length for episodes
        self.params["ep_len"] = self.params["ep_len"] or self.env.spec.max_episode_steps
        global MAX_VIDEO_LEN
        MAX_VIDEO_LEN = self.params["ep_len"]

        # Is this env continuous, or self.discrete?
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        # Are the observations images?
        img = len(self.env.observation_space.shape) > 2
        self.params["agent_params"]["discrete"] = discrete

        # Observation and action sizes
        ob_dim = (
            self.env.observation_space.shape
            if img
            else self.env.observation_space.shape[0]
        )
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params["agent_params"]["ac_dim"] = ac_dim
        self.params["agent_params"]["ob_dim"] = ob_dim

        # simulation timestep, will be used for video saving
        if "model" in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        elif "env_wrappers" in self.params:
            self.fps = 30  # This is not actually used when using the Monitor wrapper
        elif "video.frames_per_second" in self.env.env.metadata.keys():
            self.fps = self.env.env.metadata["video.frames_per_second"]
        else:
            self.fps = 10

        #############
        ## AGENT
        #############

        agent_class = self.params["agent_class"]
        self.agent = agent_class(self.env, self.params["agent_params"])

    def run_training_loop(
        self,
        n_iter,
        collect_policy,
        eval_policy,
        initial_expertdata=None,
        relabel_with_expert=False,
        start_relabel_with_expert=1,
        expert_policy=None,
    ):
        """
        :param n_iter: number of (dagger) iterations
        :param collect_policy:
        :param eval_policy:
        :param initial_expertdata:
        :param relabel_with_expert: whether to perform dagger
        :param start_relabel_with_expert: iteration at which to start relabel with expert
        :param expert_policy:
        """

        # init vars at beginning of training
        self.total_envsteps = 0
        self.start_time = time.time()

        for itr in range(n_iter):
            print("\n\n********** Iteration %i ************" % itr)

            # decide if videos should be rendered/logged at this iteration
            if (
                itr % self.params["video_log_freq"] == 0
                and self.params["video_log_freq"] != -1
            ):
                self.logvideo = True
            else:
                self.logvideo = False
            self.log_video = self.logvideo

            # decide if metrics should be logged
            if self.params["scalar_log_freq"] == -1:
                self.logmetrics = False
            elif itr % self.params["scalar_log_freq"] == 0:
                self.logmetrics = True
            else:
                self.logmetrics = False

            # collect trajectories, to be used for training
            training_returns = self.collect_training_trajectories(
                itr, initial_expertdata, collect_policy, self.params["batch_size"]
            )
            paths, envsteps_this_batch, train_video_paths = training_returns
            self.total_envsteps += envsteps_this_batch

            # add collected data to replay buffer
            self.agent.add_to_replay_buffer(paths)

            # train agent (using sampled data from replay buffer)
            train_logs = self.train_agent()

            # log/save
            if self.logvideo or self.logmetrics:
                # perform logging
                print("\nBeginning logging procedure...")
                self.perform_logging(
                    itr, paths, eval_policy, train_video_paths, train_logs
                )
                if self.params["save_params"]:
                    self.agent.save(
                        "{}/agent_itr_{}.pt".format(self.params["logdir"], itr)
                    )

    ####################################
    ####################################

    def collect_training_trajectories(
        self, itr, load_initial_expertdata, collect_policy, batch_size
    ):
        # TODO: get this from hw1
        # if your load_initial_expertdata is None, then you need to collect
        # new trajectories at *every* iteration
        return paths, envsteps_this_batch, train_video_paths

    def train_agent(self):
        # TODO: get this from hw1
        return train_logs

    ####################################
    ####################################

    def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs):

        last_log = all_logs[-1]

        #######################

        # collect eval trajectories, for logging
        print("\nCollecting data for eval...")
        eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(
            self.env, eval_policy, self.params["eval_batch_size"], self.params["ep_len"]
        )

        # save eval rollouts as videos in tensorboard event file
        if self.logvideo and train_video_paths is not None:
            print("\nCollecting video rollouts eval")
            eval_video_paths = utils.sample_n_trajectories(
                self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True
            )

            # save train/eval videos
            print("\nSaving train rollouts as videos...")
            self.logger.log_paths_as_videos(
                train_video_paths,
                itr,
                fps=self.fps,
                max_videos_to_save=MAX_NVIDEO,
                video_title="train_rollouts",
            )
            self.logger.log_paths_as_videos(
                eval_video_paths,
                itr,
                fps=self.fps,
                max_videos_to_save=MAX_NVIDEO,
                video_title="eval_rollouts",
            )

        #######################

        # save eval metrics
        if self.logmetrics:
            # returns, for logging
            train_returns = [path["reward"].sum() for path in paths]
            eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]

            # episode lengths, for logging
            train_ep_lens = [len(path["reward"]) for path in paths]
            eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]

            # decide what to log
            logs = OrderedDict()
            logs["Eval_AverageReturn"] = np.mean(eval_returns)
            logs["Eval_StdReturn"] = np.std(eval_returns)
            logs["Eval_MaxReturn"] = np.max(eval_returns)
            logs["Eval_MinReturn"] = np.min(eval_returns)
            logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

            logs["Train_AverageReturn"] = np.mean(train_returns)
            logs["Train_StdReturn"] = np.std(train_returns)
            logs["Train_MaxReturn"] = np.max(train_returns)
            logs["Train_MinReturn"] = np.min(train_returns)
            logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

            logs["Train_EnvstepsSoFar"] = self.total_envsteps
            logs["TimeSinceStart"] = time.time() - self.start_time
            logs.update(last_log)

            if itr == 0:
                self.initial_return = np.mean(train_returns)
            logs["Initial_DataCollection_AverageReturn"] = self.initial_return

            # perform the logging
            for key, value in logs.items():
                print("{} : {}".format(key, value))
                self.logger.log_scalar(value, key, itr)
            print("Done logging...\n\n")

            self.logger.flush()