def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) # callback = self._init_callback(callback) # with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ # as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [[0.0] * self.num_agents] #MA-MOD episode_successes = [] #callback.on_training_start(locals(), globals()) #callback.on_rollout_start() reset = True obs = self.env.reset() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): env_action = [] # MA-MOD for i in range(self.num_agents ): # MA-MOD. This is fine for one policy. action = self.act[i]( np.array(obs[i])[None], update_eps=update_eps, **kwargs )[0] # TODO: Is this the correct way to get the correct agent obs? env_action.append(action) reset = False new_obs, rew, done, info = self.env.step( env_action ) # NOUPDATE - env.step should take a vector of actions # print("Agent 1", type(new_obs[0])) # for row in new_obs[0]: # print(' '.join(str(int(item)) for item in row)) # print("Agent 2", type(new_obs[1])) # for row in new_obs[1]: # print(' '.join(str(int(item)) for item in row)) ''' Obs: x_me, x_opp --- agent 1. In env: x_1, x_2 Obs: x_me, x_opp -- agent 2. In env: x_2, x_1 Env: (n_agents, state_dim) ''' self.num_timesteps += 1 # Store transition in the replay buffer. # Loop for replay buffer -- either separate or joined. obs[agent_index], action[agent_index], reward[agent_index] # Joey: Does this look right to you? 
for num_agent in range(self.num_agents): self.replay_buffer.add(obs[num_agent], env_action[num_agent], rew[num_agent], new_obs[num_agent], float(done[num_agent])) obs = new_obs # if writer is not None: # ep_rew = np.array([rew]).reshape((1, -1)) # ep_done = np.array([done]).reshape((1, -1)) # tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, # self.num_timesteps) # TODO: current episode_rewards is a list, make it a list of lists where each list is the reward for each agent in all timesteps # append the newest reward to the end of each list for each agent if isinstance(done, list): done = np.array(done) if done.any(): for num_agent in range(self.num_agents): #MA-MOD episode_rewards[-1][num_agent] += rew[num_agent] maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append([0.0] * self.num_agents) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # callback.on_rollout_end() for i in range(self.num_agents): # MA-MOD # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking # if writer is not None: # # run loss backprop with summary, but once every 100 steps save the metadata # # (memory, compute time, ...) # if (1 + self.num_timesteps) % 100 == 0: # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) # run_metadata = tf.RunMetadata() # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess, options=run_options, # run_metadata=run_metadata) # writer.add_run_metadata(run_metadata, 'step%d_agent%d' % (self.num_timesteps, i)) # else: # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess) # writer.add_summary(summary, self.num_timesteps) # else: td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: # NOUPDATE - not inside main agent for loop new_priorities = np.abs( td_errors) + self.prioritized_replay_eps # NOUPDATE assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) # callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
for i in range(self.num_agents): self.update_target[i](sess=self.sess) # MA-MOD if len(episode_rewards[-101:-1]) == 0: # MA-MOD mean_100ep_reward = -np.inf * np.ones((self.num_agents, )) else: mean_100ep_reward = np.mean(episode_rewards[-101:-1], axis=0) # below is what's logged in terminal. num_episodes = len(episode_rewards) #MA-MOD if self.verbose >= 1 and done.any( ) and log_interval is not None and len( episode_rewards) % log_interval == 0: #MA-MOD logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:], axis=0)) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() return self
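# Illustrative sketch (standalone, not part of the original class): how the LinearSchedule
# created above anneals epsilon. The default values below are assumptions for illustration;
# the real object reads them from exploration_fraction / exploration_initial_eps /
# exploration_final_eps.
def linear_epsilon(step, total_timesteps, exploration_fraction=0.1,
                   initial_eps=1.0, final_eps=0.02):
    """Mirror LinearSchedule.value: interpolate from initial_eps to final_eps over the
    first exploration_fraction * total_timesteps steps, then hold the final value."""
    schedule_timesteps = int(exploration_fraction * total_timesteps)
    fraction = min(float(step) / max(schedule_timesteps, 1), 1.0)
    return initial_eps + fraction * (final_eps - initial_eps)

# Example: 5% of the way through training with the defaults above, epsilon is 0.51.
assert abs(linear_epsilon(5_000, 100_000) - 0.51) < 1e-6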
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() each_reward_episode = 0 self.my_counter for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. 
- self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # with self.sess.as_default(): # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] # env_action = action if self.my_counter % 10 == 1: if self.counter_success < 4: self.Predicted_Q = IMC(self.Q, self.Nq, self.action_features, self.state_features, 1) if self.episode_num < 1000 and self.my_counter > 0: a = self.StateHash(self.current_state) env_action = np.argmax(self.Predicted_Q[:, a]) #print(env_action) if self.episode_num == 40: if self.counter_success < 4: print('init') self.num_timesteps = 0 self.episode_num = 0 self.my_counter = 0 self.counter_success = 0 self.Q = np.zeros_like(self.Q) self.Nq = np.ones_like(self.Nq) self.Predicted_Q = np.zeros_like(self.Q) self.temp_mem = [] self.episode_rewards = [] self.loss = [] each_reward_episode = 0 if self.episode_num == 0: if self.init_action < len(self.action_spac): env_action = int(self.init_action) self.init_action += 1 else: self.init_action = 0 env_action = int(self.init_action) reset = False new_obs, rew, done, info = self.env.step(env_action) action = env_action #print(new_obs) if new_obs[0] > 0.48: rew = 10 self.counter_success += 1 print('****** reward ********') #each_reward_episode+=10 self.new_state = new_obs if self.my_counter > 1: self.UpdateQ(self.current_state, self.new_state, env_action, rew) each_reward_episode += rew self.current_state = self.new_state if new_obs[0] > 0.48 or done: if self.episode_num % 50 == 0: print(self.episode_num, self.my_counter, '--> ', each_reward_episode, new_obs[0], done) self.episode_rewards.append(each_reward_episode) each_reward_episode = 0 self.episode_num += 1 self.num_timesteps += 1 self.my_counter += 1 # Stop training if return value is False callback.update_locals(locals()) if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
# pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps), env=self._vec_normalize_env) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size, env=self._vec_normalize_env) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() callback.on_training_end() return self
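# Illustrative sketch (standalone, not the PrioritizedReplayBuffer API used above): how TD
# errors become sampling priorities and importance weights in proportional prioritized replay.
# In the code above only |td_error| + eps is passed to update_priorities and alpha is applied
# inside the buffer; here both steps are folded together, with alpha/beta assumed for the example.
import numpy as np

def per_priorities_and_weights(td_errors, sample_probs, buffer_size,
                               alpha=0.6, beta=0.4, eps=1e-6):
    # Priority is |TD error|^alpha, with eps so no transition gets zero probability.
    new_priorities = (np.abs(td_errors) + eps) ** alpha
    # Importance-sampling weights correct for non-uniform sampling, normalised by the
    # largest weight so gradients are only ever scaled down.
    weights = (buffer_size * sample_probs) ** (-beta)
    return new_priorities, weights / weights.max()

priorities, weights = per_priorities_and_weights(
    td_errors=np.array([0.5, 0.1, 2.0]),
    sample_probs=np.array([0.20, 0.05, 0.75]),
    buffer_size=3)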
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma) self.episode_reward = np.zeros((self.n_envs, )) # Training stats (when using Monitor wrapper) ep_info_buf = deque(maxlen=100) t_start = time.time() for update in range(1, total_timesteps // self.n_batch + 1): # true_reward is the reward without discount obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) _, value_loss, policy_entropy = self._train_step( obs, states, rewards, masks, actions, values, self.num_timesteps // (self.n_batch + 1), writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) self.num_timesteps += self.n_batch + 1 if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'ep_len_mean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.dump_tabular() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) start_time = time.time() episode_rewards = [0.0] is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() else: obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 infos_values = [] mb_infos_vals = [] for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if step < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) ep_len += 1 if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: print("{} steps".format(ep_len)) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, step) if ep_len > self.train_freq: print("Additional training") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) done = True episode_rewards[-1] += reward if done: if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() print("Episode finished. 
Reward: {:.2f} {} Steps".format( episode_rewards[-1], ep_len)) episode_rewards.append(0.0) ep_len = 0 mb_infos_vals = self.optimize(step, writer, current_lr) # Refresh obs when using TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] if is_teleop_env: self.env.is_training = False # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) return self
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_true_rewards = [0.0] episode_successor_features = [ np.zeros(self.observation_space.shape[0]) ] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() h_step = 0 # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() n_updates = 0 infos_values = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() for step in range(total_timesteps): # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape if self.using_mdal: # reward = reward_giver.get_reward(observation, (step+1) * covariance_lambda) # covariance_lambda = (step+1) / (step + 2) * covariance_lambda \ # + np.matmul(np.expand_dims(observation, axis=1), np.expand_dims(observation, axis=0))\ # / (step + 2) reward = self.reward_giver.get_reward(obs) new_obs, true_reward, done, info = self.env.step( unscaled_action) else: new_obs, reward, done, info = self.env.step( unscaled_action) true_reward = reward self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. callback.update_locals(locals()) if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, reward true_reward_ = true_reward # Store transition in the replay buffer. 
self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward_]).reshape((1, -1)) ep_true_reward = np.array([true_reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) # tf_util.total_episode_reward_logger(self.episode_reward, ep_reward, # ep_done, writer, self.num_timesteps) tf_util.total_episode_reward_logger( self.episode_reward, ep_true_reward, ep_done, writer, self.num_timesteps) if self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) # Update Rewards if self.using_mdal and self.num_timesteps % self.update_reward_freq == 0 and self.num_timesteps > 1: policy_successor_features = np.mean( episode_successor_features[-11:-1], axis=0) self.reward_giver.update_reward(policy_successor_features) callback.on_rollout_start() episode_rewards[-1] += reward_ episode_true_rewards[-1] += true_reward_ if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) episode_true_rewards.append(0.0) episode_successor_features.append( np.zeros(self.observation_space.shape[0])) h_step = 0 else: episode_successor_features[-1] = np.add( episode_successor_features[-1], (1 - self.gamma) * (self.gamma**h_step) * obs_) h_step += 1 maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf mean_true_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) mean_true_reward = round( float(np.mean(episode_true_rewards[-101:-1])), 1) # substract 1 as we appended a new term just now num_episodes = len(episode_rewards) - 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv("mean 100 episode true reward", mean_true_reward) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'eplenmean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", 
fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] callback.on_training_end() return self
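# Illustrative sketch (standalone): the discounted feature average accumulated above as
# episode_successor_features, i.e. (1 - gamma) * sum_t gamma^t * phi(s_t) with phi the
# identity on observations. The gamma below is an assumed value.
import numpy as np

def successor_features(states, gamma=0.99):
    states = np.asarray(states, dtype=float)
    discounts = (1.0 - gamma) * gamma ** np.arange(len(states))
    return (discounts[:, None] * states).sum(axis=0)

# For a constant state the feature tends to (1 - gamma^T) * s, i.e. toward s itself.
print(successor_features(np.ones((500, 3)), gamma=0.99))  # ~0.993 per component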
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) last_replay_update = 0 if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] self.active_sampling = False initial_step = self.num_timesteps if self.buffer_is_prioritized and \ ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer") or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer")) \ and self.num_timesteps >= self.prioritization_starts: self._set_prioritized_buffer() for step in range(initial_step, total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() else: action = self.policy_tf.step(obs[None]).flatten() # Add noise to the action, as the policy # is deterministic, this is required for exploration if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) # Store transition in the replay buffer. 
self.replay_buffer.add( obs, action, reward, new_obs, float(done if not self.time_aware else done and info["termination"] != "steps")) obs = new_obs if ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "RankPrioritizedReplayBuffer")\ or self.replay_buffer.__name__ == "RankPrioritizedReplayBuffer") and \ self.num_timesteps % self.buffer_size == 0: self.replay_buffer.rebalance() # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - self.num_timesteps / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) # Note: the policy is updated less frequently than the Q functions # this is controlled by the `policy_delay` parameter step_writer = writer if grad_step % self.write_freq == 0 else None mb_infos_vals.append( self._train_step(step, step_writer, current_lr, (step + grad_step) % self.policy_delay == 0)) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if isinstance(self.replay_buffer, DiscrepancyReplayBuffer ) and n_updates - last_replay_update >= 5000: self.replay_buffer.update_priorities() last_replay_update = n_updates if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): if self.active_sampling: sample_obs, sample_state = self.env.get_random_initial_states( 25) obs_discrepancies = self.policy_tf.get_q_discrepancy( sample_obs) obs = self.env.reset( **sample_state[np.argmax(obs_discrepancies)]) else: obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 if self.buffer_is_prioritized and \ ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer") or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer"))\ and self.num_timesteps >= self.prioritization_starts: self._set_prioritized_buffer() # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) 
logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_discrim_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) discrim_ac_buf = deque(maxlen=20) discrim_train_ac_buf = deque(maxlen=20) discrim_loss_buf = deque(maxlen=200) n_updates = 0 infos_values = [] discrim_loss_val = 0 discrim_ac_val = 0 current_discrim_weight = None current_discrim_lr = None for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) skill = info['skill'] agent_discrim_components, expert_discrim_components = [info['prev_agent_obs'].flat], [info['prev_expert_obs'].flat] if self.discrim_include_next_state: agent_discrim_components.append(info['agent_obs'].flat) expert_discrim_components.append(info['expert_obs'].flat) if self.discrim_include_skill: agent_discrim_components.append(skill) expert_discrim_components.append(skill) agent_discrim_obs = np.concatenate(agent_discrim_components, axis=0) expert_discrim_obs = np.concatenate(expert_discrim_components, axis=0) # Collect Entire trajectories of data for discrim test buffer vs regular discrim buffer if len(episode_rewards) % self.discrim_test_data_collection_freq == 0 and len(episode_rewards) >= self.discrim_test_data_collection_freq: # if step % self.discrim_test_data_collection_freq: self.test_discrim_replay_buffer.add(agent_discrim_obs, None, None, np.array([0.95]), 0.0) self.test_discrim_replay_buffer.add(expert_discrim_obs, None, None, np.array([0.05]), 0.0) else: self.discrim_replay_buffer.add(agent_discrim_obs, None, None, np.array([0.95]), 0.0) self.discrim_replay_buffer.add(expert_discrim_obs, None, None, np.array([0.05]), 0.0) # 
Store transition in the replay buffer. self.replay_buffer.add(np.concatenate((obs, agent_discrim_obs)), action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) current_discrim_weight = max(self.discrim_weight * frac, 0.1) if self.discrim_decay else self.discrim_weight # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr, current_discrim_weight)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if step % self.discrim_train_freq == 0 and float(step) / total_timesteps < self.discrim_stop: mb_discrim_loss = [] for grad_step in range(self.discrim_grad_steps): if not self.discrim_replay_buffer.can_sample(self.discrim_batch_size): break current_discrim_lr = self.discrim_lr * max(0, 1.0 - float(step) / (self.discrim_stop * total_timesteps)) discrim_loss = self._discrim_train_step(step, current_discrim_lr) mb_discrim_loss.append(discrim_loss) if len(mb_discrim_loss) > 0: discrim_loss_val = np.mean(mb_discrim_loss) # assess the discrim accuracy if step % self.discrim_test_data_collection_freq and self.test_discrim_replay_buffer.can_sample(self.discrim_batch_size): discrim_ac_buf.append(self._get_discrim_ac()) discrim_train_ac_buf.append(self._get_discrim_ac(test=False)) episode_rewards[-1] += reward cur_discrim_reward, cur_discrim_loss = self._get_discrim_reward(agent_discrim_obs) episode_discrim_rewards[-1] += cur_discrim_reward[0][0] discrim_loss_buf.append(cur_discrim_loss) if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) episode_discrim_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in 
ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("discrim train loss", discrim_loss_val) if len(discrim_ac_buf) > 0: logger.logkv("discrim test ac", safe_mean([ac for ac in discrim_ac_buf])) if len(discrim_train_ac_buf) > 0: logger.logkv("discrim train ac", safe_mean([ac for ac in discrim_train_ac_buf])) if len(episode_discrim_rewards) > 50: logger.logkv("mean 50 ep discrim rew", safe_mean(episode_discrim_rewards[-50:])) if len(discrim_loss_buf) > 0: logger.logkv("discrim pred loss", safe_mean([l for l in discrim_loss_buf])) if current_discrim_weight: logger.logkv("cur discrim weight", current_discrim_weight) if current_discrim_lr: logger.logkv("cur discrim lr", current_discrim_lr) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
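# Illustrative sketch (standalone, assumption): _get_discrim_reward above is defined elsewhere
# in the class; a common GAIL-style choice consistent with the 0.95/0.05 agent/expert labels
# used above is to reward the agent for looking expert-like to the discriminator, e.g.
# -log(D(s)) where D outputs the probability that a sample came from the agent.
import numpy as np

def gail_style_reward(d_agent_prob, eps=1e-8):
    """Higher reward the less agent-like (more expert-like) the discriminator finds the sample."""
    return -np.log(np.clip(d_agent_prob, eps, 1.0))

print(gail_style_reward(np.array([0.95, 0.5, 0.05])))  # ~[0.05, 0.69, 3.0]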
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 return self
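# Illustrative sketch (standalone): the update_param_noise_threshold formula used above,
# -log(1 - eps + eps / n_actions), which tunes parameter-space noise so the perturbed policy
# diverges from the unperturbed one roughly as much as eps-greedy exploration with the same
# eps (Plappert et al., 2017, Appendix C.1).
import numpy as np

def param_noise_threshold(eps, n_actions):
    return -np.log(1.0 - eps + eps / float(n_actions))

print(param_noise_threshold(0.1, 4))   # ~0.078
print(param_noise_threshold(0.02, 4))  # ~0.015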
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs,)) ep_info_buf = deque(maxlen=100) t_first_start = time.time() n_updates = total_timesteps // self.n_batch start_decay = n_updates pp_sr_buf = deque(maxlen=5) for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) if self.curriculum: if 'FetchStack' in self.env.get_attr('spec')[0].id: # Stacking pp_sr = pp_eval_model(self.eval_env, self) pp_sr_buf.append(pp_sr) print('Pick-and-place success rate', np.mean(pp_sr_buf)) if start_decay == n_updates and np.mean(pp_sr_buf) > 0.8: start_decay = update _ratio = np.clip(0.7 - 0.8 * (update - start_decay) / 380, 0.3, 0.7) # from 0.7 to 0.3 elif 'FetchPushWallObstacle' in self.env.get_attr('spec')[0].id: _ratio = max(1.0 - (update - 1.0) / n_updates, 0.0) elif 'MasspointMaze-v3' in self.env.get_attr('spec')[0].id: _ratio = 1.0 - (update - 1.0) / n_updates else: raise NotImplementedError self.env.env_method('set_random_ratio', _ratio) print('Set random_ratio to', self.env.get_attr('random_ratio')[0]) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run() self.num_timesteps += self.n_batch ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] 
mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('ep_success_rate', safe_mean([ep_info['is_success'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break return self
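# Illustrative sketch (standalone): the minibatching scheme used in the non-recurrent branch
# above -- shuffle all n_batch sample indices once per optimisation epoch, then slice
# contiguous minibatches of size n_batch // nminibatches. The sizes below are assumed values.
import numpy as np

n_batch, nminibatches, noptepochs = 8, 2, 3
batch_size = n_batch // nminibatches
inds = np.arange(n_batch)
rng = np.random.default_rng(0)
for epoch_num in range(noptepochs):
    rng.shuffle(inds)
    for start in range(0, n_batch, batch_size):
        mbinds = inds[start:start + batch_size]
        # each slice of (obs, returns, masks, actions, values, neglogpacs)[mbinds]
        # would be passed to _train_step with the current lr / cliprange
        print(epoch_num, mbinds)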
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) if self.priority_buffer: self.replay_buffer.set_model(self) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) self.env_id = self.env.env.get_attr('spec')[0].id # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() store_time = 0.0 step_time = 0.0 train_time = 0.0 episode_rewards = [[0.0] for _ in range(self.env.env.num_envs)] episode_successes = [[] for _ in range(self.env.env.num_envs)] if self.action_noise is not None: self.action_noise.reset() assert isinstance(self.env.env, VecEnv) self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] pp_sr_buf = deque(maxlen=5) stack_sr_buf = deque(maxlen=5) start_decay = total_timesteps if self.sequential and 'FetchStack' in self.env_id: current_max_nobject = 2 self.env.env.env_method('set_task_array', [[(2, 0), (2, 1), (1, 0)]] * self.env.env.num_envs) print('Set task_array to ', self.env.env.get_attr('task_array')[0]) self.env.env.env_method('set_random_ratio', [0.7] * self.env.env.num_envs) obs = self.env.reset() print(obs.shape) for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break if self.curriculum and step % 3000 == 0: if 'FetchStack' in self.env.env.get_attr('spec')[0].id: # Stacking pp_sr = eval_model( self.eval_env, self, current_max_nobject if self.sequential else self.env.env.get_attr('n_object')[0], 1.0, init_on_table=(self.env.env.get_attr('spec')[0].id == 'FetchStack-v2')) pp_sr_buf.append(pp_sr) stack_sr = eval_model( self.eval_env, self, current_max_nobject if self.sequential else self.env.env.get_attr('n_object')[0], 0.0, init_on_table=(self.env.env.get_attr('spec')[0].id == 'FetchStack-v2')) stack_sr_buf.append(stack_sr) print('Pick-and-place success rate', np.mean(pp_sr_buf)) if self.sequential: if self.env.env.get_attr('random_ratio')[ 0] > 0.5 and np.mean(pp_sr_buf) > 0.8: _ratio = 0.3 elif self.env.env.get_attr('random_ratio')[0] < 0.5 \ and current_max_nobject < self.env.env.get_attr('n_object')[0] \ and np.mean(stack_sr_buf) > 1 / current_max_nobject: _ratio = 0.7 current_max_nobject += 1 previous_task_array = self.env.env.get_attr( 'task_array')[0] self.env.env.env_method( 'set_task_array', [ previous_task_array + [(current_max_nobject, j) for j in range(current_max_nobject)] ] * self.env.env.num_envs) print('Set task_array to', self.env.env.get_attr('task_array')[0]) else: _ratio = self.env.env.get_attr( 'random_ratio')[0] else: if start_decay == total_timesteps and np.mean( pp_sr_buf) > 0.8: start_decay = step _ratio = np.clip(0.7 - (step - start_decay) / 2e6, 0.3, 0.7) # from 0.7 to 0.3 elif 'FetchPushWallObstacle' in self.env_id: _ratio = max(1.0 - step / total_timesteps, 0.0) else: raise NotImplementedError self.env.env.env_method('set_random_ratio', [_ratio] * self.env.env.num_envs) print('Set random_ratio to', self.env.env.get_attr('random_ratio')[0]) # Before training 
starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): rescaled_action = np.stack([ self.env.action_space.sample() for _ in range(self.env.env.num_envs) ], axis=0) action = rescaled_action else: action = self.policy_tf.step(obs, deterministic=False) # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == ( self.env.env.num_envs, ) + self.env.action_space.shape step_time0 = time.time() new_obs, reward, done, info = self.env.step(rescaled_action) step_time += time.time() - step_time0 next_obs = new_obs.copy() for idx, _done in enumerate(done): if _done: next_obs[idx] = self.env.convert_dict_to_obs( info[idx]['terminal_observation']) # Store transition in the replay buffer. store_time0 = time.time() self.replay_buffer.add(obs, action, reward, next_obs, done) store_time += time.time() - store_time0 obs = new_obs for idx, _done in enumerate(done): episode_rewards[idx][-1] += reward[idx] if _done: episode_rewards[idx].append(0.0) maybe_is_success = info[idx].get('is_success') if maybe_is_success is not None: episode_successes[idx].append( float(maybe_is_success)) # Retrieve reward and episode length if using Monitor wrapper for _info in info: maybe_ep_info = _info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.reshape(reward, (self.env.env.num_envs, -1)) ep_done = np.reshape(done, (self.env.env.num_envs, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) train_time0 = time.time() if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) train_time += time.time() - train_time0 if len(episode_rewards[0][-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float( np.mean( np.concatenate([ episode_rewards[i][-101:-1] for i in range(self.env.env.num_envs) ]))), 1) num_episodes = sum([ len(episode_rewards[i]) for i in range(len(episode_rewards)) ]) self.num_timesteps += self.env.env.num_envs # Display training infos if self.verbose >= 1 and done[ 0] and log_interval is not None and len( episode_rewards[0]) % (log_interval // self.env.env.num_envs) == 0: fps = int(self.num_timesteps / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", 
mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes[0]) > 0: logger.logkv( "success rate", np.mean( np.concatenate([ episode_successes[i][-100:] for i in range(self.env.env.num_envs) ]))) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) if hasattr(self.eval_env.unwrapped, 'random_ratio'): logger.logkv("random_ratio", self.env.env.get_attr('random_ratio')[0]) logger.dumpkvs() # Reset infos: infos_values = [] return self
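# The piecewise random_ratio decay used in the non-sequential FetchStack branch
# above can be read as a clipped linear schedule. A sketch of the same rule
# (the 0.7 / 0.3 bounds and the 2e6-step window come from the code above; the
# start_decay=None convention is an assumption of this sketch):
import numpy as np

def random_ratio_schedule(step, start_decay, high=0.7, low=0.3, decay_steps=2e6):
    """Anneal the task-sampling ratio from `high` to `low` once decay has been triggered."""
    if start_decay is None:  # decay not triggered yet (pick-and-place success rate still below threshold)
        return high
    return float(np.clip(high - (step - start_decay) / decay_steps, low, high))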
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="Dual", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) top_callback = SaveOnTopRewardCallback(check_freq=self.n_steps, logdir=self.tensorboard_log, models_num=self.models_num) callback.append(top_callback) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.models[0].graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer: for model in self.models: model._setup_learn(self) t_first_start = time.time() n_updates = total_timesteps // (self.n_envs * self.n_steps) callback.on_training_start(locals(), globals()) for update in range(1, n_updates + 1): assert (self.n_envs * self.n_steps) % self.nminibatches == 0, ("The number of minibatches (`nminibatches`) is not a factor of the total number of samples collected per rollout (`n_batch`), some samples won't be used.") batch_size = (self.n_envs * self.n_steps) // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) callback.on_rollout_start() rollouts = self.runner.run(callback) #execute episode callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break # Unpack i = 0 steps_used = rollouts[-1] for rollout in rollouts[0]: obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward, success_stages = rollout model = self.models[i] # calc = len(true_reward) # model.n_batch = calc if model.n_batch == 0: b = 0 else: self.ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = max(model.n_batch // self.nminibatches // self.noptepochs, 1) inds = np.arange(len(obs))#np.arange(model.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, model.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ((epoch_num * model.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] if len(obs) > 1: slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, model=self.models[i], writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: mb_loss_vals.append((0,0,0,0,0)) i+=1 else: exit("does not support recurrent version") loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(model.n_batch / (t_now - t_start)) if writer is not None: n_steps = model.n_batch try: total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, n_steps)), masks.reshape((self.n_envs, n_steps)), writer, self.num_timesteps) except: print("Failed to log episode reward of shape {}".format(true_reward.shape)) summary = tf.Summary(value=[tf.Summary.Value(tag='episode_reward/Successful stages', simple_value=success_stages)]) writer.add_summary(summary, self.num_timesteps) #@TODO plot in one graph: for i, val in enumerate(steps_used): summary = tf.Summary(value=[tf.Summary.Value(tag='episode_reward/Used steps net {}'.format(i), simple_value=val)]) writer.add_summary(summary, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = 
explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("Steps", steps_used) logger.logkv("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, model.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() callback.on_training_end() return self
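# The per-update scalars written earlier in this function ('Successful stages',
# per-net used steps) use the raw TF1 Summary protobuf. A small helper in the
# same TF1 style could remove the repetition; `writer` is the FileWriter produced
# by the TensorboardWriter context, as in the code above.
import tensorflow as tf

def log_scalar(writer, tag, value, step):
    """Write a single scalar to TensorBoard via the TF1 Summary protobuf."""
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=float(value))])
    writer.add_summary(summary, step)

# e.g. log_scalar(writer, 'episode_reward/Successful stages', success_stages, self.num_timesteps)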
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2"): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # この時点でself.n_steps=256 runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs,)) ep_info_buf = deque(maxlen=100) t_first_start = time.time() n_timesteps = 0 # nupdates = total_timesteps // self.n_batch for timestep in range(1, total_timesteps + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - timestep / total_timesteps lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run() self.n_steps = len(true_reward) n_timesteps += len(obs) #print("len(obs): {}, np.shape(obs): {}, np.shape(rewards): {}".format(len(obs), np.shape(obs), np.shape(returns))) #print("np.shape(actions): {}, np.shape(rewards): {}".format(np.shape(actions), np.shape(values))) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): # noptepochs回 np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): # self.n_batch個のデータをbatch_size間隔で等分し_train_stepに渡す. # timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // # batch_size) end = start + batch_size #print("batch_size: {}".format(batch_size)) mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=n_timesteps)) else: # recurrent version assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for stan_timestepsrt in range(0, self.n_envs, envs_per_batch): # timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // # envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=n_timesteps, writer=writer, states=mb_states)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, n_timesteps) if self.verbose >= 1 and (timestep % log_interval == 0 or timestep == 1): explained_var = explained_variance(values, returns) logger.logkv("total_timesteps", n_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for 
ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break if n_timesteps > total_timesteps: break return self
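# The non-recurrent update path above follows the usual PPO minibatch pattern:
# reshuffle a flat index array every epoch, then slice it into contiguous
# minibatches. A self-contained sketch of that pattern:
import numpy as np

def iterate_minibatches(n_samples, batch_size, n_epochs, rng=np.random):
    """Yield index arrays that cover all samples, reshuffled at every epoch."""
    inds = np.arange(n_samples)
    for _ in range(n_epochs):
        rng.shuffle(inds)
        for start in range(0, n_samples, batch_size):
            yield inds[start:start + batch_size]

# usage sketch:
# for mbinds in iterate_minibatches(self.n_batch, batch_size, self.noptepochs):
#     slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))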
def run(self, num_steps, data_dir, policy_record=None):
    local_steps = int(num_steps / self.comm.Get_size())
    steps = 0

    # TODO: Add alpha annealing over num_steps

    while True:
        # sync weights
        self.b_agent_attack.sync_weights()
        self.b_agent_evade.sync_weights()
        self.b_agent_transit.sync_weights()
        self.m_agent.sync_weights()

        # create placeholders to store experience that we gather
        training_state = {"meta": [], "attack": [], "evade": [], "transit": []}
        training_action = {"meta": [], "attack": [], "evade": [], "transit": []}
        training_reward = {"meta": [], "attack": [], "evade": [], "transit": []}
        training_next_state = {"meta": [], "attack": [], "evade": [], "transit": []}
        training_done = {"meta": [], "attack": [], "evade": [], "transit": []}
        training_reward_sum_combined = 0  # keep track of combined reward over all episodes between training

        state = self.env.reset()
        reward_sum = {}
        done = False

        while not done:
            complete_action, distribution, beh_actions, label = self.m_agent.get_action(state)
            next_state, reward, done, info = self.env.step(complete_action, dst=distribution, label=label)

            # Aggregate reward throughout the episode
            if not reward_sum:
                reward_sum = reward
            else:
                reward_sum = {k: reward_sum[k] + reward[k] for k in reward}
            training_reward_sum_combined += reward["combined"]

            training_state["meta"].append(state)
            training_action["meta"].append(distribution)
            training_reward["meta"].append(reward["combined"])
            training_next_state["meta"].append(next_state)
            training_done["meta"].append(done)

            for idx, label in enumerate(['attack', 'evade', 'transit']):
                training_state[label].append(state)
                training_action[label].append(beh_actions[idx])
                training_reward[label].append(reward[label])
                training_next_state[label].append(next_state)
                training_done[label].append(done)

            state = next_state

        # # now we have batches of data: compute the values and advantages
        # training_value = {"meta": None, "attack": None, "evade": None, "transit": None}
        # training_advantages = {"meta": None, "attack": None, "evade": None, "transit": None}

        # log tensorboard
        for (k, v) in reward_sum.items():
            logger.logkv(k, v)
        logger.dumpkvs()

        # compute advantages and values
        models = [self.b_agent_attack, self.b_agent_evade, self.b_agent_transit, self.m_agent]

        for model in models:
            network = model.label

            states = training_state[network]
            actions = training_action[network]
            reward = training_reward[network]
            next_states = training_next_state[network]
            done = training_done[network]

            # Convert done bools to ints and invert (1 = not done)
            done_int = np.invert(done).astype(np.int64)

            # Generalized advantage estimation (gets advantages to train on and value estimates)
            target, advantages = GAE(states, actions, reward, next_states, done_int,
                                     model.sample_value, T=128, y=0.99, lam=0.95, use_Q=False)

            # train this model
            dataset = Dataset(dict(ob=np.asarray(states), ac=np.asarray(actions),
                                   atarg=np.asarray(advantages), vtarg=np.asarray(target)),
                              shuffle=True)
            for k in range(4):
                for i, batch in enumerate(dataset.iterate_once(len(states))):
                    model.train(batch["ob"], batch["ac"], batch["vtarg"], batch["atarg"], 1.0)

        print('FINISHED TRAINING EPISODE')
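# The GAE(...) call above (y=0.99, lam=0.95) is the project's own helper. For
# reference, a minimal NumPy version of generalized advantage estimation over a
# single trajectory looks like this; `values` must carry one extra bootstrap
# entry for the state reached after the last step.
import numpy as np

def gae(rewards, values, not_dones, gamma=0.99, lam=0.95):
    """Return (value_targets, advantages); rewards/not_dones have length T, values length T + 1."""
    T = len(rewards)
    advantages = np.zeros(T)
    last_gae = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] * not_dones[t] - values[t]
        last_gae = delta + gamma * lam * not_dones[t] * last_gae
        advantages[t] = last_gae
    return advantages + values[:T], advantages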
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO2"): with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) runner_resolution = Runner_resolution(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs,)) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch for update in range(nupdates + 1): assert self.n_batch % self.nminibatches == 0 n_batch_train = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update / (nupdates + 1)) lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount action_omega = random.randint(1,21) for i in range(action_omega): if i ==0: obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run() ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, n_batch_train): timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // n_batch_train) end = start + n_batch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep)) else: # recurrent version assert self.n_envs % self.nminibatches == 0 envinds = np.arange(self.n_envs) flatinds = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps) envsperbatch = n_batch_train // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(envinds) for start in range(0, self.n_envs, envsperbatch): timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envsperbatch) end = start + envsperbatch mb_env_inds = envinds[start:end] mb_flat_inds = flatinds[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep, writer=writer, states=mb_states)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, update * (self.n_batch + 1)) if callback is not None: callback(locals(), globals()) if self.verbose >= 1 and ((update + 1) % log_interval//100 == 0 or update == 0): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", (update + 1) * self.n_steps) logger.logkv("nupdates", (update + 1)) logger.logkv("total_timesteps", (update + 1) * self.n_batch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() elif 
i>0: obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner_resolution.run() ep_info_buf.extend(ep_infos) mb_loss_vals = [] print("omega",i) if states is None: # nonrecurrent version inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, n_batch_train): timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // n_batch_train) end = start + n_batch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep)) else: # recurrent version assert self.n_envs % self.nminibatches == 0 envinds = np.arange(self.n_envs) flatinds = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps) envsperbatch = n_batch_train // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(envinds) for start in range(0, self.n_envs, envsperbatch): timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envsperbatch) end = start + envsperbatch mb_env_inds = envinds[start:end] mb_flat_inds = flatinds[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep, writer=writer, states=mb_states)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, update * (self.n_batch + 1)) if callback is not None: callback(locals(), globals()) if self.verbose >= 1 and ((update + 1) % log_interval//100 == 0 or update == 0): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", (update + 1) * self.n_steps) logger.logkv("nupdates", (update + 1)) logger.logkv("total_timesteps", (update + 1) * self.n_batch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() return self
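# Both branches above anneal the learning rate and clip range through
# frac = 1 - update / (nupdates + 1). get_schedule_fn only has to turn a constant
# into a callable of that fraction; a sketch assuming just the
# constant-or-callable behaviour:
def get_schedule_fn(value_schedule):
    """Accept either a constant or a callable of the remaining-progress fraction."""
    if callable(value_schedule):
        return value_schedule
    return lambda frac: float(value_schedule)

# usage sketch:
# lr_fn = get_schedule_fn(2.5e-4)
# frac = 1.0 - (update / (nupdates + 1))
# lr_now = lr_fn(frac)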
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name=None, reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) tb_log_name = self.model_name with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() n_updates = total_timesteps // self.n_batch n_checkpoints = self.checkpoint_inc // self.n_batch for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) self.num_timesteps += self.n_batch ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) 
logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'ep_len_mean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break for ep_info in ep_infos: self.plot_l.append(ep_info['l']) self.plot_r.append(ep_info['r']) self.plot_t.append(ep_info['t']) if update % n_checkpoints == 0: checkpoint = 'model-' + str(self.num_timesteps) with open(self.checkpoint_log, mode='a') as employee_file: employee_writer = csv.writer(employee_file) employee_writer.writerow([checkpoint]) graph_name_csv = self.graph_name + '-' + str( self.num_timesteps) + '.csv' graph_name_sb = self.graph_name + '-' + str( self.num_timesteps) + '.pkl' self.save(graph_name_sb) with open(graph_name_csv, mode='w') as employee_file: employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) employee_writer.writerow(['r', 'l', 't']) for i in range(len(self.plot_l)): employee_writer.writerow([ self.plot_r[i], self.plot_l[i], self.plot_t[i] ]) pydrive_util.upload_file(self.drive, graph_name_sb) pydrive_util.upload_file(self.drive, graph_name_csv) return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None, learning_curve=False, test_t=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] self.cumul_reward = [0.0] episode_successes = [] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) # variables for test eval ## test_step = test_t * 3 test_results = {'sum': []} test_ts = [] for _ in range(total_timesteps): ## Test eval period ## if learning_curve and _ % test_step == 0 and _ > 0: print("--> Simulating test period") self.env.reset() test_r = 0.0 for i in range(test_t): feasible_actions = AllocationEnv.get_feasible_actions( obs["board_config"]) action_mask = AllocationEnv.get_action_mask( feasible_actions, self.env.action_space.n) action, _states = self.predict(obs, mask=action_mask) action = AllocationEnv.check_action( obs['board_config'], action) obs, rewards, dones, info = self.env.step(action) test_r += rewards test_results["sum"].append(test_r) test_ts.append(_) self.env.reset() # plot test eval progress plt.plot(test_ts, test_results["sum"]) # plt.errorbar(iteration_cuts, results["mean"], yerr=results["std"], fmt='.k') plt.xlabel("Iteration count") plt.ylabel("Total (sum) test reward") plt.savefig("figs/rl-learning-curve-{}.pdf".format( cfg.vals['prj_name'])) plt.clf() plt.close() # write test eval progress write_results = {} for k, v in test_results.items(): write_results[k] = serialize_floats(v) with open( "output/rl-learning-curve-{}.json".format( cfg.vals['prj_name']), 'w') as f: json.dump(write_results, f) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. 
- self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True feasible_actions = AllocationEnv.get_feasible_actions( obs["board_config"]) action_mask = AllocationEnv.get_action_mask( feasible_actions, self.action_space.n) with self.sess.as_default(): action = self.act(State.get_vec_observation(obs)[None], update_eps=update_eps, **kwargs, mask=action_mask)[0] reset = False # CHECK IF ACTIONS IS FEASIBLE action = AllocationEnv.check_action(obs['board_config'], action) env_action = action new_obs, rew, done, info = self.env.step(env_action) print("action: {} - reward: {} - eps: {:.4}".format( action, rew, update_eps)) print(new_obs['day_vec']) print(new_obs['board_config']) # Store transition in the replay buffer. self.replay_buffer.add(State.get_vec_observation(obs), action, rew, State.get_vec_observation(new_obs), float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew self.cumul_reward.append(self.cumul_reward[-1] + rew) if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() print('timestamp: {}'.format(self.num_timesteps, end='\r\n')) self.num_timesteps += 1 return self
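# The parameter-noise branch above picks the KL threshold so that perturbed-policy
# exploration roughly matches eps-greedy with the current epsilon (Plappert et al.,
# 2017, Appendix C.1). The same formula, pulled out as a function for clarity:
import numpy as np

def param_noise_threshold(eps, n_actions):
    """Threshold making parameter-space noise comparable to eps-greedy exploration."""
    return -np.log(1.0 - eps + eps / float(n_actions))

# e.g. param_noise_threshold(self.exploration.value(self.num_timesteps), self.env.action_space.n)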
def test_hard(tmpdir):
    logger.configure(tmpdir)

    # Part One: Test logging outside of the accumulating scope, and within scopes
    # with two different logging keys (including a repeat).
    sb_logger.logkv("no_context", 1)

    with logger.accumulate_means("disc"):
        sb_logger.logkv("C", 2)
        sb_logger.logkv("D", 2)
        sb_logger.dumpkvs()
        sb_logger.logkv("C", 4)
        sb_logger.dumpkvs()

    with logger.accumulate_means("gen"):
        sb_logger.logkv("E", 2)
        sb_logger.dumpkvs()
        sb_logger.logkv("E", 0)
        sb_logger.dumpkvs()

    with logger.accumulate_means("disc"):
        sb_logger.logkv("C", 3)
        sb_logger.dumpkvs()

    sb_logger.dumpkvs()  # Writes 1 mean each from "gen" and "disc".

    expect_raw_gen = {"raw/gen/E": [2, 0]}
    expect_raw_disc = {
        "raw/disc/C": [2, 4, 3],
        "raw/disc/D": [2, '', ''],
    }
    expect_default = {
        "mean/gen/E": [1],
        "mean/disc/C": [3],
        "mean/disc/D": [2],
        "no_context": [1],
    }

    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect_default)
    _compare_csv_lines(osp.join(tmpdir, "raw", "gen", "progress.csv"), expect_raw_gen)
    _compare_csv_lines(osp.join(tmpdir, "raw", "disc", "progress.csv"), expect_raw_disc)

    # Part Two:
    # Check that we append to the same logs after the first dump to "means/*".
    with logger.accumulate_means("disc"):
        sb_logger.logkv("D", 100)
        sb_logger.dumpkvs()

    sb_logger.logkv("no_context", 2)
    sb_logger.dumpkvs()  # Writes 1 mean from "disc". "gen" is blank.

    expect_raw_gen = {"raw/gen/E": [2, 0]}
    expect_raw_disc = {
        "raw/disc/C": [2, 4, 3, ''],
        "raw/disc/D": [2, '', '', 100],
    }
    expect_default = {
        "mean/gen/E": [1, ''],
        "mean/disc/C": [3, ''],
        "mean/disc/D": [2, 100],
        "no_context": [1, 2],
    }

    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect_default)
    _compare_csv_lines(osp.join(tmpdir, "raw", "gen", "progress.csv"), expect_raw_gen)
    _compare_csv_lines(osp.join(tmpdir, "raw", "disc", "progress.csv"), expect_raw_disc)
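# _compare_csv_lines is defined elsewhere in the test module. A hypothetical
# implementation consistent with how it is used above (column-wise comparison of
# progress.csv against the expected dict) might look like:
import csv

def _coerce(value):
    """Hypothetical: compare numerically when possible, keep '' and other strings as-is."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return value

def _compare_csv_lines(csv_path, expect):
    with open(csv_path, newline='') as f:
        rows = list(csv.DictReader(f))
    for key, expected in expect.items():
        actual = [_coerce(row.get(key, '')) for row in rows]
        assert actual == [_coerce(v) for v in expected], (key, actual, expected)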
def learn(self, total_timesteps, callback=None, seed=None, log_interval=2000, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose):#, TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ #as writer: #self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) frac = 0 t_k = 0 start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] #add = 0 for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() else: action = self.policy_tf.step(obs[None], deterministic=True).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape #print("action", action) new_obs, reward, done, info = self.env.step(rescaled_action) #print("new obs", new_obs) # Store transition in the replay buffer. 
#if add < 1: #for _ in range(self.batch_size): self.replay_buffer.add(obs, action, reward, new_obs, float(done)) #add += 1 obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) #if writer is not None: # Write reward per episode to tensorboard #ep_reward = np.array([reward]).reshape((1, -1)) #ep_done = np.array([done]).reshape((1, -1)) #self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, # ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(1): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break #print("training now........") n_updates += 1 # Compute current learning_rate t_k = self.klconst #* np.sqrt(step / total_timesteps) frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) #t = time.process_time() mb_infos_vals.append(self._train_step(step, current_lr, t_k)) #print("time taken", time.process_time() - t) #print("n updates", n_updates) #if n_updates > 10: # break # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if step % self.gradient_steps == 0: self.sess.run(self.assign_policy_op) episode_rewards[-1] += reward if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and step % log_interval == 0: #done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("ent_coef", 1-frac) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.logkv("frac", frac) logger.logkv("t_k", t_k) logger.logkv("steps", step) logger.dumpkvs() # Reset infos: infos_values = [] return self
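# Note that rescaled_action = action * np.abs(self.action_space.low) only
# recovers the environment's bounds when the Box is symmetric around zero. A
# general affine unscaling from [-1, 1] (the same idea as the unscale_action
# helper used in the TD3 variant further below) is a one-liner:
def unscale_action(low, high, action):
    """Map a tanh-squashed action in [-1, 1] to the box [low, high]."""
    return low + 0.5 * (action + 1.0) * (high - low)

# for a symmetric box (low == -high) this reduces to action * np.abs(low)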
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if self.num_timesteps < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step( obs, deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) 
logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() t_first_start = time.time() n_updates = total_timesteps // self.n_batch callback.on_training_start(locals(), globals()) for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0, ( "The number of minibatches (`nminibatches`) " "is not a factor of the total number of samples " "collected per rollout (`n_batch`), " "some samples won't be used.") batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) callback.on_rollout_start() # true_reward is the reward without discount rollout = self.runner.run(callback) # Unpack obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) 
logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A2C", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) t_start = time.time() callback.on_training_start(locals(), globals()) for update in range(1, total_timesteps // self.n_batch + 1): callback.on_rollout_start() # true_reward is the reward without discount rollout = self.runner.run(callback) # unpack obs, states, rewards, masks, actions, values, ep_infos, true_reward = rollout callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) _, value_loss, policy_entropy = self._train_step( obs, states, rewards, masks, actions, values, self.num_timesteps // self.n_batch, writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.dump_tabular() callback.on_training_end() return self
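# The rewards handed to _train_step by the A2C runner are n-step discounted
# returns that stop at episode boundaries. A sketch of that backward pass, in the
# spirit of the runner's discount_with_dones helper (the runner additionally
# bootstraps unfinished episodes with the last value estimate):
def discount_with_dones(rewards, dones, gamma):
    """Backward-accumulate discounted returns, resetting at terminal steps."""
    discounted = []
    ret = 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        ret = reward + gamma * ret * (1.0 - float(done))
        discounted.append(ret)
    return discounted[::-1]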
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() n_updates = 0 infos_values = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() for step in range(total_timesteps): # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step(obs[None]).flatten() # Add noise to the action, as the policy # is deterministic, this is required for exploration if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback.on_step() is False: break # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) # Note: the policy is updated less frequently than the Q functions # this is controlled by the `policy_delay` parameter mb_infos_vals.append( self._train_step(step, writer, current_lr, (step + grad_step) % self.policy_delay == 0)) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) callback.on_rollout_start() episode_rewards[-1] += reward if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'eplenmean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] callback.on_training_end() return self
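# The "mean 100 episode reward" / success-rate bookkeeping above recurs in the
# DQN and SAC variants earlier in this file. A small helper capturing that logic
# (name and signature are this sketch's, not the library's):
import numpy as np

def recent_stats(episode_rewards, episode_successes, window=100):
    """Mean reward over the last `window` finished episodes plus the recent success rate."""
    completed = episode_rewards[-(window + 1):-1]  # exclude the still-running episode
    mean_reward = -np.inf if len(completed) == 0 else round(float(np.mean(completed)), 1)
    success_rate = float(np.mean(episode_successes[-window:])) if len(episode_successes) > 0 else None
    return mean_reward, success_rate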
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) self.n_batch = self.n_envs * self.n_steps self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) # FIFO queue of the q_runner thread is closed at the end of the learn function. # As a result, it needs to be redefinied at every call with self.graph.as_default(): with tf.variable_scope( "kfac_apply", reuse=self.trained, custom_getter=tf_util.outer_scope_getter( "kfac_apply")): # Some of the variables are not in a scope when they are create # so we make a note of any previously uninitialized variables tf_vars = tf.global_variables() is_uninitialized = self.sess.run( [tf.is_variable_initialized(var) for var in tf_vars]) old_uninitialized_vars = [ v for (v, f) in zip(tf_vars, is_uninitialized) if not f ] self.train_op, self.q_runner = self.optim.apply_gradients( list(zip(self.grads_check, self.params))) # then we check for new uninitialized variables and initialize them tf_vars = tf.global_variables() is_uninitialized = self.sess.run( [tf.is_variable_initialized(var) for var in tf_vars]) new_uninitialized_vars = [ v for (v, f) in zip(tf_vars, is_uninitialized) if not f and v not in old_uninitialized_vars ] if len(new_uninitialized_vars) != 0: self.sess.run( tf.variables_initializer(new_uninitialized_vars)) self.trained = True runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma) self.episode_reward = np.zeros((self.n_envs, )) t_start = time.time() coord = tf.train.Coordinator() if self.q_runner is not None: enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True) else: enqueue_threads = [] # Training stats (when using Monitor wrapper) ep_info_buf = deque(maxlen=100) for update in range(1, total_timesteps // self.n_batch + 1): # true_reward is the reward without discount obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) policy_loss, value_loss, policy_entropy = self._train_step( obs, states, rewards, masks, actions, values, self.num_timesteps // (self.n_batch + 1), writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. 
                if callback(locals(), globals()) is False:
                    break

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, rewards)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", self.num_timesteps)
                logger.record_tabular("fps", fps)
                logger.record_tabular("policy_entropy", float(policy_entropy))
                logger.record_tabular("policy_loss", float(policy_loss))
                logger.record_tabular("value_loss", float(value_loss))
                logger.record_tabular("explained_variance", float(explained_var))
                if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                    logger.logkv('ep_reward_mean',
                                 safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('ep_len_mean',
                                 safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.dump_tabular()

            self.num_timesteps += self.n_batch + 1

        coord.request_stop()
        coord.join(enqueue_threads)

    return self
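
# --- Illustrative sketch (not part of the implementation above) -------------
# The "explained_variance" logged above measures how well the value function
# predicts the empirical returns: 1 means perfect prediction, 0 means it is no
# better than a constant, and negative values mean it is worse than that.
# A minimal numpy sketch under the usual definition 1 - Var[y - y_pred] / Var[y],
# which matches the convention of the explained_variance helper called above.
import numpy as np


def explained_variance_sketch(y_pred, y_true):
    """Return 1 - Var[y_true - y_pred] / Var[y_true] (nan if Var[y_true] == 0)."""
    var_y = np.var(y_true)
    return np.nan if var_y == 0 else 1.0 - np.var(y_true - y_pred) / var_y


# Example: a value estimate that captures most, but not all, of the return signal.
_returns = np.array([1.0, 2.0, 3.0, 4.0])
_values = np.array([1.1, 1.9, 3.2, 3.8])
print(explained_variance_sketch(_values, _returns))  # close to 1.0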
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.n_batch = self.n_envs * self.n_steps self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) # FIFO queue of the q_runner thread is closed at the end of the learn function. # As a result, it needs to be redefinied at every call with self.graph.as_default(): with tf.compat.v1.variable_scope("kfac_apply", reuse=self.trained, custom_getter=tf_util.outer_scope_getter("kfac_apply")): # Some of the variables are not in a scope when they are create # so we make a note of any previously uninitialized variables tf_vars = tf.compat.v1.global_variables() is_uninitialized = self.sess.run([tf.compat.v1.is_variable_initialized(var) for var in tf_vars]) old_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f] self.train_op, self.q_runner = self.optim.apply_gradients(list(zip(self.grads_check, self.params))) # then we check for new uninitialized variables and initialize them tf_vars = tf.compat.v1.global_variables() is_uninitialized = self.sess.run([tf.compat.v1.is_variable_initialized(var) for var in tf_vars]) new_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f and v not in old_uninitialized_vars] if len(new_uninitialized_vars) != 0: self.sess.run(tf.compat.v1.variables_initializer(new_uninitialized_vars)) self.trained = True t_start = time.time() coord = tf.train.Coordinator() if self.q_runner is not None: enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True) else: enqueue_threads = [] callback.on_training_start(locals(), globals()) for update in range(1, total_timesteps // self.n_batch + 1): callback.on_rollout_start() # pytype:disable=bad-unpacking # true_reward is the reward without discount if isinstance(self.runner, PPO2Runner): # We are using GAE rollout = self.runner.run(callback) obs, returns, masks, actions, values, _, states, ep_infos, true_reward = rollout else: rollout = self.runner.run(callback) obs, states, returns, masks, actions, values, ep_infos, true_reward = rollout # pytype:enable=bad-unpacking callback.update_locals(locals()) callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) policy_loss, value_loss, policy_entropy = self._train_step(obs, states, returns, masks, actions, values, self.num_timesteps // (self.n_batch + 1), writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", 
                                      float(explained_var))
                if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                    logger.logkv('ep_reward_mean',
                                 safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                    logger.logkv('ep_len_mean',
                                 safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                logger.dump_tabular()

        coord.request_stop()
        coord.join(enqueue_threads)

        callback.on_training_end()
    return self
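
# --- Illustrative sketch (not part of the implementation above) -------------
# The KFAC block above snapshots which global variables are still uninitialized
# before optim.apply_gradients() creates its extra slots, then initializes only
# the variables that appeared afterwards, leaving already-trained weights
# untouched. A minimal tf.compat.v1 sketch of that pattern; build_new_ops is a
# hypothetical stand-in for whatever graph construction creates new variables.
import tensorflow as tf


def init_only_new_variables(sess, build_new_ops):
    """Run build_new_ops() and initialize only the variables it created."""
    tf_vars = tf.compat.v1.global_variables()
    initialized = sess.run([tf.compat.v1.is_variable_initialized(v) for v in tf_vars])
    old_uninitialized = {v for v, flag in zip(tf_vars, initialized) if not flag}

    result = build_new_ops()  # e.g. optimizer.apply_gradients(...)

    tf_vars = tf.compat.v1.global_variables()
    initialized = sess.run([tf.compat.v1.is_variable_initialized(v) for v in tf_vars])
    new_uninitialized = [v for v, flag in zip(tf_vars, initialized)
                         if not flag and v not in old_uninitialized]
    if new_uninitialized:
        sess.run(tf.compat.v1.variables_initializer(new_uninitialized))
    return result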
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", eval_every_n=5, reset_num_timesteps=True, record_video=False, log_dir=""): # Define model saving variables # Current Iteration is basically the update # Initialise variable _savediter = 0 _counter = (200 // eval_every_n) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch for update in range(1, nupdates + 1): # Do the following except keyboard interrupt the learning process. try: if update % eval_every_n == 1: print("[RAISIM_GYM] Visualizing in RaiSimOgre") obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = \ runner.run(test_mode=True, record_video=record_video, video_name=log_dir+"/"+str(update-1)+".mp4") # print("Average rewards in this test episode ", ep_infos[0]['r']) # model_name = log_dir + "_Iteration_{}".format(update-1) # self.save(model_name) print("Saving video " + log_dir + "/" + str(update - 1) + ".mp4") # tensorboard_log(logger, ep_infos, self.sess) assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep)) self.num_timesteps += (self.n_batch * self.noptepochs ) // batch_size * update_fac else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel( ) slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, 
update=timestep, writer=writer, states=mb_states)) self.num_timesteps += (self.n_envs * self.noptepochs ) // envs_per_batch * update_fac loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) # Verbose just mean that it will show you the logger on the terminal screen. if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'ep_len_mean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break except KeyboardInterrupt: print( "You have stopped the learning process by keyboard interrupt. Model Parameter is saved. \n" ) # You can actually save files using the instance of self. save the model parameters. self.save(log_dir + "_Iteration_{}".format(update)) sys.exit() return self
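
# --- Illustrative sketch (not part of the implementation above) -------------
# The PPO2 loop above optimizes each rollout for noptepochs epochs, reshuffling
# the n_batch sample indices every epoch and slicing all rollout arrays with the
# same minibatch indices so they stay aligned. A minimal numpy sketch of that
# non-recurrent minibatching scheme; train_on_minibatch is a hypothetical
# stand-in for self._train_step.
import numpy as np


def ppo_minibatch_epochs_sketch(rollout_arrays, n_batch, nminibatches, noptepochs,
                                train_on_minibatch):
    """Shuffle indices each epoch and feed aligned minibatch slices to the trainer."""
    assert n_batch % nminibatches == 0
    batch_size = n_batch // nminibatches
    inds = np.arange(n_batch)
    losses = []
    for _ in range(noptepochs):
        np.random.shuffle(inds)
        for start in range(0, n_batch, batch_size):
            mbinds = inds[start:start + batch_size]
            # Every array (obs, returns, masks, actions, values, neglogpacs, ...)
            # is sliced with the same indices so the samples stay aligned.
            slices = tuple(arr[mbinds] for arr in rollout_arrays)
            losses.append(train_on_minibatch(*slices))
    return losses


# Example with dummy data: 8 samples, 2 minibatches, 3 epochs -> 6 updates.
_obs = np.random.randn(8, 4)
_returns = np.random.randn(8)
_out = ppo_minibatch_epochs_sketch((_obs, _returns), 8, 2, 3,
                                   lambda o, r: float(r.mean()))
print(len(_out))  # 6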
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch for update in range(1, nupdates + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep)) self.num_timesteps += (self.n_batch * self.noptepochs ) // batch_size * update_fac else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, update=timestep, writer=writer, states=mb_states)) self.num_timesteps += (self.n_envs * self.noptepochs ) // envs_per_batch * update_fac loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", 
                             float(explained_var))
                if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                    logger.logkv('ep_reward_mean',
                                 safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('ep_len_mean',
                                 safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

            if callback is not None:
                # Only stop training if return value is False, not when it is None.
                # This is for backwards compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

    return self
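
# --- Illustrative sketch (not part of the implementation above) -------------
# The PPO2 loop above anneals both the learning rate and the clip range by
# evaluating a schedule at frac = 1.0 - (update - 1) / nupdates, so frac runs
# from 1.0 on the first update towards 0.0 on the last. A minimal sketch of the
# "constant or callable" convention behind get_schedule_fn, reimplemented here
# for illustration only.
def as_schedule_sketch(value_or_fn):
    """Wrap a constant into a schedule; pass callables through unchanged."""
    if callable(value_or_fn):
        return value_or_fn
    return lambda frac: value_or_fn


def annealed_value_sketch(schedule, update, nupdates):
    """Evaluate a schedule at the remaining-progress fraction used above."""
    frac = 1.0 - (update - 1.0) / nupdates
    return schedule(frac)


# Example: a learning rate that decays linearly from 2.5e-4 towards 0.
_lr_schedule = as_schedule_sketch(lambda frac: 2.5e-4 * frac)
print(annealed_value_sketch(_lr_schedule, update=1, nupdates=100))    # 2.5e-4
print(annealed_value_sketch(_lr_schedule, update=100, nupdates=100))  # 2.5e-6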
def learn( self, total_timesteps, rollout_params, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None, train_plots=None, train_plots_path=None, rollout_timesteps=None, ): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) self.rollout_values = [[] for i in range(len(rollout_params["perc"]))] with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0, ) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert (not self.prioritized_replay ), "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps, ) rewardPer1000T = [] episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): if rollout_timesteps is None or total_timesteps - _ < rollout_timesteps: if _ % 1000 == 0 and _ != 0 or _ == total_timesteps - 1: print("computing rollout values") for k, l in enumerate(rollout_params["perc"]): self.rollout_values[k].append( testExpert( paths=[ "Fish/Guppy/validationData/" + elem for elem in os.listdir( "Fish/Guppy/validationData") ], model=self, env=self.env, exp_turn_fraction=rollout_params[ "exp_turn_fraction"], exp_speed=rollout_params["exp_min_dist"], perc=l, deterministic=True, )) if _ % 1000 == 0 and _ != 0: print("timestep", _, "finished") obs = self.env.reset() if not train_plots is None: if _ % train_plots == 0: testModel_(self, train_plots_path, rollout_params, _) # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0.0 else: update_eps = 0.0 # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. 
update_param_noise_threshold = -np.log( 1.0 - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs["reset"] = reset kwargs[ "update_param_noise_threshold"] = update_param_noise_threshold kwargs["update_param_noise_scale"] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action # check if next state (after action) would be outside of fish tank (CLIPPING) env_state = self.env.get_state() turn_dist = self.env.action(env_action) global_turn = env_state[0][2] + turn_dist[0] coords = [ env_state[0][0] + turn_dist[1] * np.cos(global_turn), env_state[0][1] + turn_dist[1] * np.sin(global_turn), ] changed = False if coords[0] < -0.49: coords[0] = -0.47 changed = True elif coords[0] > 0.49: coords[0] = 0.47 changed = True if coords[1] < -0.49: coords[1] = -0.47 changed = True elif coords[1] > 0.49: coords[1] = 0.47 changed = True if changed: diff = coords - env_state[0, :2] speed = np.linalg.norm(diff) angles = np.arctan2(diff[1], diff[0]) turn = angles - env_state[0, 2] turn = turn - 2 * np.pi if turn > np.pi else turn turn = turn + 2 * np.pi if turn < -np.pi else turn # convert to DQN output dist_turn = np.abs(self.env.turn_rate_bins - turn) dist_speed = np.abs(self.env.speed_bins - speed) # convert to bins turn = np.argmin(dist_turn, axis=0) speed = np.argmin(dist_speed, axis=0) env_action = action = turn * len( self.env.speed_bins) + speed reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer, but change reward to 0 (use it for plot later though) self.replay_buffer.add(obs_, action, 0, new_obs_, float(done)) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get("is_success") if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if (can_sample and self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0): callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
# pytype:disable=bad-unpacking if self.prioritized_replay: assert ( self.beta_schedule is not None ), "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps), env=self._vec_normalize_env, ) ( obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes, ) = experience else: ( obses_t, actions, rewards, obses_tp1, dones, ) = self.replay_buffer.sample( self.batch_size, env=self._vec_normalize_env) # also sample from expert buffer ( obses_t_exp, actions_exp, rewards_exp, obses_tp1_exp, dones_exp, ) = self.expert_buffer.sample( self.batch_size, env=self._vec_normalize_env) weights, batch_idxes = np.ones_like(rewards), None weights_exp, batch_idxes_exp = np.ones_like( rewards_exp), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( np.append(obses_t, obses_t_exp, axis=0), np.append(actions, actions_exp.flatten(), axis=0), np.append(rewards, rewards_exp.flatten(), axis=0), np.append(obses_tp1, obses_tp1_exp, axis=0), np.append(obses_tp1, obses_tp1_exp, axis=0), np.append(dones.flatten(), dones_exp.flatten(), axis=0), np.append(weights, weights_exp), sess=self.sess, options=run_options, run_metadata=run_metadata, ) writer.add_run_metadata( run_metadata, "step%d" % self.num_timesteps) else: summary, td_errors = self._train_step( np.append(obses_t, obses_t_exp, axis=0), np.append(actions, actions_exp.flatten(), axis=0), np.append(rewards, rewards_exp.flatten(), axis=0), np.append(obses_tp1, obses_tp1_exp, axis=0), np.append(obses_tp1, obses_tp1_exp, axis=0), np.append(dones.flatten(), dones_exp.flatten(), axis=0), np.append(weights, weights_exp), sess=self.sess, options=run_options, run_metadata=run_metadata, ) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step( np.append(obses_t, obses_t_exp, axis=0), np.append(actions, actions_exp.flatten(), axis=0), np.append(rewards, rewards_exp.flatten(), axis=0), np.append(obses_tp1, obses_tp1_exp, axis=0), np.append(obses_tp1, obses_tp1_exp, axis=0), np.append(dones.flatten(), dones_exp.flatten(), axis=0), np.append(weights, weights_exp), sess=self.sess, ) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) callback.on_rollout_start() if (can_sample and self.num_timesteps > self.learning_starts and self.num_timesteps % self.target_network_update_freq == 0): # Update target network periodically. 
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if (self.verbose >= 1 and done and log_interval is not None
                        and len(episode_rewards) % log_interval == 0):
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                    logger.record_tabular("% time spent exploring",
                                          int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

        callback.on_training_end()
        return self
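
# --- Illustrative sketch (not part of the implementation above) -------------
# When the clipping above pushes the fish back inside the tank, the corrected
# continuous (turn, speed) pair has to be mapped back onto the DQN's discrete
# action set: snap each value to its nearest bin and flatten the pair into a
# single index with action = turn_bin * n_speed_bins + speed_bin. A minimal
# numpy sketch of that binning, using hypothetical example bin values.
import numpy as np


def discretize_turn_speed_sketch(turn, speed, turn_rate_bins, speed_bins):
    """Snap (turn, speed) to the nearest bins and flatten to one action index."""
    turn_bin = int(np.argmin(np.abs(turn_rate_bins - turn)))
    speed_bin = int(np.argmin(np.abs(speed_bins - speed)))
    return turn_bin * len(speed_bins) + speed_bin


# Example with hypothetical bins: 5 turn rates x 3 speeds = 15 discrete actions.
_turn_bins = np.linspace(-np.pi / 2, np.pi / 2, 5)
_speed_bins = np.array([0.01, 0.05, 0.10])
print(discretize_turn_speed_sketch(0.7, 0.04, _turn_bins, _speed_bins))  # 3 * 3 + 1 = 10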