def strip_reward_array(buffer):
    fresh_buffer = ReplayBuffer(len(buffer))
    print("Copying environment buffer: ")
    for i in range(len(buffer)):
        obs_t, action, reward, obs_tp1, done = buffer._storage[i]
        fresh_buffer.add(obs_t, action, reward[0], obs_tp1, done)
    return fresh_buffer
def init_buffer(self, fpath=None, buffer_size=None):
    with open(fpath, 'rb') as f:
        buffer_env = pickle.load(f)
    buffer_model = ReplayBuffer(buffer_size)
    print("Copying environment buffer: ")
    for i in tqdm(range(len(buffer_env))):
        obs_t, action, reward, obs_tp1, done = buffer_env._storage[i]
        buffer_model.add(obs_t, action, reward, obs_tp1, done)
    return buffer_env, buffer_model
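# The two helpers above assume a small ReplayBuffer API: a fixed-size FIFO store
# exposing add(), __len__() and an internal _storage list of
# (obs_t, action, reward, obs_tp1, done) tuples. The sketch below is a minimal,
# illustrative version of that assumed interface (modelled on the
# stable-baselines buffer), not the project's actual implementation.
import random


class MinimalReplayBuffer:
    def __init__(self, size):
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

    def __len__(self):
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        data = (obs_t, action, reward, obs_tp1, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            # overwrite the oldest entry once the buffer is full
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def sample(self, batch_size):
        idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
        return [self._storage[i] for i in idxes]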
def test_extend_uniform():
    nvals = 16
    states = [np.random.rand(2, 2) for _ in range(nvals)]
    actions = [np.random.rand(2) for _ in range(nvals)]
    rewards = [np.random.rand() for _ in range(nvals)]
    newstate = [np.random.rand(2, 2) for _ in range(nvals)]
    done = [np.random.randint(0, 2) for _ in range(nvals)]
    size = 32

    baseline = ReplayBuffer(size)
    ext = ReplayBuffer(size)

    for data in zip(states, actions, rewards, newstate, done):
        baseline.add(*data)

    states, actions, rewards, newstates, done = map(
        np.array, [states, actions, rewards, newstate, done])
    ext.extend(states, actions, rewards, newstates, done)

    assert len(baseline) == len(ext)

    # Check that both buffers hold the same values
    for i in range(nvals):
        for j in range(5):
            condition = (baseline.storage[i][j] == ext.storage[i][j])
            if isinstance(condition, np.ndarray):
                # obs and obs_tp1 are arrays
                assert np.all(condition)
            else:
                # done, reward and action are scalars
                assert condition
def main(fpath):
    train_data = pd.read_csv(fpath)
    n_products = train_data['product'].max() + 1
    n_regions = train_data['region'].max() + 1
    buffer = ReplayBuffer(size=100000)
    grouped = train_data.groupby(by='date')
    prev_state = None
    for date, chunk in grouped:
        board_config = np.zeros([n_regions, n_products])
        prev_sales = np.zeros([n_regions, n_products])
        day = chunk.iloc[0, 8]
        prev_sales_product = {}
        prev_placement_cnts = {}
        for idx, row in chunk.iterrows():
            region = row['region']
            product = row['product']
            prev_sales_product[product] = row['prev_sales']
            if row['quantity'] > 0:
                board_config[region, product] = 1.0
                if product not in prev_placement_cnts:
                    prev_placement_cnts[product] = 0
                prev_placement_cnts[product] += 1
        for p in range(n_products):
            if p not in prev_placement_cnts:
                continue
            sales = prev_sales_product[p]
            cnt = prev_placement_cnts[p]
            avg_spatial_sales = sales / cnt
            regions = board_config[:, p]
            prev_sales[:, p] = regions * avg_spatial_sales
        day_vec = State.get_day_vec(day)
        state = {
            "day_vec": day_vec,
            "prev_sales": prev_sales,
            "board_config": board_config
        }
        if prev_state is not None:
            action = state['board_config'] - prev_state['board_config']
        prev_state = state
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor( self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. 
qf1, qf2, value_fn = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) # Q(s,a) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) # Q(s, pi(a|s)) # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = -np.prod( self.env.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'): # Default initial value of ent_coef when learned init_value = 1.0 if '_' in self.ent_coef: init_value = float(self.ent_coef.split('_')[1]) assert init_value > 0., "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable( 'log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics( self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Target for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. 
v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=get_vars('model/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) values_params = get_vars('model/values_fn') source_params = get_vars("model/values_fn/vf") target_params = get_vars("target/values_fn/vf") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize( values_losses, var_list=values_params) self.infos_names = [ 'policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy' ] # All ops to call during one training step self.step_ops = [ policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op ] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize( ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += [ 'ent_coef_loss', 'ent_coef' ] self.step_ops += [ ent_coef_op, ent_coef_loss, self.ent_coef ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if ent_coef_loss is not None: tf.summary.scalar('ent_coef_loss', ent_coef_loss) tf.summary.scalar('ent_coef', self.ent_coef) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = get_vars("model") self.target_params = get_vars("target/values_fn/vf") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
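# The target-network update built above is just Polyak (exponential moving
# average) updating expressed as tf.assign ops. For reference, here is an
# illustrative NumPy sketch of the same rule; the parameter lists and tau value
# are hypothetical and only demonstrate the arithmetic.
import numpy as np


def polyak_update(target_params, source_params, tau):
    """Return target parameters moved a fraction tau towards the source parameters."""
    return [(1.0 - tau) * target + tau * source
            for target, source in zip(target_params, source_params)]


# Example: with tau = 0.005 the target network trails the online network slowly.
source = [np.array([1.0, 2.0]), np.array([[0.5]])]
target = [np.zeros(2), np.zeros((1, 1))]
target = polyak_update(target, source, tau=0.005)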
class DQN(OffPolicyRLModel): """ The DQN model class. DQN paper: https://arxiv.org/pdf/1312.5602.pdf :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) discount factor :param learning_rate: (float) learning rate for adam optimizer :param buffer_size: (int) size of the replay buffer :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is annealed :param exploration_final_eps: (float) final value of random action probability :param train_freq: (int) update the model every `train_freq` steps. set to None to disable printing :param batch_size: (int) size of a batched sampled from replay buffer for training :param checkpoint_freq: (int) how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. :param checkpoint_path: (str) replacement path used if you need to log to somewhere else than a temporary directory. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. :param prioritized_replay: (bool) if True prioritized replay buffer will be used. :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. 
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, verbose=0, tensorboard_log=None, _init_setup_model=True): # TODO: replay_buffer refactoring super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, requires_vec_env=False) self.checkpoint_path = checkpoint_path self.param_noise = param_noise self.learning_starts = learning_starts self.train_freq = train_freq self.prioritized_replay = prioritized_replay self.prioritized_replay_eps = prioritized_replay_eps self.batch_size = batch_size self.target_network_update_freq = target_network_update_freq self.checkpoint_freq = checkpoint_freq self.prioritized_replay_alpha = prioritized_replay_alpha self.prioritized_replay_beta0 = prioritized_replay_beta0 self.prioritized_replay_beta_iters = prioritized_replay_beta_iters self.exploration_final_eps = exploration_final_eps self.exploration_fraction = exploration_fraction self.buffer_size = buffer_size self.learning_rate = learning_rate self.gamma = gamma self.tensorboard_log = tensorboard_log self.graph = None self.sess = None self._train_step = None self.step_model = None self.update_target = None self.act = None self.proba_step = None self.replay_buffer = None self.beta_schedule = None self.exploration = None self.params = None self.summary = None self.episode_reward = None if _init_setup_model: self.setup_model() def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate) self.act, self._train_step, self.update_target, self.step_model = deepq.build_train( q_func=self.policy, ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess) self.proba_step = self.step_model.proba_step self.params = find_trainable_variables("deepq") # Initialize the parameters and copy them to the target network. 
tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all() def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for step in range(total_timesteps): if callback is not None: callback(locals(), globals()) # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(step) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(step) + self.exploration.value(step) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, step) episode_rewards[-1] += rew if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True if step > self.learning_starts and step % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(step)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) 
if (1 + step) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % step) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, step) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) if step > self.learning_starts and step % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", step) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(step))) logger.dump_tabular() return self def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) with self.sess.as_default(): actions, _, _ = self.step_model.step(observation, deterministic=deterministic) if not vectorized_env: actions = actions[0] return actions, None def action_probability(self, observation, state=None, mask=None): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) actions_proba = self.proba_step(observation, state, mask) if not vectorized_env: if state is not None: raise ValueError( "Error: The environment must be vectorized when using recurrent policies." 
) actions_proba = actions_proba[0] return actions_proba def save(self, save_path): # params data = { "checkpoint_path": self.checkpoint_path, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "checkpoint_freq": self.checkpoint_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "_vectorize_action": self._vectorize_action } params = self.sess.run(self.params) self._save_to_file(save_path, data=data, params=params) @classmethod def load(cls, load_path, env=None, **kwargs): data, params = cls._load_from_file(load_path) model = cls(policy=data["policy"], env=env, _init_setup_model=False) model.__dict__.update(data) model.__dict__.update(kwargs) model.set_env(env) model.setup_model() restores = [] for param, loaded_p in zip(model.params, params): restores.append(param.assign(loaded_p)) model.sess.run(restores) return model
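# A minimal usage sketch for the DQN class above, assuming a Gym CartPole
# environment and an MlpPolicy subclass of DQNPolicy (names follow the
# stable-baselines conventions this class mirrors; adjust the imports to the
# actual project layout).
import gym

env = gym.make("CartPole-v1")
model = DQN(MlpPolicy, env, learning_rate=5e-4, buffer_size=50000,
            exploration_fraction=0.1, prioritized_replay=True, verbose=1)
model.learn(total_timesteps=100000, log_interval=100)
model.save("dqn_cartpole")

# Reload later and query greedy actions.
model = DQN.load("dqn_cartpole", env=env)
obs = env.reset()
action, _ = model.predict(obs, deterministic=True)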
class DqnAtml(DQN): def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate) self.act, self._train_step, self.update_target, self.step_model = build_train_atml( q_func=partial(self.policy, **self.policy_kwargs), ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess, full_tensorboard_log=self.full_tensorboard_log) self.proba_step = self.step_model.proba_step self.params = tf_util.get_trainable_vars("deepq") # Initialize the parameters and copy them to the target network. tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all() def get_actions_vec(self, actions_prims, actions_inputs, actions_mf): with self.sess.as_default(): self.embedd_matrix = self.step_model.embedding.get_weights() invalid_action = np.zeros(self.embedd_matrix[0].shape[1]) - 1 self.embedd_matrix = np.vstack([self.embedd_matrix[0], invalid_action]) embedded_steps = self.embedd_matrix[actions_prims.astype(int)] actions_inputs = actions_inputs.reshape(len(actions_prims), -1) actions_mf = actions_mf.reshape(len(actions_prims), -1) concat_actions = np.concatenate( (embedded_steps, actions_inputs, actions_mf), axis=1) flatten_act = concat_actions.reshape(-1) return flatten_act def process_state_vec(self, obs, state_info): # transform actions representation with embeddings with self.sess.as_default(): self.embedd_matrix = self.step_model.embedding.get_weights() ind1 = state_info['grid_prims_size'] ind2 = ind1 + state_info['relations_size'] ind3 = ind2 + state_info['ff_state_size'] ind4 = ind3 + state_info['action_prims'] ind5 = ind4 + state_info['action_inputs'] ind6 = ind5 + state_info['action_mf'] cells_num = state_info['cells_num'] actions_prims = obs[ind3:ind4] actions_inputs = obs[ind4:ind5] actions_mf = obs[ind5:] flatten_act = self.get_actions_vec(actions_prims, actions_inputs, actions_mf) final_obs = np.concatenate((obs[:ind3], flatten_act)) return final_obs def hierarchical_step(self, obs, ds_rewards, cnt, kwargs, update_eps): register = False while not register: with self.sess.as_default(): action = self.predict(np.array(obs)[None])[0][0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) level = info.get('hier_level') register = info.get('register') self.actions_container.append(env_action) self.actions_weights.append(level) if rew < 0 or register: with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] if rew < 0 and not register: self.actions_container = self.actions_container[:-1] self.actions_weights = self.actions_weights[:-1] rep_action = np.zeros(self.action_space.n) rep_action[action] = 1.0 if register: if rew > 0: ds_rewards.append([cnt, rew]) cnt += 1 self.actions_container = np.array(self.actions_container) self.actions_weights = np.array( 
self.actions_weights) / level b = np.zeros( (len(self.actions_container), self.action_space.n)) b[np.arange(len(self.actions_container)), self.actions_container.astype(int)] = 1 act_replay = np.sum((self.actions_weights * b.T).T, axis=0) rep_action = act_replay / np.sum(act_replay) self.actions_container = [] self.actions_weights = [] self.replay_buffer.add(obs, rep_action, rew, new_obs, float(done)) break obs = new_obs obs = new_obs return obs, new_obs, rew, action, done, reset def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, initial_p=1.0): self.actions_weights = [] self.actions_container = [] new_tb_log = self._init_num_timesteps(reset_num_timesteps) cnt = 0 ds_rewards = [[0, 0]] with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=initial_p, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True ''' Hierarchical Step (Start) ''' obs, new_obs, rew, action, done, reset = self.hierarchical_step( obs, ds_rewards, cnt, kwargs, update_eps) ''' Hierarchical Step (End) ''' if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: try: new_priorities = np.array([ abs(x) for x in td_errors.tolist() ]) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) except AssertionError: print(td_errors) if self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 return self, ds_rewards
# TRY NOT TO MODIFY: setup the environment
env = gym.make(args.gym_id)
env.seed(args.seed)
env.action_space.np_random.seed(args.seed)
env.observation_space.np_random.seed(args.seed)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
input_shape, preprocess_obs_fn = preprocess_obs_space(env.observation_space)
output_shape, preprocess_ac_fn = preprocess_ac_space(env.action_space, stochastic=False)

# TODO: initialize agent here:
er = ReplayBuffer(args.buffer_size)


class QNetwork(nn.Module):
    def __init__(self):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(input_shape, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, output_shape)

    def forward(self, x):
        x = preprocess_obs_fn(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
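# A minimal sketch of how the QNetwork and replay buffer above are typically
# wired together in the acting loop: epsilon-greedy action selection followed by
# storing the transition. q_network and epsilon are hypothetical names
# introduced only for this illustration, and it assumes preprocess_obs_fn
# accepts a NumPy batch, as the QNetwork's forward suggests.
q_network = QNetwork()
epsilon = 0.1

obs = env.reset()
for _ in range(100):
    if random.random() < epsilon:
        action = env.action_space.sample()            # explore
    else:
        with torch.no_grad():
            q_values = q_network(np.array(obs).reshape(1, -1))
        action = int(torch.argmax(q_values, dim=1))   # exploit
    next_obs, reward, done, _ = env.step(action)
    er.add(obs, action, reward, next_obs, float(done))
    obs = env.reset() if done else next_obs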
def learn(self, total_timesteps, seed=None, tb_log_name='DQN', test_interval=1, reset_num_timesteps=True): if reset_num_timesteps: self.num_timesteps = 0 with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) self.replay_buffer = ReplayBuffer(size=self.buffer_size) self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset(train=True) best_train_score = None best_test_score = None self.reward_curve = [] for _ in range(total_timesteps): update_eps = self.exploration.value(self.num_timesteps) with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps)[0] new_obs, rew, done, _ = self.env.step(action) self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if self.num_timesteps > self.learning_starts: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights = np.ones_like(rewards) if writer is not None: if (1 + self.num_timesteps) % 100 == 0: summary, td_errors = self.train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self.train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.num_timesteps > self.learning_starts and self.num_timesteps % self.target_network_update_freq == 0: self.update_target(sess=self.sess) if done: print('-------------------------------------') print('steps | {}'.format( self.num_timesteps)) print('episodes | {}'.format( len(episode_rewards))) epsilon = int(100 * self.exploration.value(self.num_timesteps)) print('% time spent exploring | {}'.format(epsilon)) print('--') mean_100ep_reward = -np.inf if len( episode_rewards[-16:-1]) == 0 else round( float(np.mean(episode_rewards[-16:-1])), 1) self.reward_curve.append(mean_100ep_reward) print('mean 10 episode reward | {:.1f}'.format( mean_100ep_reward)) journal = self.env.sim.journal print('Total operations | {}'.format( len(self.env.sim.journal))) longs = [x for x in journal if x['Type'] == 'LONG'] shorts = [x for x in journal if x['Type'] == 'SHORT'] print('Long/Short | {}/{}'.format( len(longs), len(shorts))) print('Avg duration trades | {:.2f}'.format( np.mean([j['Trade Duration'] for j in journal]))) total_profit = sum([j['Profit'] for j in journal]) print('Total profit | {:.2f}'.format( total_profit)) print('Avg profit per trade | {:.3f}'.format( total_profit / self.env.sim.total_trades)) if epsilon <= self.exploration_final_eps * 100: if best_train_score is None or total_profit > best_train_score: self.save('saves/best_model_train.pkl') best_train_score = total_profit if self.num_timesteps % test_interval == 0: print('--') test_episode_rewards, test_longs, test_shorts, test_ave_profit_per_trade = self.test( ) print('Total profit test > {:.2f}'.format( test_episode_rewards)) print('Long/Short test > {}/{}'.format( test_longs, test_shorts)) print('Avg profit per trade test > {:.3f}'.format( test_ave_profit_per_trade)) if epsilon <= self.exploration_final_eps * 100: if best_test_score is None or test_episode_rewards > best_test_score: self.save('saves/best_model_test.pkl') best_test_score = test_episode_rewards print('-------------------------------------') obs = self.env.reset() episode_rewards.append(0.0) if self.num_timesteps + ( self.num_timesteps / 
len(episode_rewards)) >= total_timesteps: self.save('saves/final_model.pkl') break self.num_timesteps += 1 return self
def train(self, args, callback, env_kwargs=None, train_kwargs=None): env = self.makeEnv(args, env_kwargs=env_kwargs) # set hyperparameters args.__dict__.update(train_kwargs) self.cuda = th.cuda.is_available() and not args.no_cuda self.device = th.device("cuda" if self.cuda else "cpu") self.using_images = args.srl_model == "raw_pixels" assert not (args.log_states and self.using_images), "SRL logger can only be used with SRL models" if args.log_states: srl_logger = LogRLStates(args.log_dir) else: srl_logger = None self.continuous_actions = args.continuous_actions if args.continuous_actions: action_space = np.prod(env.action_space.shape) else: action_space = env.action_space.n if args.srl_model != "raw_pixels": input_dim = np.prod(env.observation_space.shape) else: n_channels = env.observation_space.shape[-1] # We use an additional CNN when using images # to extract features self.encoder_net = NatureCNN(n_channels).to(self.device) input_dim = 512 # output dim of the encoder net self.policy_net = MLPPolicy(input_dim, action_space).to(self.device) self.q_value_net = MLPQValueNetwork(input_dim, action_space, args.continuous_actions).to(self.device) self.value_net = MLPValueNetwork(input_dim).to(self.device) self.target_value_net = MLPValueNetwork(input_dim).to(self.device) # Make sure target net has the same weights as value_net hardUpdate(source=self.value_net, target=self.target_value_net) value_criterion = nn.MSELoss() q_value_criterion = nn.MSELoss() replay_buffer = ReplayBuffer(args.buffer_size) policy_optimizer = th.optim.Adam(self.policy_net.parameters(), lr=args.learning_rate) value_optimizer = th.optim.Adam(self.value_net.parameters(), lr=args.learning_rate) q_optimizer = th.optim.Adam(self.q_value_net.parameters(), lr=args.learning_rate) obs = env.reset() start_time = time.time() if srl_logger is not None: srl_logger.reset(obs, env.getOriginalObs()) for step in range(args.num_timesteps): action = self.getAction(obs[None]) new_obs, reward, done, info = env.step(action) # Log states if srl_logger is not None: srl_logger.step(new_obs, env.getOriginalObs(), action, reward, done) # Fill the replay buffer replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Callback for plotting and saving best model if callback is not None: callback(locals(), globals()) if done: obs = env.reset() if srl_logger is not None: srl_logger.reset(obs, env.getOriginalObs()) # Update the different networks for _ in range(args.gradient_steps): # Check that there is enough data in the buffer replay if step < args.batch_size: break # Sample a minibatch from the replay buffer batch_obs, actions, rewards, batch_next_obs, dones = map(lambda x: self.toFloatTensor(x), replay_buffer.sample(args.batch_size)) if self.using_images: # Extract features from the images batch_obs = self.encoder_net(channelFirst(batch_obs)) batch_next_obs = self.encoder_net(channelFirst(batch_next_obs)) rewards = rewards.unsqueeze(1) dones = dones.unsqueeze(1) value_pred = self.value_net(batch_obs) q_value = self.q_value_net(batch_obs, actions) # Sample actions and retrieve log proba # pre_tanh_value, mean_policy and log_std are only used for regularization new_actions, log_pi, pre_tanh_value, mean_policy, log_std = self.sampleAction(batch_obs) # Q-Value function loss target_value_pred = self.target_value_net(batch_next_obs) # TD error with reward scaling next_q_value = args.reward_scale * rewards + (1 - dones) * args.gamma * target_value_pred.detach() loss_q_value = 0.5 * q_value_criterion(q_value, next_q_value.detach()) # 
Value Function loss q_value_new_actions = self.q_value_net(batch_obs, new_actions) next_value = q_value_new_actions - log_pi loss_value = 0.5 * value_criterion(value_pred, next_value.detach()) # Policy Loss # why not log_pi.exp_() ? loss_policy = (log_pi * (log_pi - q_value_new_actions + value_pred).detach()).mean() # Regularization if self.continuous_actions: loss_policy += args.w_reg * sum(map(l2Loss, [mean_policy, log_std])) q_optimizer.zero_grad() # Retain graph if we are using a CNN for extracting features loss_q_value.backward(retain_graph=self.using_images) q_optimizer.step() value_optimizer.zero_grad() loss_value.backward(retain_graph=self.using_images) value_optimizer.step() policy_optimizer.zero_grad() loss_policy.backward() policy_optimizer.step() # Softly update target value_pred network softUpdate(source=self.value_net, target=self.target_value_net, factor=args.soft_update_factor) if (step + 1) % args.print_freq == 0: print("{} steps - {:.2f} FPS".format(step, step / (time.time() - start_time)))
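# hardUpdate and softUpdate are used above but not defined in this excerpt.
# Below is a plausible reconstruction consistent with how they are called
# (PyTorch modules as source/target, factor = interpolation coefficient);
# treat it as an assumption about their behaviour, not the project's actual code.
def hardUpdate(source, target):
    """Copy the source network parameters into the target network."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


def softUpdate(source, target, factor):
    """Polyak-average the target network towards the source network."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            (1.0 - factor) * target_param.data + factor * source_param.data)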
    target_loss,
    target_train_opt,
    target_saver,
    target_write_op,
    target_q_value_index,
) = build_neural_network("target_network")

# Start the training process
sess = tf.Session()
sess.run(tf.global_variables_initializer())
writer = tf.summary.FileWriter("./logs", sess.graph_def)
restore_training_variables(
    "target_network",
    backup_training_variables("q_network", sess),
    sess
)

random_actions_taken = 0
er = ReplayBuffer(50000)
episode_rewards = []
finished_episodes_count = 0
target_network_update_counter = 0
total_timesteps = 0

for i_episode in range(NUM_EPISODES):
    raw_state = env.reset()
    done = False
    episode_reward = 0
    skipping_count = 0
    for t in range(MAX_NUM_STEPS):
        total_timesteps += 1
        if SKIP_FRAMES == 0 or skipping_count == 0:
            epsilon = get_explore_rate(total_timesteps)
            target_network_update_counter += 1
            # env.render()
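# get_explore_rate is referenced above but not shown. A common choice is a
# linearly annealed epsilon; the schedule below is a hypothetical stand-in
# (EPS_START, EPS_END and EPS_DECAY_STEPS are illustrative constants, not taken
# from the original script).
EPS_START = 1.0
EPS_END = 0.05
EPS_DECAY_STEPS = 50000


def get_explore_rate(step):
    """Linearly anneal epsilon from EPS_START to EPS_END over EPS_DECAY_STEPS."""
    fraction = min(float(step) / EPS_DECAY_STEPS, 1.0)
    return EPS_START + fraction * (EPS_END - EPS_START)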
class SAC(OffPolicyRLModel): """ Soft Actor-Critic (SAC) Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor, This implementation borrows code from original implementation (https://github.com/haarnoja/sac) from OpenAI Spinning Up (https://github.com/openai/spinningup) and from the Softlearning repo (https://github.com/rail-berkeley/softlearning/) Paper: https://arxiv.org/abs/1801.01290 Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) the discount factor :param learning_rate: (float or callable) learning rate for adam optimizer, the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) :param buffer_size: (int) size of the replay buffer :param batch_size: (int) Minibatch size for each gradient update :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) :param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off. Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value) :param train_freq: (int) Update the model every `train_freq` steps. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_update_interval: (int) update the target network every `target_network_update_freq` steps. :param gradient_steps: (int) How many gradient update after each step :param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto') :param action_noise: (ActionNoise) the action noise type (None by default), this can help for hard exploration problem. Cf DDPG for the different action noise type. :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy) This is not needed for SAC normally but can help exploring when using HER + SAC. 
This hack was present in the original OpenAI Baselines repo (DDPG + HER) :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard Note: this has no effect on SAC logging for now """ def __init__(self, policy, env, args, gamma=0.99, learning_rate=3e-4, buffer_size=50000, learning_starts=200, train_freq=1, batch_size=64, tau=0.005, ent_coef='auto', target_update_interval=1, gradient_steps=1, target_entropy='auto', action_noise=None, random_exploration=0.0, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): super(SAC, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=SACPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs) self.buffer_size = buffer_size self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.tau = tau # In the original paper, same learning rate is used for all networks # self.policy_lr = learning_rate # self.qf_lr = learning_rate # self.vf_lr = learning_rate # Entropy coefficient / Entropy temperature # Inverse of the reward scale self.ent_coef = ent_coef self.target_update_interval = target_update_interval self.gradient_steps = gradient_steps self.gamma = gamma self.action_noise = action_noise self.random_exploration = random_exploration self.value_fn = None self.graph = None self.replay_buffer = None self.episode_reward = None self.sess = None self.tensorboard_log = tensorboard_log self.verbose = verbose self.params = None self.summary = None self.policy_tf = None self.target_entropy = target_entropy self.full_tensorboard_log = full_tensorboard_log self.obs_target = None self.target_policy = None self.actions_ph = None self.rewards_ph = None self.terminals_ph = None self.observations_ph = None self.action_target = None self.next_observations_ph = None self.value_target = None self.step_ops = None self.target_update_op = None self.infos_names = None self.entropy = None self.target_params = None self.learning_rate_ph = None self.processed_obs_ph = None self.processed_next_obs_ph = None self.log_ent_coef = None if _init_setup_model: self.setup_model() self.args = args self.reward_type = args.reward_type self.name = self.reward_type + '_' self.skew_explore = SkewExploreKDE(env, args) self.goal_update_frequency = 5000 def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale deterministic_action = self.deterministic_action * np.abs( self.action_space.low) return policy.obs_ph, self.actions_ph, deterministic_action def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): n_cpu = multiprocessing.cpu_count() if sys.platform == 'darwin': n_cpu //= 2 self.sess = tf_util.make_session(num_cpu=n_cpu, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, 
**self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probabilty of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor( self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. qf1, qf2, value_fn = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = -np.prod( self.env.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'): # Default initial value of ent_coef when learned init_value = 1.0 if '_' in self.ent_coef: init_value = float(self.ent_coef.split('_')[1]) assert init_value > 0., "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable( 'log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics( self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * 
tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=get_vars('model/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) values_params = get_vars('model/values_fn') source_params = get_vars("model/values_fn/vf") target_params = get_vars("target/values_fn/vf") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize( values_losses, var_list=values_params) self.infos_names = [ 'policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy' ] # All ops to call during one training step self.step_ops = [ policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op ] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize( ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += [ 'ent_coef_loss', 'ent_coef' ] self.step_ops += [ ent_coef_op, ent_coef_loss, self.ent_coef ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if ent_coef_loss is not None: tf.summary.scalar('ent_coef_loss', ent_coef_loss) tf.summary.scalar('ent_coef', self.ent_coef) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = find_trainable_variables("model") self.target_params = find_trainable_variables( "target/values_fn/vf") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() def _train_step(self, step, writer, learning_rate): # Sample a batch from the replay buffer batch = self.replay_buffer.sample(self.batch_size) batch_obs, batch_actions, 
batch_rewards, batch_next_obs, batch_dones = batch feed_dict = { self.observations_ph: batch_obs, self.actions_ph: batch_actions, self.next_observations_ph: batch_next_obs, self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: batch_dones.reshape(self.batch_size, -1), self.learning_rate_ph: learning_rate } # out = [policy_loss, qf1_loss, qf2_loss, # value_loss, qf1, qf2, value_fn, logp_pi, # self.entropy, policy_train_op, train_values_op] # Do one gradient step # and optionally compute log for tensorboard if writer is not None: out = self.sess.run([self.summary] + self.step_ops, feed_dict) summary = out.pop(0) writer.add_summary(summary, step) else: out = self.sess.run(self.step_ops, feed_dict) # Unpack to monitor losses and entropy policy_loss, qf1_loss, qf2_loss, value_loss, *values = out # qf1, qf2, value_fn, logp_pi, entropy, *_ = values entropy = values[4] if self.log_ent_coef is not None: ent_coef_loss, ent_coef = values[-2:] return policy_loss, qf1_loss, qf2_loss, value_loss, entropy, ent_coef_loss, ent_coef return policy_loss, qf1_loss, qf2_loss, value_loss, entropy def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] tra_obs = [] ep_count = 0 selected_goal = None tra_count = 0 for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. 
# Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) ################################################################# # fit density model and update goal proposing model skew_explore_obs = obs.copy() if isinstance(self.env, HERGoalEnvWrapper): skew_explore_obs_dict = self.env.convert_obs_to_dict( skew_explore_obs) skew_explore_obs = np.array( [skew_explore_obs_dict['observation']]) tra_obs.append(skew_explore_obs[0]) if selected_goal is None: selected_goal = np.array( skew_explore_obs_dict['desired_goal']) else: tra_obs.append(skew_explore_obs) self.skew_explore.update_history(skew_explore_obs, [done]) if (step % self.goal_update_frequency == 0 and step != 0) or step == 2000: logging.info('update buffer') self.skew_explore.activate_buffer() ################################################################# # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: self.plot_tra(tra_count, tra_obs, selected_goal) tra_obs = [] selected_goal = None if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() ep_count += 1 episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) tra_count += 1 self.save(self.args.save_path + '/model') if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( 
episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self def test(self, steps): # self.env.set_goals([[-0.00497384, 0.11419979, 0.32127943, 0.003, 2.5, 0.02]]) # goal = np.array([[-0.07863348, -0.00893711, 0.2746492, -0.0135142, -1.52]]) # door goal = np.array([[-1, 3.5]]) self.env.set_goals(goal) trajectory = [] obs_dict = self.env.reset() obs = np.concatenate( (obs_dict['observation'], obs_dict['achieved_goal'], obs_dict['desired_goal'])) for i in range(steps): action = self.policy_tf.step(obs[None], deterministic=True).flatten() # rescaled_action = action = self.env.action_space.sample() rescaled_action = action * np.abs(self.action_space.low) # if i > 20 and i < 30: # rescaled_action[-1] = -0.9 # if i > 40 and i < 60: # rescaled_action[-1] = 0.9 new_obs, reward, done, info = self.env.step(rescaled_action) obs_dict = new_obs obs = np.concatenate( (obs_dict['observation'], obs_dict['achieved_goal'], obs_dict['desired_goal'])) # state = self.env.sim.get_state().qpos[:8] # if state[-1] > -0.02: # state[-1] = 0 # else: # state[-1] = 1 # # print('[', state[0], ',', state[1], ',', state[2], ',', state[3], ',', state[4], ',', state[5], ',', state[6], ',', state[7], ']') # trajectory.append(state) # np.save('./trajectory', np.array(trajectory)) def plot_tra(self, t, tra_obs, sampled_goal): ## plot g_states = sampled_goal t_states = np.array(tra_obs) fig, (ax1) = plt.subplots(1, 1, figsize=(5, 5)) if self.args.env == 'maze': ax1.set_xlim([-12, 4]) ax1.set_ylim([-6, 6]) elif self.args.env == 'yumi' or self.args.env == 'yumi_box_pick': ax1.set_xlim([ self.skew_explore.x_start - 0.05, self.skew_explore.x_end + 0.05 ]) ax1.set_ylim([ self.skew_explore.y_start - 0.05, self.skew_explore.y_end + 0.05 ]) # scale = (tra_rewards - tra_rewards.min())/(tra_rewards.max() - tra_rewards.min()) ax1.scatter(t_states[:, 0], t_states[:, 1], c='g', s=5) ax1.scatter(g_states[0], g_states[1], c='r') plt.savefig(self.args.save_path + '/' + self.name + str(int(t % 20)) + '.svg') logging_info = 'save trajectory plot as: ' + self.args.save_path + '/' + self.name + str( int(t % 20)) + '.svg' logging.info(logging_info) plt.close() def action_probability(self, observation, state=None, mask=None, actions=None): if actions is None: warnings.warn( "Even though SAC has a Gaussian policy, it cannot return a distribution as it " "is squashed by a tanh before being scaled and output. Therefore 'action_probability' " "will only work with the 'actions' keyword argument being used. Returning None." ) return None observation = np.array(observation) warnings.warn( "The probability of taking a given action is exactly zero for a continuous distribution." 
"See http://blog.christianperone.com/2019/01/ for a good explanation" ) return np.zeros((observation.shape[0], 1), dtype=np.float32) def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) actions = self.policy_tf.step(observation, deterministic=deterministic) actions = actions.reshape( (-1, ) + self.action_space.shape) # reshape to the correct action shape actions = actions * np.abs( self.action_space.low) # scale the output for the prediction if not vectorized_env: actions = actions[0] return actions, None def save(self, save_path): data = { "learning_rate": self.learning_rate, "buffer_size": self.buffer_size, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "batch_size": self.batch_size, "tau": self.tau, "ent_coef": self.ent_coef if isinstance(self.ent_coef, float) else 'auto', "target_entropy": self.target_entropy, # Should we also store the replay buffer? # this may lead to high memory usage # with all transition inside # "replay_buffer": self.replay_buffer "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "action_noise": self.action_noise, "random_exploration": self.random_exploration, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } params = self.sess.run(self.params) target_params = self.sess.run(self.target_params) self._save_to_file(save_path, data=data, params=params + target_params) @classmethod def load(cls, load_path, env=None, args=None, **kwargs): data, params = cls._load_from_file(load_path) if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data[ 'policy_kwargs']: raise ValueError( "The specified policy kwargs do not equal the stored policy kwargs. " "Stored kwargs: {}, specified kwargs: {}".format( data['policy_kwargs'], kwargs['policy_kwargs'])) model = cls(policy=data["policy"], env=env, _init_setup_model=False, args=args) model.__dict__.update(data) model.__dict__.update(kwargs) # model.set_env(env) model.setup_model() restores = [] for param, loaded_p in zip(model.params + model.target_params, params): restores.append(param.assign(loaded_p)) model.sess.run(restores) return model
class CLAC(OffPolicyRLModel): """ Capacity-Limited Actor-Critic (CLAC) Off-Policy Capacity-Limited Deep Reinforcement Learning with a Stochastic Actor. This implementation borrows code from the Soft Actor-Critic implementation (https://github.com/haarnoja/sac), from OpenAI Spinning Up (https://github.com/openai/spinningup), from the Softlearning repo (https://github.com/rail-berkeley/softlearning/) and from the Stable-Baselines implementation (https://github.com/hill-a/stable-baselines/tree/master/stable_baselines/sac) Paper: In preparation for ICML 2020 :param policy: (CLACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) the discount factor :param learning_rate: (float or callable) learning rate for the Adam optimizer; the same learning rate is used for all networks (Q-Values, Actor and Value function). It can be a function of the current progress (from 1 to 0) :param buffer_size: (int) size of the replay buffer :param batch_size: (int) Minibatch size for each gradient update :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) :param mut_inf_coef: (str or float) Mutual Information regularization coefficient, controlling the performance/generalization trade-off. Set it to 'auto' to learn it automatically (still in development), and 'auto_0.1' to use 0.1 as the initial value :param train_freq: (int) Update the model every `train_freq` steps. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_update_interval: (int) update the target network every `target_network_update_freq` steps. :param gradient_steps: (int) How many gradient updates after each step :param target_entropy: (str or float) target value used when learning mut_inf_coef (mut_inf_coef = 'auto') :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard. Note: this has no effect on CLAC logging for now :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). If None (default), use a random seed. Note that if you want completely deterministic results, you must set `n_cpu_tf_sess` to 1. :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations. If None, the number of CPUs of the current machine will be used. 
""" def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=1000000, learning_rate_phi=2e-3, learning_starts=100, train_freq=1, batch_size=256, tau=0.005, mut_inf_coef='auto', target_update_interval=1, coef_schedule=None, gradient_steps=1, target_entropy='auto', verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): super(CLAC, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=CLACPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.buffer_size = buffer_size self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.tau = tau # Same learning rate is used for all networks # self.policy_lr = learning_rate # self.qf_lr = learning_rate # self.vf_lr = learning_rate self.mut_inf_coef = mut_inf_coef self.target_update_interval = target_update_interval self.gradient_steps = gradient_steps self.gamma = gamma self.coef_schedule = coef_schedule self.init_mut_inf_coef = self.mut_inf_coef # Options for MI approximation and related parameters self.learning_rate_phi = learning_rate_phi # Taken from MIRL paper, not altered self.multivariate_mean = None self.multivariate_cov = None self.value_fn = None self.graph = None self.replay_buffer = None self.episode_reward = None self.sess = None self.tensorboard_log = tensorboard_log self.verbose = verbose self.params = None self.summary = None self.policy_tf = None self.target_entropy = target_entropy self.full_tensorboard_log = full_tensorboard_log self.obs_target = None self.target_policy = None self.actions_ph = None self.rewards_ph = None self.terminals_ph = None self.observations_ph = None self.action_target = None self.next_observations_ph = None self.value_target = None self.step_ops = None self.target_update_op = None self.infos_names = None self.entropy = None self.target_params = None self.learning_rate_ph = None self.processed_obs_ph = None self.processed_next_obs_ph = None self.log_mut_inf_coef = None self.logp_phi = None self.logp_pi = None self.tf_logged_reward = float("-inf") self.auto_mut_inf_coef = False if not isinstance(self.mut_inf_coef, float): self.auto_mut_inf_coef = True self.action_history = None self.action_entropy = 1 if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale deterministic_action = self.deterministic_action * np.abs( self.action_space.low) return policy.obs_ph, self.actions_ph, deterministic_action def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = 
tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') # If the action space is discrete we want if (isinstance(self.env.action_space, Discrete)): self.action_history = np.zeros( (self.env.action_space.n)) self.actions_ph = tf.placeholder( tf.float32, shape=(None, self.env.action_space.n), name='actions') else: self.actions_ph = tf.placeholder( tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.logp_phi = tf.placeholder(tf.float32, shape=(None, ), name='logp_phi') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") self.mut_inf_coef_tensor = tf.placeholder( tf.float32, shape=(), name='mut_inf_coef') with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probabilty of actions taken by the policy _, policy_out, logp_pi = self.policy_tf.make_actor( self.processed_obs_ph) # self.logp_pi = logp_pi # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. qf1, qf2, value_fn = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) #phi_proba, log_phi_proba = self.policy_tf.make_marginal() # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = np.prod( self.env.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # Automatic mutual information coefficient setting is not fully tested if isinstance( self.mut_inf_coef, str) and self.mut_inf_coef.startswith('auto'): # Default initial value of mut_inf_coef when learned init_value = 1.0 if '_' in self.mut_inf_coef: init_value = float(self.mut_inf_coef.split('_')[1]) assert init_value > 0., "The initial value of mut_inf_coef must be greater than 0" self.log_mut_inf_coef = tf.get_variable( 'log_mut_inf_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.mut_inf_coef = tf.exp(self.log_mut_inf_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.mut_inf_coef = float(self.mut_inf_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics( self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned mut_inf_coef_loss, entropy_optimizer = None, None if not 
isinstance(self.mut_inf_coef, float): mut_inf_coef_loss = -tf.reduce_mean( # self.log_mut_inf_coef * tf.stop_gradient(logp_pi + self.target_entropy)) # self.log_mut_inf_coef * tf.stop_gradient((-1 * (self.logp_phi - logp_pi)) - self.target_entropy)) self.log_mut_inf_coef * tf.stop_gradient(self.logp_phi - logp_pi - self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) #policy_kl_loss = tf.reduce_mean(self.mut_inf_coef * logp_pi - qf1_pi) policy_kl_loss = tf.reduce_mean( (-1 * self.mut_inf_coef_tensor * (self.logp_phi - logp_pi)) - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. # v_backup = tf.stop_gradient(min_qf_pi - self.mut_inf_coef * logp_pi) # previous tests # v_backup = tf.stop_gradient(min_qf_pi - self.mut_inf_coef * (self.logp_phi - logp_pi)) # Minimzing mutual information v_backup = tf.stop_gradient(min_qf_pi + (self.mut_inf_coef_tensor * (self.logp_phi - logp_pi))) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2) values_losses = qf1_loss + qf2_loss + value_loss discrete_loss = policy_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) if (isinstance(self.env.action_space, Discrete)): policy_train_op = policy_optimizer.minimize( discrete_loss, var_list=get_vars('model/pi')) else: policy_train_op = policy_optimizer.minimize( policy_loss, var_list=get_vars('model/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) values_params = get_vars('model/values_fn') source_params = get_vars("model/values_fn/vf") target_params = get_vars("target/values_fn/vf") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize( values_losses, var_list=values_params) self.infos_names = [ 'policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy', 'mut_inf_coef_loss', 'log_policy', 'log_marginal' ] # All ops to call during one training step self.step_ops = [ policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op ] #, phi_train_op] # Add entropy coefficient optimization operation if needed if mut_inf_coef_loss is not None: with tf.control_dependencies([train_values_op]): mut_inf_coef_op = entropy_optimizer.minimize( mut_inf_coef_loss, var_list=self.log_mut_inf_coef) self.infos_names += ['mut_inf_coef'] self.step_ops += [ mut_inf_coef_op, mut_inf_coef_loss, self.mut_inf_coef ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', 
policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if mut_inf_coef_loss is not None: tf.summary.scalar('mut_inf_coef_loss', mut_inf_coef_loss) tf.summary.scalar('mut_inf_coef', self.mut_inf_coef) tf.summary.scalar('log_policy', tf.reduce_mean(logp_pi)) tf.summary.scalar('log_marginal', tf.reduce_mean(self.logp_phi)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) tf.summary.scalar('episode_reward', self.tf_logged_reward) # Retrieve parameters that must be saved self.params = get_vars("model") self.target_params = get_vars("target/values_fn/vf") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() def _train_step(self, step, writer, learning_rate): # Sample a batch from the replay buffer batch = self.replay_buffer.sample(self.batch_size) batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch if (isinstance(self.env.action_space, Discrete)): batch_actions = batch_actions.reshape(self.batch_size, self.env.action_space.n) else: batch_actions = batch_actions.reshape( self.batch_size, self.env.action_space.shape[0]) # Determine the logp_phi based on the current batch: if (isinstance(self.env.action_space, Discrete)): assert (False) # Not implemented # not correct for discrete actions action_count = [ np.count_nonzero(batch_actions == action) for action in batch_actions ] action_count = action_count / len(batch_actions) # assert all values are percentages in: action_count logp_phi = np.log(action_count) else: EPS = 1e-6 # Avoid NaN (prevents division by zero or log of zero) #mu = np.mean(batch_actions,axis=0) #cov = np.cov(batch_actions, rowvar=False) + (np.identity(self.env.action_space.shape[0]) * EPS) mu = self.multivariate_mean cov = self.multivariate_cov if (len(mu) == 1): mu = mu[0] try: multivar = multivariate_normal(mu, cov) logp_phi = multivar.logpdf(batch_actions) # * -1 logp_phi = logp_phi.reshape(self.batch_size, ) except: # Mutual infomration coefficient is too small to contribute anything logp_phi = np.zeros(self.batch_size, ) mut_inf_coef = self.mut_inf_coef # If coinrunner environment #batch_obs = np.squeeze(batch_obs, axis=1) #batch_next_obs = np.squeeze(batch_next_obs, axis=1) feed_dict = { self.observations_ph: batch_obs, self.actions_ph: batch_actions, self.next_observations_ph: batch_next_obs, self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: batch_dones.reshape(self.batch_size, -1), self.learning_rate_ph: learning_rate, self.logp_phi: logp_phi, self.mut_inf_coef_tensor: mut_inf_coef } # out = [policy_loss, qf1_loss, qf2_loss, # value_loss, qf1, qf2, value_fn, logp_pi, # self.entropy, policy_train_op, train_values_op] # Do one gradient step # and optionally compute log for tensorboard if writer is not None: out = self.sess.run([self.summary] + self.step_ops, feed_dict) summary = out.pop(0) writer.add_summary(summary, step) else: out = self.sess.run(self.step_ops, feed_dict) # Unpack to monitor losses and entropy policy_loss, qf1_loss, qf2_loss, value_loss, *values = out #qf1, qf2, value_fn, logp_pi, entropy, *_ = values entropy = values[4] if self.log_mut_inf_coef is not None: mut_inf_coef_loss, mut_inf_coef = values[-2:] return policy_loss, qf1_loss, qf2_loss, value_loss, entropy, mut_inf_coef_loss, mut_inf_coef return policy_loss, 
qf1_loss, qf2_loss, value_loss, entropy def sample(self, num_samples=1000): samples = [[], [], [], [], []] for state in range(self.observation_space.n): mean = [] for _ in range(num_samples): action = (self.predict(state)[0][0] - self.action_space.low) / (self.action_space.high - self.action_space.low)[0] mean.append(action[0]) samples[state].append(np.mean(mean)) return samples def run(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="CLAC", reset_num_timesteps=True, randomization=0): start_time = time.time() episode_rewards = [0.0] learning_results = pd.DataFrame() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] reward_data = pd.DataFrame() for step in range(total_timesteps): if (isinstance(self.env.action_space, Discrete)): actions = list(range(self.env.action_space.n)) action = self.policy_tf.step(obs[None], deterministic=False).flatten() rescaled_action = np.random.choice(actions, 1, p=action)[0] else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) new_obs, reward, done, info = self.env.step(rescaled_action) act_mu, act_std = self.policy_tf.proba_step(obs[None]) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper # info = info[0] maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() if (randomization == 1): try: for env in self.env.unwrapped.envs: env.randomize() except: print( "Trying to randomize an environment that is not set up for randomization, check environment file" ) assert (False) if (randomization == 2): try: for env in self.env.unwrapped.envs: env.randomize_extreme() except: print( "Trying to extremely randomize an environment that is not set up for randomization, check environment file" ) assert (False) Model_String = "CLAC" if not self.auto_mut_inf_coef: Model_String = "CLAC " + str(self.init_mut_inf_coef) env_name = self.env.unwrapped.envs[0].spec.id mut_inf_coef = self.init_mut_inf_coef if (type(self.mut_inf_coef) == tf.Tensor or np.isnan(mut_inf_coef)): mut_inf_coef = "auto" Model_String = "CLAC" + str(mut_inf_coef) d = { 'Episode Reward': episode_rewards[-1], 'Coefficient': mut_inf_coef, 'Timestep': self.num_timesteps, 'Episode Number': len(episode_rewards) - 1, 'Env': env_name, 'Randomization': randomization, 'Model': "CLAC" } learning_results = learning_results.append(d, ignore_index=True) self.tf_logged_reward = episode_rewards[-1] episode_rewards.append(0.0) return (self, learning_results) def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="CLAC", reset_num_timesteps=True, randomization=0): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() 
episode_rewards = [0.0] learning_results = pd.DataFrame() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] reward_data = pd.DataFrame() for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if self.num_timesteps < self.learning_starts: if (isinstance(self.env.action_space, Discrete)): action = [] for _ in range(self.env.action_space.n): action.append(1 / self.env.action_space.n) rescaled_action = self.env.action_space.sample() else: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: if (isinstance(self.env.action_space, Discrete)): actions = list(range(self.env.action_space.n)) action = self.policy_tf.step( obs[None], deterministic=False).flatten() rescaled_action = np.random.choice(actions, 1, p=action)[0] else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs( self.action_space.low) if (not isinstance(self.env.action_space, Discrete)): assert action.shape == self.env.action_space.shape # If coinrunner environment # rescaled_action = np.array(rescaled_action, ndmin=1) new_obs, reward, done, info = self.env.step(rescaled_action) act_mu, act_std = self.policy_tf.proba_step(obs[None]) if (len(act_std) == 1): act_std = act_std[0] #print("ACT MU FROM PROBA STEP", act_mu) #print("ACT STD FROM PROBA STEP", act_std) if self.num_timesteps > self.learning_starts: # Only update marginal approximation after learning starts is completed if (self.multivariate_mean is None): self.multivariate_mean = act_mu else: previous_mean = self.multivariate_mean self.multivariate_mean = ( (1 - self.learning_rate_phi) * self.multivariate_mean) + (self.learning_rate_phi * act_mu) if (self.multivariate_cov is None): self.multivariate_cov = np.diag(act_std) else: cov = (self.learning_rate_phi * np.diag(act_std) + (1 - self.learning_rate_phi) * self.multivariate_cov) mom_1 = (self.learning_rate_phi * np.square(np.diag(act_mu))) + ( (1 - self.learning_rate_phi) * np.square(np.diag(previous_mean))) mom_2 = np.square((self.learning_rate_phi * np.diag(act_mu)) + (1 - self.learning_rate_phi) * np.diag(previous_mean)) self.multivariate_cov = cov + mom_1 - mom_2 # Update Beta parameter if coef_schedule is set if (self.coef_schedule is not None and self.mut_inf_coef > 1e-12): # (1 - a) B + a(1/L()) # Loss based update schdule, for later # Currently using linear schedule: self.mut_inf_coef *= (1 - self.coef_schedule) """if(self.num_timesteps % 1000 == 0): print("updated mut_inf_coef: ", self.mut_inf_coef, " at time step ", self.num_timesteps)""" # Store transition in the replay buffer. 
#print("adding action to replay buffer: ", action) self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper # info = info[0] maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: for mb_info_val in mb_infos_vals: for mb_info in mb_info_val: if mb_info is not None: infos_values.append(np.mean(mb_info)) #infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() if (randomization == 1): try: for env in self.env.unwrapped.envs: env.randomize() except: print( "Trying to randomize an environment that is not set up for randomization, check environment file" ) assert (False) if (randomization == 2): try: for env in self.env.unwrapped.envs: env.randomize_extreme() except: print( "Trying to extremely randomize an environment that is not set up for randomization, check environment file" ) assert (False) Model_String = "CLAC" if not self.auto_mut_inf_coef: Model_String = "CLAC " + str(self.mut_inf_coef) env_name = self.env.unwrapped.envs[0].spec.id mut_inf_coef = self.init_mut_inf_coef if (type(self.mut_inf_coef) == tf.Tensor or np.isnan(mut_inf_coef)): mut_inf_coef = "auto" Model_String = "CLAC" + str(mut_inf_coef) d = { 'Episode Reward': episode_rewards[-1], 'Coefficient': mut_inf_coef, 'Timestep': self.num_timesteps, 'Episode Number': len(episode_rewards) - 1, 'Env': env_name, 'Randomization': randomization, 'Model': "CLAC" } learning_results = learning_results.append( d, ignore_index=True) self.tf_logged_reward = episode_rewards[-1] episode_rewards.append(0.0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) 
logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return (self, learning_results) def action_probability(self, observation, state=None, mask=None, actions=None): if actions is None: warnings.warn( "Even thought CLAC has a Gaussian policy, it cannot return a distribution as it " "is squashed by an tanh before being scaled and ouputed. Therefore 'action_probability' " "will only work with the 'actions' keyword argument being used. Returning None." ) return None observation = np.array(observation) warnings.warn( "The probabilty of taken a given action is exactly zero for a continuous distribution." ) return np.zeros((observation.shape[0], 1), dtype=np.float32) def predict(self, observation, state=None, mask=None, deterministic=False): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) if (isinstance(self.env.action_space, Discrete)): # could replace this with map apply actions = [] action_distributions = self.policy_tf.step(observation, deterministic=False) available_actions = list(range(self.env.action_space.n)) for action_distribution in action_distributions: action = np.random.choice(available_actions, 1, p=action_distribution)[0] actions.append(action) else: actions = self.policy_tf.step(observation, deterministic=False) actions = actions.reshape( (-1, ) + self.action_space.shape) # reshape to the correct action shape actions = actions * np.abs( self.action_space.low) # scale the output for the prediction if not vectorized_env: actions = actions[0] return actions, None def get_parameter_list(self): return (self.params + self.target_params) def save(self, save_path, cloudpickle=False): data = { "learning_rate": self.learning_rate, "buffer_size": self.buffer_size, "learning_starts": self.learning_starts, "multivariate_mean": self.multivariate_mean, "multivariate_cov": self.multivariate_cov, "train_freq": self.train_freq, "batch_size": self.batch_size, "tau": self.tau, "mut_inf_coef": self.mut_inf_coef if isinstance(self.mut_inf_coef, float) else 'auto', "target_entropy": self.target_entropy, "num_timesteps": self.num_timesteps, #"replay_buffer": self.replay_buffer, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs, "coef_schedule": self.coef_schedule, "init_mut_inf_coef": self.init_mut_inf_coef } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
]) # The algorithms require a vectorized environment to run model = DQN(MlpPolicy, env, verbose=2, learning_starts=LEARNING_START, gamma=.2, exploration_fraction=0.35, exploration_final_eps=0.2) model.learn(total_timesteps=TIME_STEPS, learning_curve=False, test_t=TEST_T) with open(f"../data/{store_id}-buffer-d-test.p", 'wb') as f: pickle.dump(model.replay_buffer, f) results = {'rewards': [0.0]} buffer = ReplayBuffer(size=50000) for j in range(100): obs = env.reset() for i in range(TEST_T): feasible_actions = AllocationEnv.get_feasible_actions( obs["board_config"]) action_mask = AllocationEnv.get_action_mask(feasible_actions, n_actions) action, _states = model.predict(obs, mask=action_mask) action = AllocationEnv.check_action(obs['board_config'], action) new_obs, r, dones, info = env.step([action])
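# --- Illustration (not part of the original code) ---------------------------
# The evaluation loop above restricts the agent to feasible board actions by
# passing an action mask into predict().  A minimal sketch of what such
# masking can look like for a greedy Q-policy; `q_values` and the mask layout
# are assumptions for illustration, not the actual AllocationEnv / DQN
# internals:
import numpy as np

def masked_argmax(q_values, action_mask):
    """Pick the highest-Q action among those allowed by the boolean mask."""
    q = np.where(action_mask, q_values, -np.inf)  # forbid infeasible actions
    return int(np.argmax(q))

# Hypothetical usage: 5 actions, only actions 1 and 3 feasible -> returns 1.
best = masked_argmax(np.array([0.2, 0.5, 0.9, 0.1, 0.4]),
                     np.array([False, True, False, True, False]))
# -----------------------------------------------------------------------------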
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] Globals.env = self.env obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) timesteps_last_log = 0 avr_ep_len_per_log = None sleep = 0.045 for _ in range(total_timesteps): if Globals.loading: Globals.loading = False while Globals.pause_game: pass if Globals.exit_learning: break if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample: sleep = 0.035 time.sleep(sleep) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) if len(episode_rewards) % log_interval == 0: avr_ep_len_per_log = (self.num_timesteps - timesteps_last_log) / log_interval timesteps_last_log = self.num_timesteps num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.record_tabular("avr length of last logged ep", avr_ep_len_per_log) logger.dump_tabular() self.num_timesteps += 1 Globals.steps -= 1 return self
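# --- Illustration (not part of the original code) ---------------------------
# The exploration rate in the DQN loop above follows a linear schedule from
# initial_p to final_p over exploration_fraction * total_timesteps steps and
# then stays flat.  A minimal standalone sketch of that behaviour (not the
# library's LinearSchedule class itself):
def linear_schedule_value(t, schedule_timesteps, initial_p=1.0, final_p=0.02):
    """Linearly anneal from initial_p to final_p, clamped once the schedule ends."""
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# Hypothetical usage: epsilon halfway through a 10k-step schedule -> 0.55.
eps = linear_schedule_value(5000, schedule_timesteps=10000, final_p=0.1)
# -----------------------------------------------------------------------------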
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): n_cpu = multiprocessing.cpu_count() if sys.platform == 'darwin': n_cpu //= 2 self.sess = tf_util.make_session(num_cpu=n_cpu, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy # mu corresponds to deterministic actions # pi corresponds to stochastic actions, used for training # logp_pi is the log probabilty of action pi _, policy_out, logp_pi = self.policy_tf.make_actor( self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. qf1, qf2, value_fn = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics( self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. 
v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=get_vars('model/pi')) # Value train op # (control dep of policy_train_op because sess.run otherwise # evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) values_params = get_vars('model/values_fn') source_params = get_vars("model/values_fn/vf") target_params = get_vars("target/values_fn/vf") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize( values_losses, var_list=values_params) self.infos_names = [ 'policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy' ] # All ops to call during one training step self.step_ops = [ policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = find_trainable_variables("model") self.target_params = find_trainable_variables( "target/values_fn/vf") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
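# --- Illustration (not part of the original code) ---------------------------
# The loss block above regresses the critics toward fixed (stop-gradient)
# targets:  q_backup = r + (1 - done) * gamma * V_target(s')   and
#           v_backup = min(Q1(s, a~pi), Q2(s, a~pi)) - ent_coef * logp_pi.
# A minimal NumPy sketch of those targets for a batch, assuming the critic
# outputs and log-probabilities are already available as arrays:
import numpy as np

def sac_targets(rewards, dones, value_target_next, qf1_pi, qf2_pi, logp_pi,
                gamma=0.99, ent_coef=0.1):
    """Compute the Q-function and value-function regression targets."""
    q_backup = rewards + (1.0 - dones) * gamma * value_target_next
    v_backup = np.minimum(qf1_pi, qf2_pi) - ent_coef * logp_pi
    return q_backup, v_backup

# Hypothetical usage with a batch of two transitions.
q_t, v_t = sac_targets(np.array([1.0, 0.0]), np.array([0.0, 1.0]),
                       np.array([2.0, 3.0]), np.array([1.5, 0.5]),
                       np.array([1.2, 0.7]), np.array([-0.3, -0.1]))
# -----------------------------------------------------------------------------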
class SAC(OffPolicyRLModel): """ Soft Actor-Critic (SAC) Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor, This implementation borrows code from original implementation (https://github.com/haarnoja/sac) and from OpenAI Spinning Up (https://github.com/openai/spinningup) Paper: https://arxiv.org/abs/1801.01290 Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) the discount factor :param learning_rate: (float or callable) learning rate for adam optimizer, the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) :param buffer_size: (int) size of the replay buffer :param batch_size: (int) Minibatch size for each gradient update :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) :param ent_coef: (float) Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off. :param train_freq: (int) Update the model every `train_freq` steps. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_update_interval: (int) update the target network every `target_network_update_freq` steps. :param gradient_steps: (int) How many gradient update after each step :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-3, buffer_size=50000, learning_starts=100, train_freq=1, batch_size=64, tau=0.005, ent_coef=0.1, target_update_interval=1, gradient_steps=1, verbose=0, tensorboard_log=None, _init_setup_model=True): super(SAC, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=SACPolicy, requires_vec_env=False) self.buffer_size = buffer_size self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.tau = tau # In the original paper, same learning rate is used for all networks # self.policy_lr = learning_rate # self.qf_lr = learning_rate # self.vf_lr = learning_rate # Entropy coefficient / Entropy temperature # Inverse of the reward scale self.ent_coef = ent_coef self.target_update_interval = target_update_interval self.gradient_steps = gradient_steps self.gamma = gamma self.value_fn = None self.graph = None self.replay_buffer = None self.episode_reward = None self.sess = None self.tensorboard_log = tensorboard_log self.verbose = verbose self.params = None self.summary = None self.policy_tf = None self.obs_target = None self.target_policy = None self.actions_ph = None self.rewards_ph = None self.terminals_ph = None self.observations_ph = None self.action_target = None self.next_observations_ph = None self.value_target = None self.step_ops = None self.target_update_op = None self.infos_names = None self.entropy = None self.target_params = None self.learning_rate_ph = None if _init_setup_model: self.setup_model() def setup_model(self): with SetVerbosity(self.verbose): 
self.graph = tf.Graph() with self.graph.as_default(): n_cpu = multiprocessing.cpu_count() if sys.platform == 'darwin': n_cpu //= 2 self.sess = tf_util.make_session(num_cpu=n_cpu, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy # mu corresponds to deterministic actions # pi corresponds to stochastic actions, used for training # logp_pi is the log probabilty of action pi _, policy_out, logp_pi = self.policy_tf.make_actor( self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. qf1, qf2, value_fn = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics( self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. 
v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=get_vars('model/pi')) # Value train op # (control dep of policy_train_op because sess.run otherwise # evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) values_params = get_vars('model/values_fn') source_params = get_vars("model/values_fn/vf") target_params = get_vars("target/values_fn/vf") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize( values_losses, var_list=values_params) self.infos_names = [ 'policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy' ] # All ops to call during one training step self.step_ops = [ policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = find_trainable_variables("model") self.target_params = find_trainable_variables( "target/values_fn/vf") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() def _train_step(self, step, writer, learning_rate): # Sample a batch from the replay buffer batch = self.replay_buffer.sample(self.batch_size) batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch feed_dict = { self.observations_ph: batch_obs, self.actions_ph: batch_actions, self.next_observations_ph: batch_next_obs, self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: batch_dones.reshape(self.batch_size, -1), self.learning_rate_ph: learning_rate } # out = [policy_loss, qf1_loss, qf2_loss, # value_loss, qf1, qf2, value_fn, logp_pi, # self.entropy, policy_train_op, train_values_op] # Do one gradient step # and optionally compute log for tensorboard if writer is not None: out = self.sess.run([self.summary] + self.step_ops, feed_dict) summary = out.pop(0) writer.add_summary(summary, step) else: out = self.sess.run(self.step_ops, feed_dict) # Unpack to monitor losses and entropy policy_loss, qf1_loss, qf2_loss, value_loss, *values = out # qf1, qf2, value_fn, logp_pi, entropy, *_ = values entropy = values[4] return policy_loss, qf1_loss, qf2_loss, value_loss, entropy def learn(self, total_timesteps, callback=None, seed=None, 
log_interval=4, tb_log_name="SAC"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if step < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, step) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if step < self.batch_size or step < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, 
infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] return self def action_probability(self, observation, state=None, mask=None): # Here there are no action probabilities, as SAC is continuous # therefore we return the action vector return self.predict(observation, state, mask, deterministic=True)[0] def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) actions = self.policy_tf.step(observation, deterministic=deterministic) actions = actions.reshape( (-1, ) + self.action_space.shape) # reshape to the correct action shape actions = actions * np.abs( self.action_space.low) # scale the output for the prediction if not vectorized_env: actions = actions[0] return actions, None def save(self, save_path): data = { "learning_rate": self.learning_rate, "buffer_size": self.buffer_size, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "batch_size": self.batch_size, "tau": self.tau, "ent_coef": self.ent_coef, # Should we also store the replay buffer? # this may lead to high memory usage # with all transition inside # "replay_buffer": self.replay_buffer "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "_vectorize_action": self._vectorize_action } params = self.sess.run(self.params) target_params = self.sess.run(self.target_params) self._save_to_file(save_path, data=data, params=params + target_params) @classmethod def load(cls, load_path, env=None, **kwargs): data, params = cls._load_from_file(load_path) model = cls(policy=data["policy"], env=env, _init_setup_model=False) model.__dict__.update(data) model.__dict__.update(kwargs) model.set_env(env) model.setup_model() restores = [] for param, loaded_p in zip(model.params + model.target_params, params): restores.append(param.assign(loaded_p)) model.sess.run(restores) return model
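# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal sketch of how this SAC class is typically driven, assuming a
# continuous-action Gym environment and a policy class compatible with
# SACPolicy (for example an MlpPolicy from the accompanying policies module).
# The helper below is hypothetical and only exercises the public API defined
# above (learn / save / load / predict).
def _sac_usage_example(policy_cls, env, total_timesteps=10000,
                       save_path="sac_model.pkl"):
    """Train, save, reload and evaluate a SAC model on `env` (sketch)."""
    model = SAC(policy_cls, env, verbose=1, learning_starts=100,
                train_freq=1, batch_size=64)
    model.learn(total_timesteps=total_timesteps, log_interval=4)
    model.save(save_path)

    # Reload the saved weights and run the deterministic policy
    model = SAC.load(save_path, env=env)
    obs = env.reset()
    episode_return = 0.0
    for _ in range(200):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, _ = env.step(action)
        episode_return += reward
        if done:
            obs = env.reset()
    return episode_return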
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy_tf = self.policy( self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy_tf.obs_ph self.processed_next_obs_ph = self.target_policy_tf.processed_obs self.action_target = self.target_policy_tf.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy self.policy_out = policy_out = self.policy_tf.make_actor( self.processed_obs_ph) # Use two Q-functions to improve performance by reducing overestimation bias qf1, qf2 = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph) # Q value when following the current policy qf1_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, reuse=True) with tf.variable_scope("target", reuse=False): # Create target networks target_policy_out = self.target_policy_tf.make_actor( self.processed_next_obs_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal( tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value( target_policy_out + target_noise, -1, 1) # Q values when following the target policy qf1_target, qf2_target = self.target_policy_tf.make_critics( self.processed_next_obs_ph, noisy_target_action) with tf.variable_scope("loss", reuse=False): # Take the min of the two target Q-Values (clipped Double-Q Learning) min_qf_target = tf.minimum(qf1_target, qf2_target) # Targets for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * min_qf_target) # Compute Q-Function loss qf1_loss = tf.reduce_mean((q_backup - qf1)**2) qf2_loss = tf.reduce_mean((q_backup - qf2)**2) qvalues_losses = qf1_loss + qf2_loss # Policy loss: maximise q value self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi) # Policy train op # will be called only every n training steps, # where n is the policy delay policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=get_vars('model/pi')) self.policy_train_op = policy_train_op # Q Values optimizer qvalues_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) qvalues_params = get_vars('model/values_fn/') # Q Values and policy target params source_params = get_vars("model/") target_params = get_vars("target/") # Polyak averaging for target variables 
self.target_ops = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] train_values_op = qvalues_optimizer.minimize( qvalues_losses, var_list=qvalues_params) self.infos_names = ['qf1_loss', 'qf2_loss'] # All ops to call during one training step self.step_ops = [ qf1_loss, qf2_loss, qf1, qf2, train_values_op ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = get_vars("model") self.target_params = get_vars("target/") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
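# --- Polyak (soft) target update, illustrated ---
# `self.target_ops` above implements target <- (1 - tau) * target + tau * source
# for every (target, source) variable pair, while `target_init_op` copies the
# source weights once at initialisation. The hypothetical NumPy helper below
# mirrors the same arithmetic outside of TensorFlow; with tau=0.005 the target
# weights move 0.5% of the way toward the current weights on each call.
def _polyak_update_example(target_weights, source_weights, tau=0.005):
    """Return soft-updated copies of `target_weights` (lists of arrays)."""
    return [(1.0 - tau) * target + tau * source
            for target, source in zip(target_weights, source_weights)]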
class TradingDQN(DQN): def __init__(self, policy, env, gamma=0.9, batch_size=32, buffer_size=100000, learning_starts=10000, learning_rate=0.0001, target_network_update_freq=1000, exploration_final_eps=0.02, exploration_fraction=0.1, tensorboard_log=None, _init_setup_model=True): super().__init__(policy=policy, env=env, gamma=gamma, batch_size=batch_size, buffer_size=buffer_size, learning_starts=learning_starts, learning_rate=learning_rate, target_network_update_freq=target_network_update_freq, exploration_final_eps=exploration_final_eps, exploration_fraction=exploration_fraction, tensorboard_log=tensorboard_log, _init_setup_model=_init_setup_model) def setup_model(self): self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/deepq/build_graph.py self.act, self.train_step, self.update_target, self.step_model = deepq.build_train( q_func=self.policy, ob_space=self.env.observation_space, ac_space=self.env.action_space, optimizer=tf.train.AdamOptimizer( learning_rate=self.learning_rate), gamma=self.gamma, # grad_norm_clipping=1, sess=self.sess) self.params = find_trainable_variables('deepq') tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all() def learn(self, total_timesteps, seed=None, tb_log_name='DQN', test_interval=1, reset_num_timesteps=True): if reset_num_timesteps: self.num_timesteps = 0 with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) self.replay_buffer = ReplayBuffer(size=self.buffer_size) self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset(train=True) best_train_score = None best_test_score = None self.reward_curve = [] for _ in range(total_timesteps): update_eps = self.exploration.value(self.num_timesteps) with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps)[0] new_obs, rew, done, _ = self.env.step(action) self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if self.num_timesteps > self.learning_starts: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights = np.ones_like(rewards) if writer is not None: if (1 + self.num_timesteps) % 100 == 0: summary, td_errors = self.train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self.train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.num_timesteps > self.learning_starts and self.num_timesteps % self.target_network_update_freq == 0: self.update_target(sess=self.sess) if done: print('-------------------------------------') print('steps | {}'.format( self.num_timesteps)) print('episodes | {}'.format( len(episode_rewards))) epsilon = int(100 * self.exploration.value(self.num_timesteps)) print('% time spent exploring | {}'.format(epsilon)) print('--') mean_100ep_reward = -np.inf if len( episode_rewards[-16:-1]) == 0 else round( float(np.mean(episode_rewards[-16:-1])), 1) self.reward_curve.append(mean_100ep_reward) print('mean 10 episode reward | {:.1f}'.format( mean_100ep_reward)) journal = self.env.sim.journal print('Total operations | {}'.format( len(self.env.sim.journal))) longs = [x for x 
in journal if x['Type'] == 'LONG'] shorts = [x for x in journal if x['Type'] == 'SHORT'] print('Long/Short | {}/{}'.format( len(longs), len(shorts))) print('Avg duration trades | {:.2f}'.format( np.mean([j['Trade Duration'] for j in journal]))) total_profit = sum([j['Profit'] for j in journal]) print('Total profit | {:.2f}'.format( total_profit)) print('Avg profit per trade | {:.3f}'.format( total_profit / self.env.sim.total_trades)) if epsilon <= self.exploration_final_eps * 100: if best_train_score is None or total_profit > best_train_score: self.save('saves/best_model_train.pkl') best_train_score = total_profit if self.num_timesteps % test_interval == 0: print('--') test_episode_rewards, test_longs, test_shorts, test_ave_profit_per_trade = self.test( ) print('Total profit test > {:.2f}'.format( test_episode_rewards)) print('Long/Short test > {}/{}'.format( test_longs, test_shorts)) print('Avg profit per trade test > {:.3f}'.format( test_ave_profit_per_trade)) if epsilon <= self.exploration_final_eps * 100: if best_test_score is None or test_episode_rewards > best_test_score: self.save('saves/best_model_test.pkl') best_test_score = test_episode_rewards print('-------------------------------------') obs = self.env.reset() episode_rewards.append(0.0) if self.num_timesteps + ( self.num_timesteps / len(episode_rewards)) >= total_timesteps: self.save('saves/final_model.pkl') break self.num_timesteps += 1 return self def test(self): obs = self.env.reset(train=False) done = False while not done: action, _ = self.predict(obs) obs, reward, done, info = self.env.step(action) journal = self.env.sim.journal longs = len([x for x in journal if x['Type'] == 'LONG']) shorts = len([x for x in journal if x['Type'] == 'SHORT']) test_episode_rewards = sum([j['Profit'] for j in journal]) test_ave_profit_per_trade = test_episode_rewards / self.env.sim.total_trades if self.env.sim.total_trades > 0 else -np.inf return test_episode_rewards, longs, shorts, test_ave_profit_per_trade def save(self, save_path): data = { 'batch_size': self.batch_size, 'learning_starts': self.learning_starts, 'learning_rate': self.learning_rate, 'target_network_update_freq': self.target_network_update_freq, 'exploration_final_eps': self.exploration_final_eps, 'exploration_fraction': self.exploration_fraction, 'gamma': self.gamma, 'policy': self.policy, 'journal': self.env.sim.journal, 'reward_curve': self.reward_curve } params = self.sess.run(self.params) self._save_to_file(save_path, data=data, params=params)
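# --- Usage sketch (illustrative, not part of the original module) ---
# TradingDQN assumes a trading environment whose `reset` accepts a `train`
# flag and whose simulator exposes `sim.journal` (a list of dicts with
# 'Type', 'Profit' and 'Trade Duration' keys) and `sim.total_trades`.
# The helper below is hypothetical and simply wires such an environment to
# the class defined above; the 'MlpPolicy' string and timestep numbers are
# assumptions for illustration only.
def _trading_dqn_example(env, total_timesteps=200000):
    """Train TradingDQN on a trading `env` and return its test metrics (sketch)."""
    model = TradingDQN('MlpPolicy', env, learning_starts=10000,
                       target_network_update_freq=1000)
    model.learn(total_timesteps=total_timesteps, test_interval=5000)
    # test() returns (total profit, number of longs, number of shorts,
    # average profit per trade) on the held-out data
    return model.test()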
class TD3(OffPolicyRLModel):
    """
    Twin Delayed DDPG (TD3)
    Addressing Function Approximation Error in Actor-Critic Methods.

    Original implementation: https://github.com/sfujim/TD3
    Paper: https://arxiv.org/pdf/1802.09477.pdf
    Introduction to TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html

    :param policy: (TD3Policy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount factor
    :param learning_rate: (float or callable) learning rate for adam optimizer,
        the same learning rate will be used for all networks (Q-Values and Actor networks)
        it can be a function of the current progress (from 1 to 0)
    :param buffer_size: (int) size of the replay buffer
    :param batch_size: (int) Minibatch size for each gradient update
    :param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1)
    :param policy_delay: (int) Policy and target networks will only be updated once every `policy_delay`
        training steps; the Q values are updated `policy_delay` times more often (every training step).
    :param action_noise: (ActionNoise) the action noise type. Cf DDPG for the different action noise types.
    :param target_policy_noise: (float) Standard deviation of Gaussian noise added to target policy
        (smoothing noise)
    :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise.
    :param train_freq: (int) Update the model every `train_freq` steps.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param gradient_steps: (int) How many gradient updates to perform after each rollout step
    :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy).
        This is not needed for TD3 normally but can help exploring when using HER + TD3.
        This hack was present in the original OpenAI Baselines repo (DDPG + HER).
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        Note: this has no effect on TD3 logging for now
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
        If None, the number of CPUs of the current machine will be used.
""" def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, learning_starts=100, train_freq=100, gradient_steps=100, batch_size=128, tau=0.005, policy_delay=2, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, random_exploration=0.0, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): super(TD3, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=TD3Policy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.buffer_size = buffer_size self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.tau = tau self.gradient_steps = gradient_steps self.gamma = gamma self.action_noise = action_noise self.random_exploration = random_exploration self.policy_delay = policy_delay self.target_noise_clip = target_noise_clip self.target_policy_noise = target_policy_noise self.graph = None self.replay_buffer = None self.episode_reward = None self.sess = None self.tensorboard_log = tensorboard_log self.verbose = verbose self.params = None self.summary = None self.policy_tf = None self.full_tensorboard_log = full_tensorboard_log self.obs_target = None self.target_policy_tf = None self.actions_ph = None self.rewards_ph = None self.terminals_ph = None self.observations_ph = None self.action_target = None self.next_observations_ph = None self.step_ops = None self.target_ops = None self.infos_names = None self.target_params = None self.learning_rate_ph = None self.processed_obs_ph = None self.processed_next_obs_ph = None self.policy_out = None self.policy_train_op = None self.policy_loss = None if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale policy_out = unscale_action(self.action_space, self.policy_out) return policy.obs_ph, self.actions_ph, policy_out def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy_tf = self.policy( self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy_tf.obs_ph self.processed_next_obs_ph = self.target_policy_tf.processed_obs self.action_target = self.target_policy_tf.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy self.policy_out = policy_out = self.policy_tf.make_actor( self.processed_obs_ph) # Use two Q-functions to improve performance by reducing overestimation bias qf1, qf2 = self.policy_tf.make_critics( self.processed_obs_ph, 
self.actions_ph) # Q value when following the current policy qf1_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, reuse=True) with tf.variable_scope("target", reuse=False): # Create target networks target_policy_out = self.target_policy_tf.make_actor( self.processed_next_obs_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal( tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value( target_policy_out + target_noise, -1, 1) # Q values when following the target policy qf1_target, qf2_target = self.target_policy_tf.make_critics( self.processed_next_obs_ph, noisy_target_action) with tf.variable_scope("loss", reuse=False): # Take the min of the two target Q-Values (clipped Double-Q Learning) min_qf_target = tf.minimum(qf1_target, qf2_target) # Targets for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * min_qf_target) # Compute Q-Function loss qf1_loss = tf.reduce_mean((q_backup - qf1)**2) qf2_loss = tf.reduce_mean((q_backup - qf2)**2) qvalues_losses = qf1_loss + qf2_loss # Policy loss: maximise q value self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi) # Policy train op # will be called only every n training steps, # where n is the policy delay policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=get_vars('model/pi')) self.policy_train_op = policy_train_op # Q Values optimizer qvalues_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) qvalues_params = get_vars('model/values_fn/') # Q Values and policy target params source_params = get_vars("model/") target_params = get_vars("target/") # Polyak averaging for target variables self.target_ops = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] train_values_op = qvalues_optimizer.minimize( qvalues_losses, var_list=qvalues_params) self.infos_names = ['qf1_loss', 'qf2_loss'] # All ops to call during one training step self.step_ops = [ qf1_loss, qf2_loss, qf1, qf2, train_values_op ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = get_vars("model") self.target_params = get_vars("target/") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() def _train_step(self, step, writer, learning_rate, update_policy): # Sample a batch from the replay buffer batch = self.replay_buffer.sample(self.batch_size) batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch feed_dict = { self.observations_ph: batch_obs, self.actions_ph: batch_actions, self.next_observations_ph: batch_next_obs, self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: 
batch_dones.reshape(self.batch_size, -1), self.learning_rate_ph: learning_rate } step_ops = self.step_ops if update_policy: # Update policy and target networks step_ops = step_ops + [ self.policy_train_op, self.target_ops, self.policy_loss ] # Do one gradient step # and optionally compute log for tensorboard if writer is not None: out = self.sess.run([self.summary] + step_ops, feed_dict) summary = out.pop(0) writer.add_summary(summary, step) else: out = self.sess.run(step_ops, feed_dict) # Unpack to monitor losses qf1_loss, qf2_loss, *_values = out return qf1_loss, qf2_loss def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step(obs[None]).flatten() # Add noise to the action, as the policy # is deterministic, this is required for exploration if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) # Note: the policy is updated less frequently than the Q functions # this is controlled by the `policy_delay` parameter mb_infos_vals.append( self._train_step(step, writer, current_lr, (step + grad_step) % self.policy_delay == 0)) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): _ = np.array(observation) if actions is not None: raise ValueError("Error: TD3 does not have action probabilities.") # here there are no action probabilities, as DDPG does not use a probability distribution warnings.warn( "Warning: action probability is meaningless for TD3. 
Returning None" ) return None def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) actions = self.policy_tf.step(observation) if self.action_noise is not None and not deterministic: actions = np.clip(actions + self.action_noise(), -1, 1) actions = actions.reshape( (-1, ) + self.action_space.shape) # reshape to the correct action shape actions = unscale_action( self.action_space, actions) # scale the output for the prediction if not vectorized_env: actions = actions[0] return actions, None def get_parameter_list(self): return (self.params + self.target_params) def save(self, save_path, cloudpickle=False): data = { "learning_rate": self.learning_rate, "buffer_size": self.buffer_size, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "batch_size": self.batch_size, "tau": self.tau, # Should we also store the replay buffer? # this may lead to high memory usage # with all transition inside # "replay_buffer": self.replay_buffer "policy_delay": self.policy_delay, "target_noise_clip": self.target_noise_clip, "target_policy_noise": self.target_policy_noise, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "action_noise": self.action_noise, "random_exploration": self.random_exploration, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
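# --- Clipped double-Q target with smoothing noise, illustrated ---
# The "loss" scope in `setup_model` builds
#   a' = clip(pi_target(s') + clip(eps, -c, c), -1, 1),  eps ~ N(0, sigma)
#   y  = r + (1 - done) * gamma * min(Q1_target(s', a'), Q2_target(s', a'))
# The hypothetical NumPy helper below mirrors that computation for one batch;
# `target_policy_fn` and `target_q_fn` are assumed stand-ins for the target
# actor and the pair of target critics (np is the module-level numpy import).
def _td3_q_target_example(rewards, dones, next_obs, target_policy_fn, target_q_fn,
                          gamma=0.99, target_policy_noise=0.2, target_noise_clip=0.5):
    """Compute the TD3 Q-learning target for a batch (NumPy sketch)."""
    target_actions = target_policy_fn(next_obs)
    # Target policy smoothing: clipped Gaussian noise on the target action
    noise = np.clip(np.random.normal(scale=target_policy_noise,
                                     size=target_actions.shape),
                    -target_noise_clip, target_noise_clip)
    noisy_actions = np.clip(target_actions + noise, -1.0, 1.0)
    # Clipped double-Q learning: take the min of the two target critics
    qf1_target, qf2_target = target_q_fn(next_obs, noisy_actions)
    min_qf_target = np.minimum(qf1_target, qf2_target)
    return rewards + (1.0 - dones) * gamma * min_qf_target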
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] obs = self.env.reset() reset = True ############################################################ # MODIFICATION: # Track list of actions taken each episode. This is # intentionally not a set so that we can use np.isin. action_list = list() ############################################################ for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): #################################################### # MODIFICATION: # Rename variable from original, since it's now # going to come back as an array due to the # modified build_act function being used to # construct everything. action_arr = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] #################################################### # ORIGINAL: # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] ######################################################## # MODIFICATION: # Get the best action that has not yet been taken this # episode. action = \ action_arr[np.argmin(np.isin(action_arr, action_list))] # Add this action to the list. 
action_list.append(action) ######################################################## env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: #################################################### # MODIFICATION: # Clear the list. action_list.clear() #################################################### maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 return self
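# --- Action-selection rule used by the modified `learn` above, illustrated ---
# The modified build_act is expected to return an array of actions ranked from
# best to worst rather than a single action. The line
#     action = action_arr[np.argmin(np.isin(action_arr, action_list))]
# then picks the highest-ranked action not yet taken this episode:
# np.isin marks already-taken actions as True, and np.argmin returns the index
# of the first False (falling back to index 0 if every action was already taken).
# A small standalone check of that rule:
#
#     >>> action_arr = np.array([7, 3, 5, 1])   # ranked best -> worst
#     >>> action_list = [7, 5]                  # already taken this episode
#     >>> action_arr[np.argmin(np.isin(action_arr, action_list))]
#     3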
class OurDDPG(OffPolicyRLModel): def __init__(self, policy, env, seed=0, eval_env=None, eval_freq=5000, gamma=0.99, tau=0.005, action_noise=None, normalize_observations=False, normalize_returns=False, observation_range=(-np.inf, np.inf), return_range=(-np.inf, np.inf), reward_scale=1., critic_l2_reg=0., clip_norm=None, actor_lr=1e-3, critic_lr=1e-3, buffer_size=1e6, batch_size=128, verbose=0, policy_kwargs=None, tensorboard_log=None, full_tensorboard_log=False, _init_setup_model=True, ro=True, sample_number=128, adjust_lr=False): super(OurDDPG, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DDPGPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs) # Parameters. self.seed = seed self.gamma = gamma self.tau = tau self.ro = ro self.sample_number = sample_number self.eval_freq = eval_freq self.adjust_lr = adjust_lr self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.return_range = return_range self.observation_range = observation_range self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.reward_scale = reward_scale self.batch_size = batch_size self.critic_l2_reg = critic_l2_reg self.eval_env = eval_env self.buffer_size = buffer_size self.tensorboard_log = tensorboard_log self.full_tensorboard_log = full_tensorboard_log # init self.graph = None self.stats_sample = None self.replay_buffer = None self.policy_tf = None self.target_init_updates = None self.target_soft_updates = None self.critic_loss = None self.critic_optimizer = None self.critic_optimize_op = None self.sess = None self.stats_ops = None self.stats_names = None self.perturbed_actor_tf = None self.perturb_policy_ops = None self.perturb_adaptive_policy_ops = None self.adaptive_policy_distance = None self.actor_loss = None self.actor_optimizer = None self.actor_optimize_op = None self.old_std = None self.old_mean = None self.renormalize_q_outputs_op = None self.obs_rms = None self.ret_rms = None self.target_policy = None self.actor_tf = None self.critic_tf = None self.critic_with_actor_tf = None self.critic_with_actor_tf = None self.target_q = None self.obs_train_ph = None self.action_train_ph = None self.obs_target = None self.action_target = None self.obs_noise = None self.action_noise_ph = None self.obs_adapt_noise = None self.action_adapt_noise = None self.terminals1 = None self.rewards = None self.critic_target = None self.param_noise_stddev = None self.param_noise_actor = None self.adaptive_param_noise_actor = None self.params = None self.summary = None self.episode_reward = None self.tb_seen_steps = None self.target_params = None self.obs_rms_params = None self.ret_rms_params = None # Randomized Optimization self.augmented_obs0 = None self.augmented_action_raw = None self.augmented_action = None self.augmented_critic_with_actor_tf = None self.reward_summary = None self.actor_loss_summary = None self.critic_loss_summary = None self.obs_summary = None if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): return NotImplementedError # policy = self.policy_tf # # Rescale # deterministic_action = self.actor_tf * np.abs(self.action_space.low) # return policy.obs_ph, self.actions, deterministic_action def setup_model(self): with SetVerbosity(self.verbose): assert isinstance(self.action_space, gym.spaces.Box), \ "Error: DDPG cannot output a {} action space, only spaces.Box is supported.".format(self.action_space) assert issubclass(self.policy, 
DDPGPolicy), "Error: the input policy for the DDPG model must be " \ "an instance of DDPGPolicy." self.graph = tf.Graph() with self.graph.as_default(): self._setup_learn(self.seed) # self.sess = tf_util.single_threaded_session(graph=self.graph) self.sess = tf_util.make_session() self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Observation normalization. # if self.normalize_observations: # with tf.variable_scope('obs_rms'): # self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) # else: # self.obs_rms = None # Return normalization. # if self.normalize_returns: # with tf.variable_scope('ret_rms'): # self.ret_rms = RunningMeanStd() # else: # self.ret_rms = None self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None, **self.policy_kwargs) # Create target networks. self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None, **self.policy_kwargs) self.obs_target = self.target_policy.obs_ph self.action_target = self.target_policy.action_ph # normalized_obs0 = tf.clip_by_value(normalize(self.policy_tf.processed_obs, self.obs_rms), # self.observation_range[0], self.observation_range[1]) # normalized_obs1 = tf.clip_by_value(normalize(self.target_policy.processed_obs, self.obs_rms), # self.observation_range[0], self.observation_range[1]) # Inputs. self.obs_train_ph = self.policy_tf.obs_ph self.action_train_ph = self.policy_tf.action_ph self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') # Create networks and core TF parts that are shared across setup parts. 
with tf.variable_scope("model", reuse=False): self.actor_tf = self.policy_tf.make_actor( self.policy_tf.processed_obs) self.critic_tf = self.policy_tf.make_critic( self.policy_tf.processed_obs, self.action_train_ph) self.critic_with_actor_tf = self.policy_tf.make_critic( self.policy_tf.processed_obs, self.actor_tf, reuse=True) if self.ro: def tf_repeat(tensor_to_repeat, repeat_num): tiled = tf.tile(tensor_to_repeat, [1, repeat_num]) repeated = tf.reshape( tiled, shape=[ self.batch_size * repeat_num, tensor_to_repeat.shape[1] ]) return repeated self.augmented_obs0 = tf_repeat( self.policy_tf.processed_obs, self.sample_number) self.augmented_action_raw = tf_repeat( self.actor_tf, self.sample_number) noises = [] for b_index in range(self.batch_size): noises.append( tf.random_uniform((self.sample_number - 1, ) + self.action_space.shape, -0.1, 0.1)) noises.append( tf.zeros((1, ) + self.action_space.shape)) noises = tf.concat(noises, axis=0) self.augmented_action = self.augmented_action_raw + noises self.augmented_action = tf.clip_by_value( self.augmented_action, -1, 1) self.augmented_critic_with_actor_tf = self.policy_tf.make_critic( self.augmented_obs0, self.augmented_action, reuse=True)[:, 0] with tf.variable_scope("target", reuse=False): critic_target = \ self.target_policy.make_critic(self.target_policy.processed_obs, self.target_policy.make_actor(self.target_policy.processed_obs)) with tf.variable_scope("loss", reuse=False): # self.critic_tf = denormalize( # tf.clip_by_value(self.critic_tf, self.return_range[0], self.return_range[1]), # self.ret_rms) # # self.critic_with_actor_tf = denormalize( # tf.clip_by_value(self.critic_with_actor_tf, # self.return_range[0], self.return_range[1]), # self.ret_rms) # # q_obs1 = denormalize(critic_target, self.ret_rms) self.target_q = self.rewards + ( 1. - self.terminals1) * self.gamma * critic_target # tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target)) if self.full_tensorboard_log: tf.summary.histogram('critic_target', self.critic_target) # Set up parts. 
self._setup_stats() self._setup_target_network_updates() with tf.variable_scope("input_info", reuse=False): self.reward_summary = tf.summary.scalar( 'rewards', tf.reduce_mean(self.rewards)) self.obs_summary = tf.summary.scalar( 'obs', tf.reduce_mean(self.obs_train_ph)) if self.full_tensorboard_log: tf.summary.histogram('rewards', self.rewards) if len(self.observation_space.shape ) == 3 and self.observation_space.shape[0] in [ 1, 3, 4 ]: tf.summary.image('observation', self.obs_train_ph) else: tf.summary.histogram('observation', self.obs_train_ph) with tf.variable_scope("Adam_mpi", reuse=False): self._setup_actor_optimizer() self._setup_critic_optimizer() self.actor_loss_summary = tf.summary.scalar( 'actor_loss', self.actor_loss) self.critic_loss_summary = tf.summary.scalar( 'critic_loss', self.critic_loss) self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target") self.obs_rms_params = [ var for var in tf.global_variables() if "obs_rms" in var.name ] self.ret_rms_params = [ var for var in tf.global_variables() if "ret_rms" in var.name ] with self.sess.as_default(): self._initialize(self.sess) # self.summary = tf.summary.merge_all() def _setup_target_network_updates(self): """ set the target update operations """ init_updates, soft_updates = get_target_updates( tf_util.get_trainable_vars('model/'), tf_util.get_trainable_vars('target/'), self.tau, self.verbose) self.target_init_updates = init_updates self.target_soft_updates = soft_updates def _setup_actor_optimizer(self): """ setup the optimizer for the actor """ if self.verbose >= 2: logger.info('setting up actor optimizer') if self.ro: split_group_action_raw = tf.split(self.augmented_action_raw, self.batch_size, axis=0) split_group_action = tf.split(self.augmented_action, self.batch_size, axis=0) split_group_q = tf.split(self.augmented_critic_with_actor_tf, self.batch_size, axis=0) self.actor_loss = 0 q_stds = [] for idx in range(self.batch_size): # softmax = tf.nn.softmax(split_group_q[idx] - # tf.reduce_max(split_group_q[idx], axis=0, keepdims=True), axis=0) # self.actor_loss = self.actor_loss + tf.reduce_sum( # tf.reduce_sum(tf.square(split_group_action_raw[idx] - # tf.stop_gradient(split_group_action[idx])), # axis=1) # * tf.stop_gradient(softmax)) max_index = tf.argmax(split_group_q[idx], axis=0) q_std = tf.math.reduce_std(split_group_q[idx]) * 20 target_action = split_group_action[idx][max_index, :] if self.adjust_lr: self.actor_loss = self.actor_loss + \ tf.reduce_mean(tf.square(self.actor_tf[idx, :] - tf.stop_gradient(target_action))) \ / tf.stop_gradient(q_std) else: self.actor_loss = self.actor_loss + \ tf.reduce_mean(tf.square(self.actor_tf[idx, :] - tf.stop_gradient(target_action))) q_stds.append(q_std) # tf.summary.histogram("q_std", tf.stack(q_stds, axis=0)) else: self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/') ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) if self.verbose >= 2: logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) # self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'), # clip_norm=self.clip_norm) # self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999, # epsilon=1e-08) self.actor_optimizer = tf.train.AdamOptimizer( learning_rate=self.actor_lr) self.actor_gradients = 
self.actor_optimizer.compute_gradients( self.actor_loss, var_list=tf_util.get_trainable_vars("model/pi/")) hist_summary = [] for gradient, variable in self.actor_gradients: if gradient is not None: hist_summary.append( tf.summary.histogram("gradients/" + variable.name, gradient)) hist_summary.append( tf.summary.histogram("variables/" + variable.name, variable)) self.actor_gradient_summary = tf.summary.merge(hist_summary) self.actor_optimize_op = self.actor_optimizer.apply_gradients( self.actor_gradients) # self.actor_optimize_op = self.actor_optimizer.minimize(self.actor_loss, # var_list=tf_util.get_trainable_vars("model/pi/")) def _setup_critic_optimizer(self): """ setup the optimizer for the critic """ if self.verbose >= 2: logger.info('setting up critic optimizer') # normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), # self.return_range[0], self.return_range[1]) # self.critic_loss = tf.reduce_mean(tf.square(self.critic_tf - normalized_critic_target_tf)) self.critic_loss = tf.reduce_mean( tf.square(self.critic_tf - self.critic_target)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in tf_util.get_trainable_vars('model/qf/') if 'bias' not in var.name and 'qf_output' not in var.name and 'b' not in var.name ] if self.verbose >= 2: for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/') ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) if self.verbose >= 2: logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) # self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'), # clip_norm=self.clip_norm) # self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'), beta1=0.9, beta2=0.999, # epsilon=1e-08) self.critic_optimizer = tf.train.AdamOptimizer( learning_rate=self.critic_lr) self.critic_optimize_op = self.critic_optimizer.minimize( self.critic_loss, var_list=tf_util.get_trainable_vars("model/qf/")) def _setup_stats(self): """ setup the running means and std of the inputs and outputs of the model """ ops = [] names = [] # if self.normalize_returns: # ops += [self.ret_rms.mean, self.ret_rms.std] # names += ['ret_rms_mean', 'ret_rms_std'] # # if self.normalize_observations: # ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] # names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] self.stats_ops = ops self.stats_names = names def _policy(self, obs, apply_noise=True, compute_q=True): """ Get the actions and critic output, from a given observation :param obs: ([float] or [int]) the observation :param apply_noise: (bool) enable the noise :param compute_q: (bool) compute the critic output 
:return: ([float], float) the action and critic value """ obs = np.array(obs).reshape((-1, ) + self.observation_space.shape) feed_dict = {self.obs_train_ph: obs} actor_tf = self.actor_tf if compute_q: action, q_value = self.sess.run( [actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q_value = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, -1, 1) return action, q_value def _store_transition(self, obs0, action, reward, obs1, terminal1): """ Store a transition in the replay buffer :param obs0: ([float] or [int]) the last observation :param action: ([float]) the action :param reward: (float] the reward :param obs1: ([float] or [int]) the current observation :param terminal1: (bool) Whether the episode is over """ reward *= self.reward_scale self.replay_buffer.add(obs0, action, reward, obs1, float(terminal1)) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def _train_step(self, step, writer, do_actor_update): """ run a step of training from batch :param step: (int) the current step iteration :param writer: (TensorFlow Summary.writer) the writer for tensorboard :param log: (bool) whether or not to log to metadata :return: (float, float) critic loss, actor loss """ # Get a batch obs0, actions, rewards, obs1, terminals1 = self.replay_buffer.sample( batch_size=self.batch_size) # Reshape to match previous behavior and placeholder shape rewards = rewards.reshape(-1, 1) terminals1 = terminals1.reshape(-1, 1) target_q = self.sess.run(self.target_q, feed_dict={ self.obs_target: obs1, self.rewards: rewards, self.terminals1: terminals1 }) # Get all gradients and perform a synced update. # ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] td_map = { self.obs_train_ph: obs0, # self.actions: actions, self.action_train_ph: actions, self.rewards: rewards, self.critic_target: target_q, } critic_loss_summary, reward_summary, obs_summary, critic_loss, _ = \ self.sess.run([self.critic_loss_summary, self.reward_summary, self.obs_summary, self.critic_loss, self.critic_optimize_op], td_map) # self.critic_optimizer.update(critic_grads, learning_rate=self.critic_lr) writer.add_summary(critic_loss_summary, step) writer.add_summary(reward_summary, step) writer.add_summary(obs_summary, step) actor_loss = None if do_actor_update: actor_loss_summary, actor_gradient_summary, actor_loss, _ = \ self.sess.run([self.actor_loss_summary, self.actor_gradient_summary, self.actor_loss, self.actor_optimize_op], td_map) # self.actor_optimizer.update(actor_grads, learning_rate=self.actor_lr) writer.add_summary(actor_gradient_summary, step) writer.add_summary(actor_loss_summary, step) return critic_loss, actor_loss def _initialize(self, sess): """ initialize the model parameters and optimizers :param sess: (TensorFlow Session) the current TensorFlow session """ self.sess = sess self.sess.run(tf.global_variables_initializer()) # self.actor_optimizer.sync() # self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def _update_target_net(self): """ run target soft update operation """ self.sess.run(self.target_soft_updates) def _get_stats(self): """ Get the mean and standard deviation of the model's inputs and outputs :return: (dict) the means and stds """ if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. 
# This allows us to estimate the change in value for the same set of inputs. obs0, actions, rewards, obs1, terminals1 = self.replay_buffer.sample( batch_size=self.batch_size) self.stats_sample = { 'obs0': obs0, 'actions': actions, 'rewards': rewards, 'obs1': obs1, 'terminals1': terminals1 } # feed_dict = { # self.actions: self.stats_sample['actions'] # } feed_dict = {} for placeholder in [self.action_train_ph, self.action_target]: if placeholder is not None: feed_dict[placeholder] = self.stats_sample['actions'] for placeholder in [self.obs_train_ph, self.obs_target]: if placeholder is not None: feed_dict[placeholder] = self.stats_sample['obs0'] values = self.sess.run(self.stats_ops, feed_dict=feed_dict) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) return stats def _reset(self): """ Reset internal state after an episode is complete. """ if self.action_noise is not None: self.action_noise.reset() def learn(self, total_timesteps, callback=None, seed=None, log_interval=None, tb_log_name="DDPG", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: # a list for tensorboard logging, to prevent logging with the same step number, if it already occured self.tb_seen_steps = [] # rank = MPI.COMM_WORLD.Get_rank() # we assume symmetric actions. assert np.all( np.abs(self.env.action_space.low) == self.env.action_space.high) if self.verbose >= 2: logger.log('Using agent with the following configuration:') logger.log(str(self.__dict__.items())) with self.sess.as_default(), self.graph.as_default(): # Prepare everything. self._reset() obs = self.env.reset() eval_obs = None if self.eval_env is not None: eval_obs = self.eval_env.reset() episode_rewards_deque = deque(maxlen=100) eval_episode_rewards_deque = deque(maxlen=100) self.episode_reward = np.zeros((1, )) episode_successes = [] episode_rewards_all = [] episode_steps_all = [] episode_reward = 0. episode_step = 0 total_steps = 0 step_since_eval = 0 total_episode_num = 0 start_time = time.time() while True: # Perform rollouts. qs_this_rollout_period = [] actions_this_rollout_period = [] while True: if total_steps >= total_timesteps: return self # Predict next action. if total_steps <= 10000: action = self.env.action_space.sample() q_value = 0 else: action, q_value = self._policy(obs, apply_noise=True, compute_q=True) assert action.shape == self.env.action_space.shape rescaled_action = action * np.abs( self.action_space.low) new_obs, reward, done, info = self.env.step( rescaled_action) if writer is not None: ep_rew = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) total_steps += 1 self.num_timesteps += 1 episode_reward += reward episode_step += 1 step_since_eval += 1 # Book-keeping. actions_this_rollout_period.append(action) qs_this_rollout_period.append(q_value) self._store_transition(obs, action, reward, new_obs, done) obs = new_obs if done: # Episode done. episode_rewards_all.append(episode_reward) episode_rewards_deque.append(episode_reward) episode_steps_all.append(episode_step) episode_reward = 0. 
episode_step = 0 total_episode_num += 1 maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append( float(maybe_is_success)) self._reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() break # Train. actor_losses_this_train_period = [] critic_losses_this_train_period = [] last_episode_step = int(episode_steps_all[-1]) for t_train in range(last_episode_step): # Not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size): break # weird equation to deal with the fact the nb_train_steps will be different # to nb_rollout_steps step = total_steps - last_episode_step + t_train critic_loss, actor_loss = self._train_step( step, writer, do_actor_update=t_train % 2 == 0) critic_losses_this_train_period.append(critic_loss) if actor_loss: actor_losses_this_train_period.append(actor_loss) self._update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if self.eval_env is not None and step_since_eval >= self.eval_freq: step_since_eval %= self.eval_freq eval_episode_reward = 0. eval_episode = 0 while eval_episode < 10: eval_action, eval_q = self._policy( eval_obs, apply_noise=False, compute_q=True) eval_obs, eval_r, eval_done, _ = self.eval_env.step( eval_action * np.abs(self.action_space.low)) eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: if not isinstance(self.env, VecEnv): eval_obs = self.eval_env.reset() eval_episode_rewards.append( eval_episode_reward) eval_episode_rewards_deque.append( eval_episode_reward) eval_episode_reward = 0. eval_episode += 1 if callback is not None: # Only stop training if return value is False, not when it is None. # This is for backwards compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: return self # mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = self._get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = episode_rewards_all[-1] combined_stats['rollout/return_last_100'] = np.mean( episode_rewards_deque) combined_stats[ 'rollout/episode_steps'] = episode_steps_all[-1] combined_stats['debug/actions_mean'] = np.mean( actions_this_rollout_period) combined_stats['debug/actions_std'] = np.std( actions_this_rollout_period) combined_stats['debug/Q_mean'] = np.mean( qs_this_rollout_period) combined_stats['train/loss_actor'] = np.mean( actor_losses_this_train_period) combined_stats['train/loss_critic'] = np.mean( critic_losses_this_train_period) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float( total_steps) / float(duration) # Evaluation statistics. 
if self.eval_env is not None and eval_episode_rewards: combined_stats['eval/return'] = np.mean( eval_episode_rewards) combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_deque) combined_stats['eval/Q'] = np.mean(eval_qs) combined_stats['eval/episodes'] = len( eval_episode_rewards) def as_scalar(scalar): """ check and return the input if it is a scalar, otherwise raise ValueError :param scalar: (Any) the object to check :return: (Number) the scalar if x is a scalar """ if isinstance(scalar, np.ndarray): assert scalar.size == 1 return scalar[0] elif np.isscalar(scalar): return scalar else: raise ValueError('expected scalar, got %s' % scalar) # combined_stats_sums = MPI.COMM_WORLD.allreduce( # np.array([as_scalar(x) for x in combined_stats.values()])) # combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/episodes'] = total_episode_num combined_stats['total/steps'] = total_steps for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.dump_tabular() logger.info('') logdir = logger.get_dir() # if rank == 0 and logdir: # if hasattr(self.env, 'get_state'): # with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler: # pickle.dump(self.env.get_state(), file_handler) # if self.eval_env and hasattr(self.eval_env, 'get_state'): # with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler: # pickle.dump(self.eval_env.get_state(), file_handler) def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) actions, _, = self._policy(observation, apply_noise=not deterministic, compute_q=False) actions = actions.reshape( (-1, ) + self.action_space.shape) # reshape to the correct action shape actions = actions * np.abs( self.action_space.low) # scale the output for the prediction if not vectorized_env: actions = actions[0] return actions, None def action_probability(self, observation, state=None, mask=None, actions=None): observation = np.array(observation) if actions is not None: raise ValueError("Error: DDPG does not have action probabilities.") # here there are no action probabilities, as DDPG does not use a probability distribution warnings.warn( "Warning: action probability is meaningless for DDPG. 
Returning None" ) return None def get_parameter_list(self): return (self.params + self.target_params + self.obs_rms_params + self.ret_rms_params) def save(self, save_path): data = { "ro": self.ro, "seed": self.seed, "sample_number": self.sample_number, "eval_freq": self.eval_freq, "adjust_lr": self.adjust_lr, "observation_space": self.observation_space, "action_space": self.action_space, "verbose": self.verbose, "action_noise": self.action_noise, "gamma": self.gamma, "tau": self.tau, "normalize_returns": self.normalize_returns, "normalize_observations": self.normalize_observations, "batch_size": self.batch_size, "observation_range": self.observation_range, "return_range": self.return_range, "critic_l2_reg": self.critic_l2_reg, "actor_lr": self.actor_lr, "critic_lr": self.critic_lr, "clip_norm": self.clip_norm, "reward_scale": self.reward_scale, "buffer_size": self.buffer_size, "policy": self.policy, "n_envs": self.n_envs, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save) @classmethod def load(cls, load_path, env=None, **kwargs): data, params = cls._load_from_file(load_path) if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data[ 'policy_kwargs']: raise ValueError( "The specified policy kwargs do not equal the stored policy kwargs. " "Stored kwargs: {}, specified kwargs: {}".format( data['policy_kwargs'], kwargs['policy_kwargs'])) model = cls(None, env, _init_setup_model=False) model.__dict__.update(data) model.__dict__.update(kwargs) model.set_env(env) model.setup_model() # Patch for version < v2.6.0, duplicated keys where saved if len(params) > len(model.get_parameter_list()): n_params = len(model.params) n_target_params = len(model.target_params) n_normalisation_params = len(model.obs_rms_params) + len( model.ret_rms_params) # Check that the issue is the one from # https://github.com/hill-a/stable-baselines/issues/363 assert len(params) == 2 * (n_params + n_target_params) + n_normalisation_params,\ "The number of parameter saved differs from the number of parameters"\ " that should be loaded: {}!={}".format(len(params), len(model.get_parameter_list())) # Remove duplicates params_ = params[:n_params + n_target_params] if n_normalisation_params > 0: params_ += params[-n_normalisation_params:] params = params_ model.load_parameters(params) return model
class DQN(OffPolicyRLModel): """ The DQN model class. DQN paper: https://arxiv.org/abs/1312.5602 Dueling DQN: https://arxiv.org/abs/1511.06581 Double-Q Learning: https://arxiv.org/abs/1509.06461 Prioritized Experience Replay: https://arxiv.org/abs/1511.05952 :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) discount factor :param learning_rate: (float) learning rate for adam optimizer :param buffer_size: (int) size of the replay buffer :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is annealed :param exploration_final_eps: (float) final value of random action probability :param exploration_initial_eps: (float) initial value of random action probability :param train_freq: (int) update the model every `train_freq` steps. set to None to disable printing :param batch_size: (int) size of a batched sampled from replay buffer for training :param double_q: (bool) Whether to enable Double-Q learning or not. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. :param prioritized_replay: (bool) if True prioritized replay buffer will be used. :param prioritized_replay_alpha: (float)alpha parameter for prioritized replay buffer. It determines how much prioritization is used, with alpha=0 corresponding to the uniform case. :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). If None (default), use random seed. Note that if you want completely deterministic results, you must set `n_cpu_tf_sess` to 1. :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations If None, the number of cpu of the current machine will be used. 
""" def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, exploration_initial_eps=1.0, train_freq=1, batch_size=32, double_q=True, learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, n_cpu_tf_sess=None, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, use_rmsprop=False, rmsprop_alpha=0.95, rmsprop_epsilon=0.01, exploration_offset=0): # TODO: replay_buffer refactoring super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.param_noise = param_noise self.learning_starts = learning_starts self.train_freq = train_freq self.prioritized_replay = prioritized_replay self.prioritized_replay_eps = prioritized_replay_eps self.batch_size = batch_size self.target_network_update_freq = target_network_update_freq self.prioritized_replay_alpha = prioritized_replay_alpha self.prioritized_replay_beta0 = prioritized_replay_beta0 self.prioritized_replay_beta_iters = prioritized_replay_beta_iters self.exploration_final_eps = exploration_final_eps self.exploration_initial_eps = exploration_initial_eps self.exploration_fraction = exploration_fraction self.buffer_size = buffer_size self.learning_rate = learning_rate self.gamma = gamma self.tensorboard_log = tensorboard_log self.full_tensorboard_log = full_tensorboard_log self.double_q = double_q self.use_rmsprop = use_rmsprop self.rmsprop_alpha = rmsprop_alpha self.rmsprop_epsilon = rmsprop_epsilon self.exploration_offset = exploration_offset self.graph = None self.sess = None self._train_step = None self.step_model = None self.update_target = None self.act = None self.proba_step = None self.replay_buffer = None self.beta_schedule = None self.exploration = None self.params = None self.summary = None self.episode_reward = None if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.step_model return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." 
self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) if self.use_rmsprop: optimizer = tf.train.RMSPropOptimizer( learning_rate=self.learning_rate, decay=self.rmsprop_alpha, epsilon=self.rmsprop_epsilon, centered=True ) else: optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) self.act, self._train_step, self.update_target, self.step_model = build_train( q_func=partial(self.policy, **self.policy_kwargs), ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess, full_tensorboard_log=self.full_tensorboard_log, double_q=self.double_q ) self.proba_step = self.step_model.proba_step self.params = tf_util.get_trainable_vars("deepq") # Initialize the parameters and copy them to the target network. tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all() def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1,)) for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps - self.exploration_offset) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. 
- self.exploration.value(self.num_timesteps - self.exploration_offset) + self.exploration.value(self.num_timesteps - self.exploration_offset) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs(td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities(batch_idxes, new_priorities) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps - self.exploration_offset))) logger.dump_tabular() self.num_timesteps += 1 return episode_rewards def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) with self.sess.as_default(): actions, _, _ = self.step_model.step(observation, deterministic=deterministic) if not vectorized_env: actions = actions[0] return actions, None def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) actions_proba = self.proba_step(observation, state, mask) if actions is not None: # comparing the action distribution, to given actions actions = np.array([actions]) assert isinstance(self.action_space, gym.spaces.Discrete) actions = actions.reshape((-1,)) assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations." actions_proba = actions_proba[np.arange(actions.shape[0]), actions] # normalize action proba shape actions_proba = actions_proba.reshape((-1, 1)) if logp: actions_proba = np.log(actions_proba) if not vectorized_env: if state is not None: raise ValueError("Error: The environment must be vectorized when using recurrent policies.") actions_proba = actions_proba[0] return actions_proba def get_parameter_list(self): return self.params def save(self, save_path, cloudpickle=False): # params data = { "double_q": self.double_q, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
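# --- Illustrative sketch (not part of the original class) ---------------------
# DQN.learn() above anneals the exploration rate with
# LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
#                initial_p=exploration_initial_eps, final_p=exploration_final_eps)
# and queries it via exploration.value(step). The class below is a standalone,
# hypothetical re-implementation of that behaviour for illustration only; it is not
# the library's LinearSchedule.
class LinearScheduleSketch:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, step):
        # Fraction of the annealing period elapsed, clipped so the schedule
        # stays at final_p once schedule_timesteps have passed.
        fraction = min(float(step) / float(self.schedule_timesteps), 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# With exploration_fraction=0.1 and total_timesteps=100000, epsilon decays linearly
# from 1.0 to 0.02 over the first 10000 steps and then stays at 0.02.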
def setup_model(self): with SetVerbosity(self.verbose): assert isinstance(self.action_space, gym.spaces.Box), \ "Error: DDPG cannot output a {} action space, only spaces.Box is supported.".format(self.action_space) assert issubclass(self.policy, DDPGPolicy), "Error: the input policy for the DDPG model must be " \ "an instance of DDPGPolicy." self.graph = tf.Graph() with self.graph.as_default(): self._setup_learn(self.seed) # self.sess = tf_util.single_threaded_session(graph=self.graph) self.sess = tf_util.make_session() self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Observation normalization. # if self.normalize_observations: # with tf.variable_scope('obs_rms'): # self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) # else: # self.obs_rms = None # Return normalization. # if self.normalize_returns: # with tf.variable_scope('ret_rms'): # self.ret_rms = RunningMeanStd() # else: # self.ret_rms = None self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None, **self.policy_kwargs) # Create target networks. self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None, **self.policy_kwargs) self.obs_target = self.target_policy.obs_ph self.action_target = self.target_policy.action_ph # normalized_obs0 = tf.clip_by_value(normalize(self.policy_tf.processed_obs, self.obs_rms), # self.observation_range[0], self.observation_range[1]) # normalized_obs1 = tf.clip_by_value(normalize(self.target_policy.processed_obs, self.obs_rms), # self.observation_range[0], self.observation_range[1]) # Inputs. self.obs_train_ph = self.policy_tf.obs_ph self.action_train_ph = self.policy_tf.action_ph self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') # Create networks and core TF parts that are shared across setup parts. 
with tf.variable_scope("model", reuse=False): self.actor_tf = self.policy_tf.make_actor( self.policy_tf.processed_obs) self.critic_tf = self.policy_tf.make_critic( self.policy_tf.processed_obs, self.action_train_ph) self.critic_with_actor_tf = self.policy_tf.make_critic( self.policy_tf.processed_obs, self.actor_tf, reuse=True) if self.ro: def tf_repeat(tensor_to_repeat, repeat_num): tiled = tf.tile(tensor_to_repeat, [1, repeat_num]) repeated = tf.reshape( tiled, shape=[ self.batch_size * repeat_num, tensor_to_repeat.shape[1] ]) return repeated self.augmented_obs0 = tf_repeat( self.policy_tf.processed_obs, self.sample_number) self.augmented_action_raw = tf_repeat( self.actor_tf, self.sample_number) noises = [] for b_index in range(self.batch_size): noises.append( tf.random_uniform((self.sample_number - 1, ) + self.action_space.shape, -0.1, 0.1)) noises.append( tf.zeros((1, ) + self.action_space.shape)) noises = tf.concat(noises, axis=0) self.augmented_action = self.augmented_action_raw + noises self.augmented_action = tf.clip_by_value( self.augmented_action, -1, 1) self.augmented_critic_with_actor_tf = self.policy_tf.make_critic( self.augmented_obs0, self.augmented_action, reuse=True)[:, 0] with tf.variable_scope("target", reuse=False): critic_target = \ self.target_policy.make_critic(self.target_policy.processed_obs, self.target_policy.make_actor(self.target_policy.processed_obs)) with tf.variable_scope("loss", reuse=False): # self.critic_tf = denormalize( # tf.clip_by_value(self.critic_tf, self.return_range[0], self.return_range[1]), # self.ret_rms) # # self.critic_with_actor_tf = denormalize( # tf.clip_by_value(self.critic_with_actor_tf, # self.return_range[0], self.return_range[1]), # self.ret_rms) # # q_obs1 = denormalize(critic_target, self.ret_rms) self.target_q = self.rewards + ( 1. - self.terminals1) * self.gamma * critic_target # tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target)) if self.full_tensorboard_log: tf.summary.histogram('critic_target', self.critic_target) # Set up parts. self._setup_stats() self._setup_target_network_updates() with tf.variable_scope("input_info", reuse=False): self.reward_summary = tf.summary.scalar( 'rewards', tf.reduce_mean(self.rewards)) self.obs_summary = tf.summary.scalar( 'obs', tf.reduce_mean(self.obs_train_ph)) if self.full_tensorboard_log: tf.summary.histogram('rewards', self.rewards) if len(self.observation_space.shape ) == 3 and self.observation_space.shape[0] in [ 1, 3, 4 ]: tf.summary.image('observation', self.obs_train_ph) else: tf.summary.histogram('observation', self.obs_train_ph) with tf.variable_scope("Adam_mpi", reuse=False): self._setup_actor_optimizer() self._setup_critic_optimizer() self.actor_loss_summary = tf.summary.scalar( 'actor_loss', self.actor_loss) self.critic_loss_summary = tf.summary.scalar( 'critic_loss', self.critic_loss) self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target") self.obs_rms_params = [ var for var in tf.global_variables() if "obs_rms" in var.name ] self.ret_rms_params = [ var for var in tf.global_variables() if "ret_rms" in var.name ] with self.sess.as_default(): self._initialize(self.sess)
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, initial_p=1.0): self.actions_weights = [] self.actions_container = [] new_tb_log = self._init_num_timesteps(reset_num_timesteps) cnt = 0 ds_rewards = [[0, 0]] with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=initial_p, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True ''' Hierarchical Step (Start) ''' obs, new_obs, rew, action, done, reset = self.hierarchical_step( obs, ds_rewards, cnt, kwargs, update_eps) ''' Hierarchical Step (End) ''' if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) 
if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: try: new_priorities = np.array([ abs(x) for x in td_errors.tolist() ]) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) except AssertionError: print(td_errors) if self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 return self, ds_rewards
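# --- Illustrative sketch (not part of the original class) ---------------------
# After each training step above, prioritized-replay priorities are refreshed as
# |TD error| + prioritized_replay_eps before calling
# replay_buffer.update_priorities(batch_idxes, new_priorities). The helper below is
# a hypothetical, numpy-only restatement of that rule; the epsilon keeps every
# transition's sampling probability strictly positive.
import numpy as np

def compute_new_priorities(td_errors, prioritized_replay_eps=1e-6):
    return np.abs(np.asarray(td_errors, dtype=np.float64)) + prioritized_replay_eps

# TD errors of [0.5, -0.1, 0.0] become priorities [0.500001, 0.100001, 0.000001].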
class MADQN(OffPolicyRLModel): """ The DQN model class. DQN paper: https://arxiv.org/abs/1312.5602 Dueling DQN: https://arxiv.org/abs/1511.06581 Double-Q Learning: https://arxiv.org/abs/1509.06461 Prioritized Experience Replay: https://arxiv.org/abs/1511.05952 :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) discount factor :param learning_rate: (float) learning rate for adam optimizer :param buffer_size: (int) size of the replay buffer :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is annealed :param exploration_final_eps: (float) final value of random action probability :param exploration_initial_eps: (float) initial value of random action probability :param train_freq: (int) update the model every `train_freq` steps. set to None to disable printing :param batch_size: (int) size of a batched sampled from replay buffer for training :param double_q: (bool) Whether to enable Double-Q learning or not. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. :param prioritized_replay: (bool) if True prioritized replay buffer will be used. :param prioritized_replay_alpha: (float)alpha parameter for prioritized replay buffer. It determines how much prioritization is used, with alpha=0 corresponding to the uniform case. :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). If None (default), use random seed. Note that if you want completely deterministic results, you must set `n_cpu_tf_sess` to 1. :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations If None, the number of cpu of the current machine will be used. 
""" def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, exploration_initial_eps=1.0, train_freq=1, batch_size=32, double_q=True, learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, n_cpu_tf_sess=None, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, num_agents=1): # MA-MOD # TODO: replay_buffer refactoring super(MADQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) # print("POLICY TYPE", policy) if self.observation_space: obs_sp_low = self.observation_space.low[0, :] obs_sp_high = self.observation_space.high[0, :] self.observation_space = gym.spaces.Box(low=obs_sp_low, high=obs_sp_high) self.param_noise = param_noise self.learning_starts = learning_starts self.train_freq = train_freq self.prioritized_replay = prioritized_replay self.prioritized_replay_eps = prioritized_replay_eps self.batch_size = batch_size self.target_network_update_freq = target_network_update_freq self.prioritized_replay_alpha = prioritized_replay_alpha self.prioritized_replay_beta0 = prioritized_replay_beta0 self.prioritized_replay_beta_iters = prioritized_replay_beta_iters self.exploration_final_eps = exploration_final_eps self.exploration_initial_eps = exploration_initial_eps self.exploration_fraction = exploration_fraction self.buffer_size = buffer_size self.learning_rate = learning_rate self.gamma = gamma self.tensorboard_log = tensorboard_log self.full_tensorboard_log = full_tensorboard_log self.double_q = double_q self.num_agents = num_agents self.graph = None self.sess = None self._train_step = [] # MA-MOD self.step_model = [] # MA-MOD self.update_target = [] # MA-MOD self.act = [] # MA-MOD self.proba_step = [] # MA-MOD self.replay_buffer = None # TODO: Possibly try seperate replay buffer. If everything symmetric, OK for one. # If you have the same Value function, its fine. If you have seperate functions, if you have one replay buffer, they learn from the same data. self.beta_schedule = None self.exploration = None self.params = None self.summary = None if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): assert False, "MAKE SURE THIS FUNCTION ISNT CALLED" policy = self.step_model return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values def setup_model(self): with SetVerbosity(self.verbose): for i in range(self.num_agents): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy # print(test_policy.type) assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." 
self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.params = [] print("AC SPC", self.action_space) for i in range(self.num_agents): with tf.variable_scope("agent" + str(i)): optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate) act, _train_step, update_target, step_model = build_train( q_func=partial(self.policy, **self.policy_kwargs), ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess, full_tensorboard_log= False, #self.full_tensorboard_log, double_q=self.double_q) self.act.append(act) self._train_step.append(_train_step) self.step_model.append(step_model) self.proba_step.append(step_model.proba_step) self.update_target.append(update_target) self.params.extend( tf_util.get_trainable_vars("agent" + str(i) + "/deepq")) print(self.params) # Initialize the parameters and copy them to the target network. tf_util.initialize( self.sess ) # TODO: copy this file, make two versions of the algorithm. for i in range(self.num_agents): self.update_target[i]( sess=self.sess ) # TODO: Not sure, seems like the best thing to do is try using each agents own target first. # self.summary = tf.summary.merge_all() def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) # callback = self._init_callback(callback) # with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ # as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [[0.0] * self.num_agents] #MA-MOD episode_successes = [] #callback.on_training_start(locals(), globals()) #callback.on_rollout_start() reset = True obs = self.env.reset() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. 
- self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): env_action = [] # MA-MOD for i in range(self.num_agents ): # MA-MOD. This is fine for one policy. action = self.act[i]( np.array(obs[i])[None], update_eps=update_eps, **kwargs )[0] # TODO: Is this the correct way to get the correct agent obs? env_action.append(action) reset = False new_obs, rew, done, info = self.env.step( env_action ) # NOUPDATE - env.step should take a vector of actions ''' Obs: x_me, x_opp --- agent 1. In env: x_1, x_2 Obs: x_me, x_opp -- agent 2. In env: x_2, x_1 Env: (n_agents, state_dim) ''' self.num_timesteps += 1 # Stop training if return value is False # if callback.on_step() is False: # break # Store transition in the replay buffer. # Loop for replay buffer -- either separate or joined. obs[agent_index], action[agent_index], reward[agent_index] # Joey: Does this look right to you? # print(obs, action, rew, new_obs, done) #print("obs",obs[0]) #print(action) #print("ac", action[0]) #print("rew", rew[0]) #print("done", done[0]) for num_agent in range(self.num_agents): self.replay_buffer.add(obs[num_agent], env_action[num_agent], rew[num_agent], new_obs[num_agent], float(done[num_agent])) obs = new_obs # if writer is not None: # ep_rew = np.array([rew]).reshape((1, -1)) # ep_done = np.array([done]).reshape((1, -1)) # tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, # self.num_timesteps) # TODO: current episode_rewards is a list, make it a list of lists where each list is the reward for each agent in all timesteps # append the newest reward to the end of each list for each agent for num_agent in range(self.num_agents): #MA-MOD episode_rewards[-1][num_agent] += rew[num_agent] if done.any(): maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append([0.0] * self.num_agents) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # callback.on_rollout_end() for i in range(self.num_agents): # MA-MOD # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking # if writer is not None: # # run loss backprop with summary, but once every 100 steps save the metadata # # (memory, compute time, ...) 
# if (1 + self.num_timesteps) % 100 == 0: # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) # run_metadata = tf.RunMetadata() # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess, options=run_options, # run_metadata=run_metadata) # writer.add_run_metadata(run_metadata, 'step%d_agent%d' % (self.num_timesteps, i)) # else: # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess) # writer.add_summary(summary, self.num_timesteps) # else: td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: # NOUPDATE - not inside main agent for loop new_priorities = np.abs( td_errors) + self.prioritized_replay_eps # NOUPDATE assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) # callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. for i in range(self.num_agents): self.update_target[i](sess=self.sess) # MA-MOD if len(episode_rewards[-101:-1]) == 0: # MA-MOD mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) #MA-MOD # below is what's logged in terminal. num_episodes = len(episode_rewards) #MA-MOD if self.verbose >= 1 and done.any( ) and log_interval is not None and len( episode_rewards) % log_interval == 0: #MA-MOD logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() return self def predict( self, observation, agent_idx, state=None, mask=None, deterministic=True): # MA-MOD - added `agent_idx` as a parameter observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) with self.sess.as_default(): actions, _, _ = self.step_model[agent_idx].step( observation, deterministic=deterministic) if not vectorized_env: actions = actions[0] return actions, None # No one ever calls this, so we don't need it? def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): print("Should not be called") return None ''' observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) actions_proba = self.proba_step(observation, state, mask) if actions is not None: # comparing the action distribution, to given actions actions = np.array([actions]) assert isinstance(self.action_space, gym.spaces.Discrete) actions = actions.reshape((-1,)) assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations." 
actions_proba = actions_proba[np.arange(actions.shape[0]), actions] # normalize action proba shape actions_proba = actions_proba.reshape((-1, 1)) if logp: actions_proba = np.log(actions_proba) if not vectorized_env: if state is not None: raise ValueError("Error: The environment must be vectorized when using recurrent policies.") actions_proba = actions_proba[0] return actions_proba ''' def get_parameter_list(self): print(self.params) return self.params def save(self, save_path, cloudpickle=False): # params data = { "double_q": self.double_q, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs, "num_agents": self.num_agents } params_to_save = self.get_parameters() # print(params_to_save) self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
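# The multi-agent learn() above stores every agent's transition in one shared
# ReplayBuffer (the "joined" option mentioned in its in-line comment). A minimal
# sketch of the alternative it names -- one buffer per agent -- reusing the same
# ReplayBuffer class and add() signature seen above; the helper names here are
# illustrative and not part of the original code.
def make_per_agent_buffers(num_agents, buffer_size):
    # One independent replay buffer per agent.
    return [ReplayBuffer(buffer_size) for _ in range(num_agents)]

def store_transition_per_agent(buffers, obs, env_action, rew, new_obs, done):
    # Each agent's slice of the joint observation/action/reward goes into its
    # own buffer, mirroring the per-agent indexing used in learn().
    for i, buf in enumerate(buffers):
        buf.add(obs[i], env_action[i], rew[i], new_obs[i], float(done[i]))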
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for step in range(total_timesteps): if callback is not None: callback(locals(), globals()) # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(step) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(step) + self.exploration.value(step) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, step) episode_rewards[-1] += rew if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True if step > self.learning_starts and step % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(step)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) 
if (1 + step) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % step) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, step) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) if step > self.learning_starts and step % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", step) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(step))) logger.dump_tabular() return self
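# The single-agent learn() above appears to bind prioritized_replay_beta_iters only
# when self.prioritized_replay_beta_iters is None, so a user-supplied value would
# leave the local name unset before the LinearSchedule call. A minimal sketch of the
# intended setup, mirroring the multi-agent learn() that follows; it assumes the
# ReplayBuffer / PrioritizedReplayBuffer / LinearSchedule already used in this file,
# and `model` stands in for the DQN instance (illustrative helper, not original code).
def _init_replay_buffer(model, total_timesteps):
    if model.prioritized_replay:
        model.replay_buffer = PrioritizedReplayBuffer(
            model.buffer_size, alpha=model.prioritized_replay_alpha)
        if model.prioritized_replay_beta_iters is None:
            beta_iters = total_timesteps
        else:
            beta_iters = model.prioritized_replay_beta_iters
        model.beta_schedule = LinearSchedule(beta_iters,
                                             initial_p=model.prioritized_replay_beta0,
                                             final_p=1.0)
    else:
        model.replay_buffer = ReplayBuffer(model.buffer_size)
        model.beta_schedule = None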
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) # callback = self._init_callback(callback) # with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ # as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [[0.0] * self.num_agents] #MA-MOD episode_successes = [] #callback.on_training_start(locals(), globals()) #callback.on_rollout_start() reset = True obs = self.env.reset() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): env_action = [] # MA-MOD for i in range(self.num_agents ): # MA-MOD. This is fine for one policy. action = self.act[i]( np.array(obs[i])[None], update_eps=update_eps, **kwargs )[0] # TODO: Is this the correct way to get the correct agent obs? env_action.append(action) reset = False new_obs, rew, done, info = self.env.step( env_action ) # NOUPDATE - env.step should take a vector of actions ''' Obs: x_me, x_opp --- agent 1. In env: x_1, x_2 Obs: x_me, x_opp -- agent 2. In env: x_2, x_1 Env: (n_agents, state_dim) ''' self.num_timesteps += 1 # Stop training if return value is False # if callback.on_step() is False: # break # Store transition in the replay buffer. # Loop for replay buffer -- either separate or joined. obs[agent_index], action[agent_index], reward[agent_index] # Joey: Does this look right to you? 
# print(obs, action, rew, new_obs, done) #print("obs",obs[0]) #print(action) #print("ac", action[0]) #print("rew", rew[0]) #print("done", done[0]) for num_agent in range(self.num_agents): self.replay_buffer.add(obs[num_agent], env_action[num_agent], rew[num_agent], new_obs[num_agent], float(done[num_agent])) obs = new_obs # if writer is not None: # ep_rew = np.array([rew]).reshape((1, -1)) # ep_done = np.array([done]).reshape((1, -1)) # tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, # self.num_timesteps) # TODO: current episode_rewards is a list, make it a list of lists where each list is the reward for each agent in all timesteps # append the newest reward to the end of each list for each agent for num_agent in range(self.num_agents): #MA-MOD episode_rewards[-1][num_agent] += rew[num_agent] if done.any(): maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append([0.0] * self.num_agents) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # callback.on_rollout_end() for i in range(self.num_agents): # MA-MOD # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking # if writer is not None: # # run loss backprop with summary, but once every 100 steps save the metadata # # (memory, compute time, ...) # if (1 + self.num_timesteps) % 100 == 0: # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) # run_metadata = tf.RunMetadata() # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess, options=run_options, # run_metadata=run_metadata) # writer.add_run_metadata(run_metadata, 'step%d_agent%d' % (self.num_timesteps, i)) # else: # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess) # writer.add_summary(summary, self.num_timesteps) # else: td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: # NOUPDATE - not inside main agent for loop new_priorities = np.abs( td_errors) + self.prioritized_replay_eps # NOUPDATE assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) # callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
for i in range(self.num_agents): self.update_target[i](sess=self.sess) # MA-MOD if len(episode_rewards[-101:-1]) == 0: # MA-MOD mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) #MA-MOD # below is what's logged in terminal. num_episodes = len(episode_rewards) #MA-MOD if self.verbose >= 1 and done.any( ) and log_interval is not None and len( episode_rewards) % log_interval == 0: #MA-MOD logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() return self
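# The adaptive param-noise threshold used in learn() above follows Appendix C.1 of
# "Parameter Space Noise for Exploration" (Plappert et al., 2017): it is chosen so the
# KL divergence between the perturbed and non-perturbed policy matches eps-greedy
# exploration with the current eps. A minimal standalone sketch of that formula
# (illustrative helper, not part of the original code):
import numpy as np

def param_noise_threshold(eps, n_actions):
    # -log(1 - eps + eps / |A|); larger eps (more exploration) -> larger threshold.
    return -np.log(1. - eps + eps / float(n_actions))

# e.g. early in training with eps = 1.0 and 4 actions:
# param_noise_threshold(1.0, 4) == -log(0.25) ~= 1.386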
class SAC(OffPolicyRLModel):
    """
    Soft Actor-Critic (SAC)
    Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor.
    This implementation borrows code from the original implementation (https://github.com/haarnoja/sac),
    from OpenAI Spinning Up (https://github.com/openai/spinningup)
    and from the Softlearning repo (https://github.com/rail-berkeley/softlearning/).
    Paper: https://arxiv.org/abs/1801.01290
    Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html

    :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount factor
    :param learning_rate: (float or callable) learning rate for the Adam optimizer; the same learning rate
        is used for all networks (Q-Values, Actor and Value function). It can be a function
        of the current progress (from 1 to 0).
    :param buffer_size: (int) size of the replay buffer
    :param batch_size: (int) Minibatch size for each gradient update
    :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1)
    :param ent_coef: (str or float) Entropy regularization coefficient (equivalent to the inverse of the
        reward scale in the original SAC paper); it controls the exploration/exploitation trade-off.
        Set it to 'auto' to learn it automatically (and 'auto_0.1' to use 0.1 as the initial value).
    :param train_freq: (int) Update the model every `train_freq` steps.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_update_interval: (int) update the target network every `target_update_interval` steps.
    :param gradient_steps: (int) How many gradient updates to perform after each rollout step
    :param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto')
    :param action_noise: (ActionNoise) the action noise type (None by default); this can help for hard
        exploration problems. Cf. DDPG for the different action noise types.
    :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy).
        This is not needed for SAC normally but can help exploring when using HER + SAC.
        This hack was present in the original OpenAI Baselines repo (DDPG + HER).
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard.
        Note: this has no effect on SAC logging for now.
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use a random seed. Note that if you want completely deterministic results,
        you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
        If None, the number of cpus of the current machine will be used.
""" def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, learning_starts=100, train_freq=1, batch_size=64, tau=0.005, ent_coef='auto', target_update_interval=1, gradient_steps=1, target_entropy='auto', action_noise=None, random_exploration=0.0, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): super(SAC, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=SACPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.buffer_size = buffer_size self.learning_rate = learning_rate self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.tau = tau # In the original paper, same learning rate is used for all networks # self.policy_lr = learning_rate # self.qf_lr = learning_rate # self.vf_lr = learning_rate # Entropy coefficient / Entropy temperature # Inverse of the reward scale self.ent_coef = ent_coef self.target_update_interval = target_update_interval self.gradient_steps = gradient_steps self.gamma = gamma self.action_noise = action_noise self.random_exploration = random_exploration self.value_fn = None self.graph = None self.replay_buffer = None self.episode_reward = None self.sess = None self.tensorboard_log = tensorboard_log self.verbose = verbose self.params = None self.summary = None self.policy_tf = None self.target_entropy = target_entropy self.full_tensorboard_log = full_tensorboard_log self.obs_target = None self.target_policy = None self.actions_ph = None self.rewards_ph = None self.terminals_ph = None self.observations_ph = None self.action_target = None self.next_observations_ph = None self.value_target = None self.step_ops = None self.target_update_op = None self.infos_names = None self.entropy = None self.target_params = None self.learning_rate_ph = None self.processed_obs_ph = None self.processed_next_obs_ph = None self.log_ent_coef = None if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale deterministic_action = unscale_action(self.action_space, self.deterministic_action) return policy.obs_ph, self.actions_ph, deterministic_action def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with 
tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor( self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. qf1, qf2, value_fn = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) # Q(s,a) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) # Q(s, pi(a|s)) # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = -np.prod( self.env.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'): # Default initial value of ent_coef when learned init_value = 1.0 if '_' in self.ent_coef: init_value = float(self.ent_coef.split('_')[1]) assert init_value > 0., "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable( 'log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics( self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Target for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function 
approximation error. v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=get_vars('model/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) values_params = get_vars('model/values_fn') source_params = get_vars("model/values_fn/vf") target_params = get_vars("target/values_fn/vf") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize( values_losses, var_list=values_params) self.infos_names = [ 'policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy' ] # All ops to call during one training step self.step_ops = [ policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op ] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize( ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += [ 'ent_coef_loss', 'ent_coef' ] self.step_ops += [ ent_coef_op, ent_coef_loss, self.ent_coef ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if ent_coef_loss is not None: tf.summary.scalar('ent_coef_loss', ent_coef_loss) tf.summary.scalar('ent_coef', self.ent_coef) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = get_vars("model") self.target_params = get_vars("target/values_fn/vf") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all() def _train_step(self, step, writer, learning_rate): # Sample a batch from the replay buffer batch = self.replay_buffer.sample(self.batch_size) batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch feed_dict = { self.observations_ph: batch_obs, self.actions_ph: batch_actions, self.next_observations_ph: batch_next_obs, self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), self.terminals_ph: batch_dones.reshape(self.batch_size, -1), self.learning_rate_ph: learning_rate } # out = [policy_loss, qf1_loss, qf2_loss, # value_loss, qf1, qf2, value_fn, logp_pi, # self.entropy, policy_train_op, train_values_op] # Do one gradient step # and optionally compute log for tensorboard if writer is not None: out = self.sess.run([self.summary] + self.step_ops, feed_dict) summary = out.pop(0) 
writer.add_summary(summary, step) else: out = self.sess.run(self.step_ops, feed_dict) # Unpack to monitor losses and entropy policy_loss, qf1_loss, qf2_loss, value_loss, *values = out # qf1, qf2, value_fn, logp_pi, entropy, *_ = values entropy = values[4] if self.log_ent_coef is not None: ent_coef_loss, ent_coef = values[-2:] return policy_loss, qf1_loss, qf2_loss, value_loss, entropy, ent_coef_loss, ent_coef return policy_loss, qf1_loss, qf2_loss, value_loss, entropy def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv("episode reward", episode_rewards[-2]) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): if actions is not None: raise ValueError("Error: SAC does not have action probabilities.") warnings.warn( "Even though SAC has a Gaussian policy, it cannot return a distribution as it " "is squashed by a tanh before being scaled and outputed.") return None def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) actions = self.policy_tf.step(observation, 
deterministic=deterministic) actions = actions.reshape( (-1, ) + self.action_space.shape) # reshape to the correct action shape actions = unscale_action( self.action_space, actions) # scale the output for the prediction if not vectorized_env: actions = actions[0] return actions, None def get_parameter_list(self): return (self.params + self.target_params) def save(self, save_path, cloudpickle=False): data = { "learning_rate": self.learning_rate, "buffer_size": self.buffer_size, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "batch_size": self.batch_size, "tau": self.tau, "ent_coef": self.ent_coef if isinstance(self.ent_coef, float) else 'auto', "target_entropy": self.target_entropy, # Should we also store the replay buffer? # this may lead to high memory usage # with all transition inside # "replay_buffer": self.replay_buffer "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "action_noise": self.action_noise, "random_exploration": self.random_exploration, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
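# setup_model() above builds the soft ("polyak") target update
# target <- (1 - tau) * target + tau * source and the critic regression target
# q_backup = r + (1 - done) * gamma * V_target(s'). A minimal numpy sketch of both,
# with illustrative values (helper names are not part of the original code):
import numpy as np

def polyak_update(target_params, source_params, tau=0.005):
    # A small tau keeps each target variable a slowly moving average of its source.
    return [(1. - tau) * t + tau * s for t, s in zip(target_params, source_params)]

def q_backup(reward, done, value_target_next, gamma=0.99):
    # Bootstrapped critic target; the (1 - done) term removes the bootstrap at episode end.
    return reward + (1. - done) * gamma * value_target_next

# polyak_update([np.zeros(2)], [np.ones(2)])  -> [array([0.005, 0.005])]
# q_backup(1.0, 0.0, 10.0)                    -> 10.9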
def main(args): """ Train a DQN agent on cartpole env :param args: (Parsed Arguments) the input arguments """ with tf_utils.make_session(8) as sess: # Create the environment env = gym.make("CartPole-v0") # Create all the functions necessary to train the model act, train, update_target, _ = deepq.build_train( q_func=CustomPolicy, ob_space=env.observation_space, ac_space=env.action_space, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), sess=sess) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. tf_utils.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for step in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(step))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) is_solved = step > 100 and mean_100ep_reward >= 200 if args.no_render and step > args.max_timesteps: break if is_solved: if args.no_render: break # Show off the result env.render() else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if step > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( 32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if step % 1000 == 0: update_target() if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", step) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(step))) logger.dump_tabular()
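# main() above reads `args.no_render` and `args.max_timesteps`; a minimal
# argument-parsing entry point consistent with those fields. The flag names and
# defaults below are assumptions, since the original parser is not shown here.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description="Train a DQN agent on the CartPole-v0 env")
    parser.add_argument('--no-render', action='store_true', default=False,
                        help="Disable rendering once the task is solved")
    parser.add_argument('--max-timesteps', type=int, default=10 ** 5,
                        help="Maximum number of timesteps when rendering is disabled")
    main(parser.parse_args())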