def __init__(self):
    self.hyperparams = {
        "n_steps": 1024,
        "nminibatches": 32,
        "cliprange": 0.4,
        "gamma": 0.996,
        "lam": 0.95,
        "learning_rate": LinearSchedule(1.0, initial_p=0.0002, final_p=0.001).value,
        "noptepochs": 4,
        "ent_coef": 0.002,
    }
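# Hedged usage sketch (not part of the original source): the dict above appears to hold
# PPO2 keyword arguments, so it can be unpacked straight into the PPO2 constructor.
# `build_ppo2_from_hyperparams` and the `env` argument are illustrative names, not from
# the original code.
def build_ppo2_from_hyperparams(hyperparams, env):
    from stable_baselines import PPO2  # assumes stable-baselines 2.x is installed
    # PPO2 accepts a callable learning_rate, so the LinearSchedule .value above works here.
    return PPO2('MlpPolicy', env, verbose=1, **hyperparams)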
def train(algorithm='dqn', timesteps=2e5):
    # env = gym.make('LunarLander-v2')  # This uses the library version of the Lunar Lander env.
    print('algorithm: ', algorithm)
    print('timesteps: ', timesteps)
    learning_rate = 0.001

    if algorithm.lower() == 'dqn':
        env = LunarLander()
        model = DQN('MlpPolicy', env, learning_rate=learning_rate,
                    prioritized_replay=True, verbose=1)
    elif algorithm.lower() == 'ppo2':
        n_envs = 4
        env = SubprocVecEnv([lambda: LunarLander() for i in range(n_envs)])
        schedule = LinearSchedule(int(float(timesteps)), 0.00001, 0.1).value
        model = PPO2('MlpPolicy', env, learning_rate=schedule, verbose=1)
    else:
        raise RuntimeError("Unknown algorithm: %s" % algorithm)

    # mean_reward, std_reward = evaluate_policy(
    #     model, model.get_env(), n_eval_episodes=10)

    # Train the agent
    model.learn(total_timesteps=int(float(timesteps)), log_interval=10)

    # Save the agent
    model.save("trained_models/latest")
    now = datetime.now()
    dt_string = now.strftime("%Y-%m-%d_%H-%M-%S")
    model.save("trained_models/lunar_climber_%s-%s" % (algorithm.lower(), dt_string))

    # Plot training progress
    # plt.plot(env.all_rewards)
    # plt.ylabel('Reward')
    # plt.xlabel('Timesteps')
    # plt.savefig('figures/stats-%s.png' % dt_string)
    print("Model trained!")
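# Hedged usage sketch (not in the original source): a minimal command-line entry point for
# the train() function above. The argument names mirror train()'s parameters; the argparse
# wiring itself is an assumption, not part of the original script.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--algorithm', default='dqn', choices=['dqn', 'ppo2'])
    parser.add_argument('--timesteps', type=float, default=2e5)
    args = parser.parse_args()
    train(algorithm=args.algorithm, timesteps=args.timesteps)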
class DQN(OffPolicyRLModel):
    """
    The DQN model class. DQN paper: https://arxiv.org/pdf/1312.5602.pdf

    :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) discount factor
    :param learning_rate: (float) learning rate for adam optimizer
    :param buffer_size: (int) size of the replay buffer
    :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is annealed
    :param exploration_final_eps: (float) final value of random action probability
    :param train_freq: (int) update the model every `train_freq` steps
    :param batch_size: (int) size of a batch sampled from the replay buffer for training
    :param checkpoint_freq: (int) how often to save the model. This is so that the best version is restored at the
        end of the training. If you do not wish to restore the best version at the end of the training,
        set this variable to None.
    :param checkpoint_path: (str) replacement path used if you need to log to somewhere else than a temporary
        directory.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps.
    :param prioritized_replay: (bool) if True prioritized replay buffer will be used.
    :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer
    :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
    :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from its
        initial value to 1.0. If set to None, it defaults to max_timesteps.
    :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
    :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy.
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, verbose=0, tensorboard_log=None, _init_setup_model=True): # TODO: replay_buffer refactoring super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, requires_vec_env=False) self.checkpoint_path = checkpoint_path self.param_noise = param_noise self.learning_starts = learning_starts self.train_freq = train_freq self.prioritized_replay = prioritized_replay self.prioritized_replay_eps = prioritized_replay_eps self.batch_size = batch_size self.target_network_update_freq = target_network_update_freq self.checkpoint_freq = checkpoint_freq self.prioritized_replay_alpha = prioritized_replay_alpha self.prioritized_replay_beta0 = prioritized_replay_beta0 self.prioritized_replay_beta_iters = prioritized_replay_beta_iters self.exploration_final_eps = exploration_final_eps self.exploration_fraction = exploration_fraction self.buffer_size = buffer_size self.learning_rate = learning_rate self.gamma = gamma self.tensorboard_log = tensorboard_log self.graph = None self.sess = None self._train_step = None self.step_model = None self.update_target = None self.act = None self.proba_step = None self.replay_buffer = None self.beta_schedule = None self.exploration = None self.params = None self.summary = None self.episode_reward = None if _init_setup_model: self.setup_model() def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate) self.act, self._train_step, self.update_target, self.step_model = deepq.build_train( q_func=self.policy, ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess) self.proba_step = self.step_model.proba_step self.params = find_trainable_variables("deepq") # Initialize the parameters and copy them to the target network. 
tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all() def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for step in range(total_timesteps): if callback is not None: callback(locals(), globals()) # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(step) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(step) + self.exploration.value(step) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, step) episode_rewards[-1] += rew if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True if step > self.learning_starts and step % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(step)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) 
if (1 + step) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % step) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, step) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) if step > self.learning_starts and step % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", step) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(step))) logger.dump_tabular() return self def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) with self.sess.as_default(): actions, _, _ = self.step_model.step(observation, deterministic=deterministic) if not vectorized_env: actions = actions[0] return actions, None def action_probability(self, observation, state=None, mask=None): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) actions_proba = self.proba_step(observation, state, mask) if not vectorized_env: if state is not None: raise ValueError( "Error: The environment must be vectorized when using recurrent policies." 
) actions_proba = actions_proba[0] return actions_proba def save(self, save_path): # params data = { "checkpoint_path": self.checkpoint_path, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "checkpoint_freq": self.checkpoint_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "_vectorize_action": self._vectorize_action } params = self.sess.run(self.params) self._save_to_file(save_path, data=data, params=params) @classmethod def load(cls, load_path, env=None, **kwargs): data, params = cls._load_from_file(load_path) model = cls(policy=data["policy"], env=env, _init_setup_model=False) model.__dict__.update(data) model.__dict__.update(kwargs) model.set_env(env) model.setup_model() restores = [] for param, loaded_p in zip(model.params, params): restores.append(param.assign(loaded_p)) model.sess.run(restores) return model
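# Hedged usage sketch (not in the original source): a save/load round trip with the DQN
# class above. The environment, timestep budget, and file path are placeholder choices.
def save_and_reload_example(env, path="trained_models/dqn_checkpoint"):
    model = DQN('MlpPolicy', env, prioritized_replay=True, verbose=0)
    model.learn(total_timesteps=10000)
    model.save(path)                    # serializes hyperparameters plus TF variables
    restored = DQN.load(path, env=env)  # rebuilds the graph, then restores the weights
    return restored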
class DQN(OffPolicyRLModel): """ The DQN model class. DQN paper: https://arxiv.org/abs/1312.5602 Dueling DQN: https://arxiv.org/abs/1511.06581 Double-Q Learning: https://arxiv.org/abs/1509.06461 Prioritized Experience Replay: https://arxiv.org/abs/1511.05952 :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) discount factor :param learning_rate: (float) learning rate for adam optimizer :param buffer_size: (int) size of the replay buffer :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is annealed :param exploration_final_eps: (float) final value of random action probability :param exploration_initial_eps: (float) initial value of random action probability :param train_freq: (int) update the model every `train_freq` steps. set to None to disable printing :param batch_size: (int) size of a batched sampled from replay buffer for training :param double_q: (bool) Whether to enable Double-Q learning or not. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. :param prioritized_replay: (bool) if True prioritized replay buffer will be used. :param prioritized_replay_alpha: (float)alpha parameter for prioritized replay buffer. It determines how much prioritization is used, with alpha=0 corresponding to the uniform case. :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). If None (default), use random seed. Note that if you want completely deterministic results, you must set `n_cpu_tf_sess` to 1. :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations If None, the number of cpu of the current machine will be used. 
""" def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, exploration_initial_eps=1.0, train_freq=1, batch_size=32, double_q=True, learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, n_cpu_tf_sess=None, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, expert_exp=None): # TODO: replay_buffer refactoring super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.param_noise = param_noise self.learning_starts = learning_starts self.train_freq = train_freq self.prioritized_replay = prioritized_replay self.prioritized_replay_eps = prioritized_replay_eps self.batch_size = batch_size self.target_network_update_freq = target_network_update_freq self.prioritized_replay_alpha = prioritized_replay_alpha self.prioritized_replay_beta0 = prioritized_replay_beta0 self.prioritized_replay_beta_iters = prioritized_replay_beta_iters self.exploration_final_eps = exploration_final_eps self.exploration_initial_eps = exploration_initial_eps self.exploration_fraction = exploration_fraction self.buffer_size = buffer_size self.learning_rate = learning_rate self.gamma = gamma self.tensorboard_log = tensorboard_log self.full_tensorboard_log = full_tensorboard_log self.double_q = double_q self.expert_exp = expert_exp self.expert_ix = 0 self.graph = None self.sess = None self._train_step = None self.step_model = None self.update_target = None self.act = None self.proba_step = None self.replay_buffer = None self.beta_schedule = None self.exploration = None self.params = None self.summary = None if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.step_model return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) self.act, self._train_step, self.update_target, self.step_model = build_train( q_func=partial(self.policy, **self.policy_kwargs), ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess, full_tensorboard_log=self.full_tensorboard_log, double_q=self.double_q ) self.proba_step = self.step_model.proba_step self.params = tf_util.get_trainable_vars("deepq") # Initialize the parameters and copy them to the target network. 
tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all() def add_expert_exp(self): # doesn't work with vec_normalized environments obs, action, reward, new_obs, done = self.expert_exp[self.expert_ix] self.replay_buffer.add(obs, action, reward, new_obs, float(done)) self.expert_ix = (self.expert_ix - 1) % len(self.expert_exp) def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() reward_ = self._vec_normalize_env.get_original_reward().squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. 
self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done)) if self.expert_exp is not None: self.add_expert_exp() obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(self.num_timesteps), env=self._vec_normalize_env) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size, env=self._vec_normalize_env) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs(td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities(batch_idxes, new_priorities) callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() callback.on_training_end() return self def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) with self.sess.as_default(): actions, _, _ = self.step_model.step(observation, deterministic=deterministic) if not vectorized_env: actions = actions[0] return actions, None def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) actions_proba = self.proba_step(observation, state, mask) if actions is not None: # comparing the action distribution, to given actions actions = np.array([actions]) assert isinstance(self.action_space, gym.spaces.Discrete) actions = actions.reshape((-1,)) assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations." actions_proba = actions_proba[np.arange(actions.shape[0]), actions] # normalize action proba shape actions_proba = actions_proba.reshape((-1, 1)) if logp: actions_proba = np.log(actions_proba) if not vectorized_env: if state is not None: raise ValueError("Error: The environment must be vectorized when using recurrent policies.") actions_proba = actions_proba[0] return actions_proba def get_parameter_list(self): return self.params def save(self, save_path, cloudpickle=False): # params data = { "double_q": self.double_q, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
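# Hedged sketch (not in the original source): one way to collect the `expert_exp`
# transitions that add_expert_exp() above replays into the buffer, i.e. a list of
# (obs, action, reward, new_obs, done) tuples. `expert_policy` is a hypothetical callable
# mapping an observation to an action; the class docstring notes this does not work with
# vec-normalized environments.
def collect_expert_transitions(env, expert_policy, n_steps=1000):
    transitions = []
    obs = env.reset()
    for _ in range(n_steps):
        action = expert_policy(obs)
        new_obs, reward, done, _ = env.step(action)
        transitions.append((obs, action, reward, new_obs, done))
        obs = env.reset() if done else new_obs
    return transitions  # pass as DQN(..., expert_exp=transitions)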
class DQN(OffPolicyRLModel): """ The DQN model class. DQN paper: https://arxiv.org/abs/1312.5602 Dueling DQN: https://arxiv.org/abs/1511.06581 Double-Q Learning: https://arxiv.org/abs/1509.06461 Prioritized Experience Replay: https://arxiv.org/abs/1511.05952 :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) discount factor :param learning_rate: (float) learning rate for adam optimizer :param buffer_size: (int) size of the replay buffer :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is annealed :param exploration_final_eps: (float) final value of random action probability :param train_freq: (int) update the model every `train_freq` steps. set to None to disable printing :param batch_size: (int) size of a batched sampled from replay buffer for training :param double_q: (bool) Whether to enable Double-Q learning or not. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. :param prioritized_replay: (bool) if True prioritized replay buffer will be used. :param prioritized_replay_alpha: (float)alpha parameter for prioritized replay buffer. It determines how much prioritization is used, with alpha=0 corresponding to the uniform case. :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). If None (default), use random seed. Note that if you want completely deterministic results, you must set `n_cpu_tf_sess` to 1. :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations If None, the number of cpu of the current machine will be used. 
""" def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, double_q=True, learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, n_cpu_tf_sess=None, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, k=4, temp_size=15): # TODO: replay_buffer refactoring super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.param_noise = param_noise self.learning_starts = learning_starts self.train_freq = train_freq self.prioritized_replay = prioritized_replay self.prioritized_replay_eps = prioritized_replay_eps self.batch_size = batch_size self.target_network_update_freq = target_network_update_freq self.prioritized_replay_alpha = prioritized_replay_alpha self.prioritized_replay_beta0 = prioritized_replay_beta0 self.prioritized_replay_beta_iters = prioritized_replay_beta_iters self.exploration_final_eps = exploration_final_eps self.exploration_fraction = exploration_fraction self.buffer_size = buffer_size self.learning_rate = learning_rate self.gamma = gamma self.tensorboard_log = tensorboard_log self.full_tensorboard_log = full_tensorboard_log self.double_q = double_q self.graph = None self.sess = None self._train_step = None self.step_model = None self.update_target = None self.act = None self.proba_step = None self.replay_buffer = None self.beta_schedule = None self.exploration = None self.params = None self.summary = None self.episode_reward = None self.observation_space = feature_utils.get_observertion_space() self.action_space = feature_utils.get_action_space() self.temp_buffer = [] self.temp_size = temp_size self.k = k if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.step_model return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) self.act, self._train_step, self.update_target, self.step_model = deepq.build_train( q_func=partial(self.policy, **self.policy_kwargs), ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess, full_tensorboard_log=self.full_tensorboard_log, double_q=self.double_q ) self.proba_step = self.step_model.proba_step self.params = tf_util.get_trainable_vars("deepq") # Initialize the parameters and copy them to the target network. 
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()

    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN",
              reset_num_timesteps=True, replay_wrapper=None, save_interval=None, save_path=None):
        print('----------------------------------------------')
        print('| L E A R N |')
        print('----------------------------------------------')
        print("num timesteps = " + str(int(total_timesteps / 1000)) + 'k')
        print("save_interval = " + str(int(save_interval / 1000)) + 'k')
        print()

        k = 10
        save_interval_st = save_interval
        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                    initial_p=self.prioritized_replay_beta0,
                                                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)  # upgraded (wrapped) buffer

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                              initial_p=1.0,
                                              final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_win_rates = [0.0]
            episode_successes = []
            obs, obs_nf = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1,))
            self.win_rate = np.zeros((1,))
            # print(obs_nf)

            # Exploration uses the action-pruning filter below.
            prev2s = [None, None]

            def input_formate(obs):
                return obs.transpose((1, 2, 0))

            for _ in tqdm(range(total_timesteps)):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                # tf.summary.scalar('update_eps', update_eps)
                with self.sess.as_default():
                    # Never explore through act() itself (originally update_eps=update_eps);
                    # exploration is handled by the filtered random action below.
                    action = self.act(np.array(input_formate(obs))[None], update_eps=-1, **kwargs)[0]

                filter_action = random.randint(0, 5)
                if type(obs_nf) == tuple:
                    obs_nf = obs_nf[0]
                filter_action = feature_utils.get_modify_act(obs_nf, filter_action, prev2s, nokick=True)
                filter_action = feature_utils.get_act_abs(obs_nf, filter_action, rang=8)

                # Estimate the distribution of filtered actions from 100 random samples.
                fil_acts = []
                for _ in range(100):
                    rand_act = random.randint(0, 5)
                    fil_act = feature_utils.get_modify_act(obs_nf, rand_act, prev2s, nokick=True)
                    fil_act = feature_utils.get_act_abs(obs_nf, fil_act, rang=8)
                    fil_acts.append(fil_act)
                # print('fil', fil_acts)
                fil_acts = np.eye(65)[fil_acts]
                # print('eye', fil_acts)
                fil_acts = fil_acts.sum(axis=0)
                # print('sum', fil_acts)

                if random.random() < update_eps:
                    action = filter_action
                env_action = action
                reset = False
                new_obs, rew, done, info, new_obs_nf = self.env.step(env_action)  # .ntc
                self.replay_buffer.add(input_formate(obs), action, rew, input_formate(new_obs), float(done), fil_acts)

                # HER-style hindsight relabeling
                self.temp_buffer.append((obs, action, rew, new_obs, float(done), fil_acts))
                if len(self.temp_buffer) >= self.temp_size:
                    for t in range(self.temp_size):
                        s, a, r, s_n, d, fa = self.temp_buffer[t]
                        for k in range(self.k):
                            _s = copy.deepcopy(s)
                            _a = a
                            _r = copy.deepcopy(r)
                            _s_n = copy.deepcopy(s_n)
                            future = np.random.randint(t, self.temp_size)
                            s_f, _a_f, _, _, _, _ = self.temp_buffer[future]
                            g_map = s_f[-2]
                            _s[-1] = g_map
                            # print(_s_n[-2][goal])
                            # Check whether _s reaches the goal via action a
                            if (_s_n[-2] == g_map).all() or (
                                    (_s[-2] == _s[-1]).all() and _a_f == a == 64):
                                # if (_s[-2]) or g == 64:  # whether the agent stayed in place
                                # print('HER')
                                _r = _r + 0.01
                            self.replay_buffer.add(input_formate(_s), a, _r, input_formate(_s_n), d, fa)
                    self.temp_buffer.clear()

                obs = new_obs
                obs_nf = new_obs_nf

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_win = np.array([info]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done,
                                                                      writer, self.num_timesteps)
                    self.win_rate = total_rate_logger(self.win_rate, ep_win, ep_done,
                                                      writer, self.num_timesteps, name='win_rate')

                episode_rewards[-1] += rew
                episode_win_rates[-1] += info
                if done:
                    maybe_is_success = (rew > 0)  # info.get('is_success')  # .ntc
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs, obs_nf = self.env.reset()
                    episode_rewards.append(0.0)
                    episode_win_rates.append(0.0)
                    reset = True
                    prev2s = [None, None]

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    # print('Sampling ... ...', self.num_timesteps)
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
if self.prioritized_replay: experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones, filter_actions = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # print(rewards.shape) # print(dones.shape) # print(actions.shape) if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) # print("fils", filter_actions) # print("acts", actions) # print(' Training ... ...') if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors, kl_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, filter_actions, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors, kl_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, filter_actions, sess=self.sess) # print('er', pr[0]) # print('kl', pr[1]) # print('x', pr[2]) # print('y', pr[3]) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs(td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities(batch_idxes, new_priorities) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) if len(episode_win_rates[-101:-1]) == 0: mean_100ep_win_rate = -np.inf else: mean_100ep_win_rate = round(float(np.mean(episode_win_rates[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("mean 100 win rate", mean_100ep_win_rate) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() # save interval if self.num_timesteps >= save_interval_st: save_interval_st += save_interval s_path = save_path + '_' + str(int(self.num_timesteps / 1000)) + 'k.zip' self.save(save_path=s_path) self.num_timesteps += 1 return self def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) with self.sess.as_default(): actions, _, _ = self.step_model.step(observation, deterministic=deterministic) if not vectorized_env: actions = actions[0] return actions, None def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) actions_proba = self.proba_step(observation, state, mask) if actions is not None: # comparing the action distribution, to given actions actions = np.array([actions]) assert isinstance(self.action_space, gym.spaces.Discrete) actions = actions.reshape((-1,)) assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations." 
actions_proba = actions_proba[np.arange(actions.shape[0]), actions] # normalize action proba shape actions_proba = actions_proba.reshape((-1, 1)) if logp: actions_proba = np.log(actions_proba) if not vectorized_env: if state is not None: raise ValueError("Error: The environment must be vectorized when using recurrent policies.") actions_proba = actions_proba[0] return actions_proba def get_parameter_list(self): return self.params def save(self, save_path, cloudpickle=False): # params data = { "double_q": self.double_q, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } params_to_save = self.get_parameters() print('----------------------------------------------') print('| S A V E |') print('----------------------------------------------') print('load_path =', save_path) print("len_parm = ", len(params_to_save)) print() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle) @classmethod def load(cls, load_path, env=None, custom_objects=None, **kwargs): print() print("**************** LOAD ****************************************************************") data, params = cls._load_from_file(load_path, custom_objects=custom_objects) if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data['policy_kwargs']: raise ValueError("The specified policy kwargs do not equal the stored policy kwargs. " "Stored kwargs: {}, specified kwargs: {}".format(data['policy_kwargs'], kwargs['policy_kwargs'])) model = cls(policy=data["policy"], env=None, _init_setup_model=False) model.__dict__.update(data) model.__dict__.update(kwargs) model.set_env(env) model.setup_model() model.load_parameters(params) return model
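# Hedged sketch (not in the original source): the hindsight-relabeling step from the
# learn() method above, isolated as a standalone helper. It assumes each stored transition
# is (s, a, r, s_next, done, filter_actions) and that, as in learn(), s[-2] is the agent
# position plane and s[-1] the goal plane. Simplified: the stay-in-place special case
# (action 64) used in the original condition is dropped here.
import copy
import numpy as np

def relabel_with_future_goals(temp_buffer, k, reward_bonus=0.01):
    relabeled = []
    for t, (s, a, r, s_next, done, fa) in enumerate(temp_buffer):
        for _ in range(k):
            future = np.random.randint(t, len(temp_buffer))
            goal_map = temp_buffer[future][0][-2]   # position actually reached later on
            s_new = copy.deepcopy(s)
            s_new[-1] = goal_map                    # rewrite the goal plane in hindsight
            r_new = r + reward_bonus if (s_next[-2] == goal_map).all() else r
            relabeled.append((s_new, a, r_new, s_next, done, fa))
    return relabeled  # each entry is then added to the replay buffer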
class DQN(OffPolicyRLModel): """ The DQN model class. DQN paper: https://arxiv.org/pdf/1312.5602.pdf :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) discount factor :param learning_rate: (float) learning rate for adam optimizer :param buffer_size: (int) size of the replay buffer :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is annealed :param exploration_final_eps: (float) final value of random action probability :param train_freq: (int) update the model every `train_freq` steps. set to None to disable printing :param batch_size: (int) size of a batched sampled from replay buffer for training :param checkpoint_freq: (int) how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. :param checkpoint_path: (str) replacement path used if you need to log to somewhere else than a temporary directory. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance WARNING: this logging can take a lot of space quickly """ def __init__(self, policy, env, test_env=None, gamma=0.99, kappa=1.0, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, target_network_update_freq=500, phi_grad_update_freq=1, seed=0, eval_episodes=5, param_noise=False, verbose=0, policy_phi=None, policy_phi_kwargs=None, _init_setup_model=True, policy_kwargs=None): # TODO: replay_buffer refactoring super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, policy_phi=policy_phi, policy_phi_kwargs=policy_phi_kwargs) self.checkpoint_path = checkpoint_path self.param_noise = param_noise self.learning_starts = learning_starts self.train_freq = train_freq self.batch_size = batch_size self.target_network_update_freq = target_network_update_freq self.checkpoint_freq = checkpoint_freq self.exploration_final_eps = exploration_final_eps self.exploration_fraction = exploration_fraction self.buffer_size = buffer_size self.learning_rate = learning_rate self.gamma = gamma self.kappa = kappa self.seed = seed self.graph = None self.sess = None self._train_step = None self.step_model = None self.update_target = None self.act = None self.proba_step = None self.replay_buffer = None self.beta_schedule = None self.exploration = None self.params = None self.summary = None self.episode_reward = None self.test_env= test_env self.eval_episodes = eval_episodes self.phi_grad_update_freq = phi_grad_update_freq if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.step_model return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values def setup_model(self): with 
SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) self._setup_learn(self.seed) optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) #optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate, momentum=0.95, epsilon=0.01) self.act, self._train_step, self.update_target, self._train_phi_step, self.step_model, _ = deepq_kpi.build_train( q_func=partial(self.policy, **self.policy_kwargs), ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, kappa=self.kappa, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess ) self.proba_step = self.step_model.proba_step self.params = tf_util.get_trainable_vars("deepq") @contextmanager def timed(msg): if self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print(colorize("done in {:.3f} seconds".format((time.time() - start_time)), color='magenta')) else: yield # Initialize the parameters and copy them to the target network. tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.timed = timed self.summary = tf.summary.merge_all() def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) print("args are", self.kappa, self.phi_grad_update_freq, self.seed, np.random.randint(100)) with SetVerbosity(self.verbose): # Create the replay buffer self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) #self.exploration = PiecewiseSchedule([(0, 1.0), (int(1e6), 0.1), (int(1e7), 0.01)], outside_value=0.01) episode_rewards = [0.0] episode_successes = [] #td_errors_mean = [] #td_phi_errors_mean = [] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1,)) for _ in range(total_timesteps): #if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. # if callback(locals(), globals()) is False: # break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. 
- self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # Use v_phi as zero until buffere is filled if self.num_timesteps <= self.buffer_size: weights = np.zeros_like(rewards) with self.sess.as_default(): #actions_policy = self.act(obses_t) actions_policy_phi = self.act(obses_tp1) _, td_errors = self._train_step(obses_t, actions, actions_policy_phi, actions_policy_phi, rewards, obses_tp1, obses_tp1, obses_t, obses_tp1, obses_tp1, dones, weights, sess=self.sess) #td_errors_mean.append(np.mean(td_errors)) if can_sample and self.kappa != 1.0 and self.num_timesteps >= self.buffer_size and \ self.num_timesteps % (self.phi_grad_update_freq * self.train_freq) == 0: #print("updating vf phi now", self.num_timesteps) #td_phi_err = [] for i in range(self.phi_grad_update_freq): #int(self.phi_grad_update_freq / self.train_freq)): obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size) weights, batch_idxes = np.ones_like(rewards), None with self.sess.as_default(): #actions_policy = self.act(obses_t) actions_policy_phi = self.act(obses_tp1) _, td_phi_errors = self._train_phi_step(obses_t, actions, actions_policy_phi, actions_policy_phi, rewards, obses_tp1, obses_tp1, obses_t, obses_tp1, obses_tp1, dones, weights, sess=self.sess) #_, q_values_st = self.q_value_st(obses_t, actions, actions_policy_phi, actions_policy_phi, rewards, obses_tp1, obses_tp1, obses_t, obses_tp1, obses_tp1, dones, weights, # sess=self.sess) #td_phi_err.append(np.mean(td_phi_errors)) #print("td errors after phi update", np.mean(td_phi_err)) #print("q vals", np.mean(q_values_st)) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: with self.timed("eval time"): if self.test_env is not None and len(episode_rewards) % (10 * log_interval) == 0: eval_return, actual_return = self.evaluate_agent(self.test_env) else: eval_return, actual_return = None, None logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("eval return", eval_return) logger.record_tabular("actual return", actual_return) #logger.record_tabular("td errors", np.mean(td_errors_mean)) #logger.record_tabular("td errors phi", np.mean(td_phi_errors_mean)) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() #td_errors_mean = [] #td_phi_errors_mean = [] if self.checkpoint_path is not None and self.num_timesteps % self.checkpoint_freq == 0: self.save(self.checkpoint_path) self.num_timesteps += 1 return self def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) with self.sess.as_default(): actions, _, _ = self.step_model.step(observation, deterministic=deterministic) if not vectorized_env: actions = actions[0] return actions, None def evaluate_agent(self, test_env): test_reward = [] test_actual_reward = [] for _ in range(1): #10): episode_actual_rew = 0 for _ in range(self.eval_episodes): obs_eval, done_eval = test_env.reset(), False episode_rew = 0 while not done_eval: action_eval, _ = self.predict(obs_eval) obs_eval, rew_eval, done_eval, _ = test_env.step(action_eval) episode_rew += rew_eval episode_actual_rew += test_env.get_actual_reward() test_reward.append(episode_rew) test_actual_reward.append(episode_actual_rew) obs_eval = test_env.reset() # random test #observation = np.array(obs_eval) #observation = observation.reshape((-1,) + self.observation_space.shape) #with self.sess.as_default(): # actions, _, _ = self.step_model.step(observation, deterministic=True) # print("action is", actions) # actions = self.act(observation, stochastic=False) # print("new action is", actions) return np.mean(test_reward), np.mean(test_actual_reward) def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) actions_proba = self.proba_step(observation, state, mask) if actions is not None: # comparing the action distribution, to given actions actions = np.array([actions]) assert isinstance(self.action_space, gym.spaces.Discrete) actions = actions.reshape((-1,)) assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations." 
actions_proba = actions_proba[np.arange(actions.shape[0]), actions] # normalize action proba shape actions_proba = actions_proba.reshape((-1, 1)) if logp: actions_proba = np.log(actions_proba) if not vectorized_env: if state is not None: raise ValueError("Error: The environment must be vectorized when using recurrent policies.") actions_proba = actions_proba[0] return actions_proba def get_parameter_list(self): return self.params def save(self, save_path): # params data = { "checkpoint_path": self.checkpoint_path, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "checkpoint_freq": self.checkpoint_freq, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs, "eval_episodes": self.eval_episodes, "phi_grad_update_freq": self.phi_grad_update_freq } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save)
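# --- Editor's illustration (not part of the original source) ----------------
# The action_probability() above picks the probability of each taken action out
# of the (batch, n_actions) array returned by proba_step via numpy fancy
# indexing. A minimal, self-contained sketch of that indexing step:
import numpy as np

probs = np.array([[0.7, 0.2, 0.1],      # stand-in for proba_step() output,
                  [0.1, 0.1, 0.8]])     # batch of 2 observations, 3 actions
taken = np.array([0, 2])                # the actions that were actually taken
selected = probs[np.arange(taken.shape[0]), taken]   # -> array([0.7, 0.8])
selected = selected.reshape((-1, 1))    # "normalize action proba shape"
log_selected = np.log(selected)         # what the logp=True branch would return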
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, initial_p=1.0): self.actions_weights = [] self.actions_container = [] new_tb_log = self._init_num_timesteps(reset_num_timesteps) cnt = 0 ds_rewards = [[0, 0]] with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=initial_p, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True ''' Hierarchical Step (Start) ''' obs, new_obs, rew, action, done, reset = self.hierarchical_step( obs, ds_rewards, cnt, kwargs, update_eps) ''' Hierarchical Step (End) ''' if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) 
if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: try: new_priorities = np.array([ abs(x) for x in td_errors.tolist() ]) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) except AssertionError: print(td_errors) if self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 return self, ds_rewards
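# --- Editor's note (added, not from the original source) --------------------
# The learn() variants above anneal epsilon with stable-baselines'
# LinearSchedule: the value falls linearly from initial_p to final_p over
# `exploration_fraction * total_timesteps` steps and then stays at final_p.
# A tiny stand-in with the same semantics, handy for sanity-checking a
# schedule before a long run:
def linear_schedule_value(step, schedule_timesteps, initial_p=1.0, final_p=0.02):
    fraction = min(float(step) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# With total_timesteps=100_000 and exploration_fraction=0.1 (10_000 annealing steps):
# step 0 -> 1.0, step 5_000 -> 0.51, step 10_000 -> 0.02, step 50_000 -> 0.02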
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] obs = self.env.reset() reset = True ############################################################ # MODIFICATION: # Track list of actions taken each episode. This is # intentionally not a set so that we can use np.isin. action_list = list() ############################################################ for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): #################################################### # MODIFICATION: # Rename variable from original, since it's now # going to come back as an array due to the # modified build_act function being used to # construct everything. action_arr = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] #################################################### # ORIGINAL: # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] ######################################################## # MODIFICATION: # Get the best action that has not yet been taken this # episode. action = \ action_arr[np.argmin(np.isin(action_arr, action_list))] # Add this action to the list. 
action_list.append(action) ######################################################## env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: #################################################### # MODIFICATION: # Clear the list. action_list.clear() #################################################### maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 return self
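# --- Editor's illustration of the modification above (added) ----------------
# The modified build_act is assumed to return every action ranked best-first;
# np.isin marks which of those actions were already used this episode and
# np.argmin picks the first unmarked (False) entry, i.e. the highest-ranked
# action that has not been taken yet. If every action has already been taken,
# argmin falls back to index 0, the overall best action.
import numpy as np

action_arr = np.array([3, 1, 4, 0, 2])        # hypothetical ranking, best first
action_list = [3, 1]                          # actions taken so far this episode
mask = np.isin(action_arr, action_list)       # [ True  True False False False]
action = action_arr[np.argmin(mask)]          # -> 4, best action not yet taken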
class CustomDQN(DQN): """ Custom version of DQN (DQN). It is adapted from the stable-baselines version. Notable changes: - save replay buffer and restore it while loading """ def __init__(self, save_replay_buffer: bool = True, **kwargs): super(CustomDQN, self).__init__(**kwargs) self.save_replay_buffer = save_replay_buffer def learn( self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None, ): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: if self.replay_buffer and len(self.replay_buffer) > 0: # TODO: maybe substitute with a prioritized buffer to give preference to the transitions added # during continual learning pass else: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0, ) else: if self.replay_buffer and len(self.replay_buffer) > 0: # TODO: maybe substitute with a prioritized buffer to give preference to the transitions added # during continual learning pass else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps, ) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0.0 else: update_eps = 0.0 # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. 
update_param_noise_threshold = -np.log( 1.0 - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs["reset"] = reset kwargs[ "update_param_noise_threshold"] = update_param_noise_threshold kwargs["update_param_noise_scale"] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done)) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get("is_success") if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert ( self.beta_schedule is not None ), "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps), env=self._vec_normalize_env, ) ( obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes, ) = experience else: ( obses_t, actions, rewards, obses_tp1, dones, ) = self.replay_buffer.sample( self.batch_size, env=self._vec_normalize_env) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) 
if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata, ) writer.add_run_metadata( run_metadata, "step%d" % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, ) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, ) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps assert isinstance( self.replay_buffer, PrioritizedReplayBuffer ), "replay_buffer should be an instance of PrioritizedReplayBuffer: {}".format( type(self.replay_buffer)) self.replay_buffer.update_priorities( batch_idxes, new_priorities) callback.on_rollout_start() if (can_sample and self.num_timesteps > self.learning_starts and self.num_timesteps % self.target_network_update_freq == 0): # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps)), ) logger.dump_tabular() callback.on_training_end() return self def save(self, save_path, cloudpickle=False): if self.save_replay_buffer: data = { "double_q": self.double_q, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "replay_buffer": self.replay_buffer, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs, } else: data = { "double_q": self.double_q, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": 
self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs, } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
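# --- Editor's usage sketch (added; paths and call sites are hypothetical) ----
# Because save() above embeds the replay buffer in `data` when
# save_replay_buffer=True, the stock stable-baselines load path
# (data, params = cls._load_from_file(...); model.__dict__.update(data))
# restores the buffer together with the hyperparameters, so continual
# learning can resume without refilling it from scratch:
#
#   model = CustomDQN(policy='MlpPolicy', env=env, save_replay_buffer=True)
#   model.learn(total_timesteps=50_000)
#   model.save('checkpoints/dqn_with_buffer')
#   resumed = CustomDQN.load('checkpoints/dqn_with_buffer', env=env)
#   resumed.learn(total_timesteps=50_000, reset_num_timesteps=False)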
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="BDQ", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) if self.epsilon_greedy: approximate_num_iters = 2e6 / 4 # TODO Decide which schedule type to use # self.exploration = PiecewiseSchedule([(0, 1.0), # (approximate_num_iters / 50, 0.1), # (approximate_num_iters / 5, 0.01) # ], outside_value=0.01) self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) else: self.exploration = ConstantSchedule(value=0.0) # greedy policy std_schedule = LinearSchedule(schedule_timesteps=self.timesteps_std, initial_p=self.initial_std, final_p=self.final_std) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() obs = self.env.reset() reset = True self.episode_reward = np.zeros((1,)) # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. 
- self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] # print("time step {} and update eps {}".format(self.num_timesteps, update_eps)) action_idxes = np.array(self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)) #update_eps=exploration.value(t))) action = action_idxes / self.num_action_grains * self.actions_range + self.low if not self.epsilon_greedy: # Gaussian noise actions_greedy = action action_idx_stoch = [] action = [] for index in range(len(actions_greedy)): a_greedy = actions_greedy[index] out_of_range_action = True while out_of_range_action: # Sample from a Gaussian with mean at the greedy action and a std following a schedule of choice a_stoch = np.random.normal(loc=a_greedy, scale=std_schedule.value(self.num_timesteps)) # Convert sampled cont action to an action idx a_idx_stoch = np.rint((a_stoch + self.high[index]) / self.actions_range[index] * self.num_action_grains) # Check if action is in range if a_idx_stoch >= 0 and a_idx_stoch < self.num_actions_pad: action_idx_stoch.append(a_idx_stoch) action.append(a_stoch) out_of_range_action = False action_idxes = action_idx_stoch env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() reward_ = self._vec_normalize_env.get_original_reward().squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. self.replay_buffer.add(obs_, action_idxes, reward_, new_obs_, float(done)) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) # self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, # self.num_timesteps) # episode_rewards[-1] += rew episode_rewards[-1] += reward_ if done: # print("ep number", len(episode_rewards)) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
# pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors, mean_loss = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors, mean_loss = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors, mean_loss = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs(td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities(batch_idxes, new_priorities) callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) # Log training infos kvs = {} if self.verbose >= 1 and done and log_interval is not None \ and len(episode_rewards) % log_interval == 0 \ and self.num_timesteps > self.train_freq \ and self.num_timesteps > self.learning_starts: if self.log_dir is not None: kvs["episodes"] = num_episodes kvs["mean_100rew"] = mean_100ep_reward kvs["current_lr"] = self.learning_rate kvs["success_rate"] = np.mean(episode_successes[-100:]) kvs["total_timesteps"] = self.num_timesteps kvs["mean_loss"] = mean_loss kvs["mean_td_errors"] = np.mean(td_errors) kvs["time_spent_exploring"] = int(100 * self.exploration.value(self.num_timesteps)) self.log_csv.writekvs(kvs) logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() callback.on_training_end() return self
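# --- Editor's illustration (added) -------------------------------------------
# Round trip of the action discretisation used in the BDQ learn() above,
# assuming a symmetric action space (low == -high), which is what makes the
# inverse mapping's `(a_stoch + high)` equivalent to `(a_stoch - low)`.
# `actions_range = high - low` and `num_action_grains = 32` are assumptions
# about attributes set elsewhere in the original class.
import numpy as np

low, high = np.array([-1.0]), np.array([1.0])
actions_range = high - low                    # assumed: self.actions_range
num_action_grains = 32                        # assumed: self.num_action_grains

idx = np.array([24.0])                                      # discrete grain index
cont = idx / num_action_grains * actions_range + low        # -> [0.5], continuous action
idx_back = np.rint((cont + high) / actions_range * num_action_grains)
assert np.array_equal(idx, idx_back)                        # the mapping round-trips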
eval_env = gym.make('gym_docking:docking-v2')
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path='./logs/best_shaping_moving_b_10M_model',
    log_path='./logs/best_shaping_moving_b_10M_results',
    eval_freq=600)
checkpoint_callback = CheckpointCallback(
    save_freq=int(5e4),
    save_path='./logs/',
    name_prefix='rl_model_621_shaping_moving_b_10M')
# Create the callback list
callback = CallbackList([checkpoint_callback, eval_callback])
lr_sch = LinearSchedule(int(10e6), 1.0e-5, 2.5e-4)
model = PPO2(
    policy=MlpPolicy,
    env=env,
    verbose=1,
    tensorboard_log="./ppo2_docking_tensorboard/",
    policy_kwargs=dict(net_arch=[128, dict(pi=[128], vf=[128])],
                       act_fun=tf.nn.relu),
    lam=0.95,
    gamma=0.99,  # lower 0.9 ~ 0.99
    # n_steps=math.floor(cfg['env']['max_time'] / cfg['env']['ctl_dt']),
    n_steps=600,
    ent_coef=0.00,
    learning_rate=3e-4,
    # learning_rate=lr_sch.value,
class DQN_HER(OffPolicyRLModel): """ The DQN model class. DQN paper: https://arxiv.org/pdf/1312.5602.pdf :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) discount factor :param learning_rate: (float) learning rate for adam optimizer :param buffer_size: (int) size of the replay buffer :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is annealed :param exploration_final_eps: (float) final value of random action probability :param train_freq: (int) update the model every `train_freq` steps. set to None to disable printing :param batch_size: (int) size of a batched sampled from replay buffer for training :param checkpoint_freq: (int) how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. :param checkpoint_path: (str) replacement path used if you need to log to somewhere else than a temporary directory. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. :param prioritized_replay: (bool) if True prioritized replay buffer will be used. :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. 
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ def __init__(self, policy, env, hindsight, gamma=0.98, learning_rate=5e-4, buffer_size=2000000, exploration_fraction=0.01, exploration_final_eps=0.05, train_freq=1, batch_size=32, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, beta_fraction=1.0, prioritized_replay_eps=1e-6, param_noise=False, verbose=1, tensorboard_log=None, _init_setup_model=True, model_save_path="saved_model", model_save_episode_freq=-1, loop_breaking=True, multistep=1, boltzmann=False): # TODO: replay_buffer refactoring super(DQN_HER, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, requires_vec_env=False) self.checkpoint_path = checkpoint_path self.param_noise = param_noise self.learning_starts = learning_starts self.train_freq = train_freq self.prioritized_replay = prioritized_replay self.prioritized_replay_eps = prioritized_replay_eps self.batch_size = batch_size self.target_network_update_freq = target_network_update_freq self.checkpoint_freq = checkpoint_freq self.prioritized_replay_alpha = prioritized_replay_alpha self.prioritized_replay_beta0 = prioritized_replay_beta0 self.prioritized_replay_beta_iters = prioritized_replay_beta_iters self.beta_fraction = beta_fraction self.exploration_final_eps = exploration_final_eps self.exploration_fraction = exploration_fraction self.buffer_size = buffer_size self.learning_rate = learning_rate self.gamma = gamma self.hindsight = hindsight self.tensorboard_log = tensorboard_log self.model_save_path = model_save_path self.model_save_freq = model_save_episode_freq self.loop_breaking = loop_breaking self.multistep = multistep self.boltzmann = boltzmann self.graph = None self.sess = None self._train_step = None self.step_model = None self.update_target = None self.act = None self.proba_step = None self.replay_buffer = None self.solved_replay_buffer = None self.beta_schedule = None self.exploration = None self.params = None self.summary = None self.episode_reward = None self.steps_made = 0 self.episodes_completed = 0 self.solved_episodes = [] if _init_setup_model: self.setup_model() def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." 
self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate) # optimizer = tf.contrib.opt.NadamOptimizer(learning_rate=self.learning_rate) # optimizer = tf.train.MomentumOptimizer(learning_rate=1e-3, momentum=0.9, use_nesterov=True) self.act, self._train_step, self.update_target, self.step_model = deepq.build_train( q_func=self.policy, ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess) self.proba_step = self.step_model.proba_step self.params = find_trainable_variables("deepq") # Initialize the parameters and copy them to the target network. tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all() def save_model_checkpoint(self): save_path = self.model_save_path + "_" + get_cur_time_str( ) + "_" + str(self.episodes_completed) print("Saving checkpoint to {0}".format(save_path), file=sys.stderr) self.save(save_path) def dump_solved_episodes(self): save_path = self.model_save_path + "_solvedEpisodes_" + get_cur_time_str( ) + "_" + str(self.episodes_completed) print('SOLVED episodes saved to {0}'.format(save_path)) with open(save_path, 'wb') as outfile: pickle.dump(self.solved_episodes, outfile) def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Create the replay buffer if self.prioritized_replay: self.replay_buffer = SimplePrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps * self.beta_fraction self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: # self.replay_buffer = ReplayBuffer(self.buffer_size, gamma=self.gamma, hindsight=self.hindsight, multistep=self.multistep) self.replay_buffer = EpisodeReplayBuffer( self.buffer_size, hindsight=self.hindsight) self.solved_replay_buffer = EpisodeReplayBuffer( self.buffer_size, hindsight=self.hindsight) # self.replay_buffer = SimpleReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_trans = [] episode_replays = [] episode_success = [0] * log_interval episode_finals = [0] * log_interval episode_losses = [] is_in_loop = False loss_accumulator = [0.] 
* 50 episode_places = set() episode_div = [0] * log_interval full_obs = self.env.reset() part_obs = np.concatenate( (full_obs['observation'], full_obs['desired_goal']), axis=-1) begin_obs = [full_obs] * log_interval reset = True self.episode_reward = np.zeros((1, )) for step in range(total_timesteps): # self.steps_made += 1 # if step >= 7 * 100 * 150: # raise Exception("trigger") # curriculum # curriculum_scrambles = 1 + int(self.steps_made ** (0.50)) // 500 # curriculum_step_limit = min((curriculum_scrambles + 2) * 2, 100) # self.replay_buffer.set_sampling_cut(curriculum_step_limit) # self.env.scrambleSize = curriculum_scrambles # self.env.step_limit = curriculum_step_limit # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(step) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(step) + self.exploration.value(step) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): # Loop breaking if self.loop_breaking and is_in_loop: # update_eps_value = (update_eps + 1.) / 2. update_eps_value = 1. else: update_eps_value = update_eps if self.boltzmann: values = self.predict_q_values(np.array(part_obs))[0] exp = 1. / update_eps_value action = np.random.choice( np.arange(0, values.shape[0]), p=(exp**values) / sum(exp**values)) else: action = self.act(np.array(part_obs)[None], update_eps=update_eps_value, **kwargs)[0] # action = self.env.action_space.sample() env_action = action reset = False new_obs, rew, done, _ = self.env.step(env_action) current_place = None is_in_loop = False try: current_place = tuple(self.env.room_state.flatten()) except AttributeError: current_place = tuple(new_obs['observation'].flatten()) if current_place in episode_places: is_in_loop = True episode_places.add(current_place) # Store transition in the replay buffer. # self.replay_buffer.add(part_obs, action, rew, np.concatenate((new_obs['observation'], new_obs['desired_goal'])), float(done)) episode_replays.append( (full_obs, action, rew, new_obs, float(done))) episode_trans.append((full_obs, action, rew, new_obs)) full_obs = new_obs part_obs = np.concatenate( (full_obs['observation'], full_obs['desired_goal']), axis=-1) if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, step) episode_rewards[-1] += rew if done: if np.array_equal(full_obs['achieved_goal'], full_obs['desired_goal']): episode_success.append(1.) self.solved_episodes.append(episode_replays) else: episode_success.append(0.) 
episode_success = episode_success[1:] episode_div.append(len(episode_places)) episode_div = episode_div[1:] self.episodes_completed += 1 if self.model_save_freq > 0 and self.episodes_completed % self.model_save_freq == 0: self.save_model_checkpoint() if self.episodes_completed % (200 * 100) == 0: self.dump_solved_episodes() if not isinstance(self.env, VecEnv): full_obs = self.env.reset() # print(full_obs) part_obs = np.concatenate((full_obs['observation'], full_obs['desired_goal']), axis=-1) def postprocess_replays(raw_replays, buffer, prioritized_replay): if not prioritized_replay: buffer.add(raw_replays) return for _ in range(10): for id, (full_obs, action, rew, new_obs, done) in enumerate(raw_replays): offset = np.random.randint( id, len(raw_replays)) target = raw_replays[offset][3][ 'achieved_goal'] obs = np.concatenate( [full_obs['observation'], target], axis=-1) step = np.concatenate( [new_obs['observation'], target], axis=-1) if np.array_equal(new_obs['achieved_goal'], target): rew = 0. done = 1. else: rew = -1. done = 0. buffer.add(obs, action, rew, step, done) postprocess_replays(episode_replays, self.replay_buffer, self.prioritized_replay) begin_obs.append(full_obs) begin_obs = begin_obs[1:] if callback is not None: callback(locals(), globals()) episode_rewards.append(0.0) episode_trans = [] episode_replays = [] episode_places = set() episode_losses = [] reset = True is_in_loop = False if step > self.learning_starts and step % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(step)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience weights /= np.mean(weights) else: if np.random.randint(0, 100) < 100: # always obses_t, actions, rewards, obses_tp1, dones, info = self.replay_buffer.sample( self.batch_size) else: obses_t, actions, rewards, obses_tp1, dones, info = self.solved_replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + step) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % step) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, step) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if not self.prioritized_replay: for (dist, error) in zip(info, td_errors): if len(loss_accumulator) < dist + 1: loss_accumulator += [0.] 
* ( dist + 1 - len(loss_accumulator)) loss_accumulator[ dist] = loss_accumulator[dist] * 0.99 + huber( 1., error) # if step % 1000 == 0: # print('accumulator', [int(x) for x in loss_accumulator]) # weights_sum = sum(loss_accumulator) # print('normalized ', ['%.2f' % (x / weights_sum) for x in loss_accumulator]) # print('distance ', info) loss = np.mean( np.dot(weights, [huber(1., error) for error in td_errors])) episode_losses.append(loss) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) if step > self.learning_starts and step % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-(log_interval + 1):-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean( episode_rewards[-(log_interval + 1):-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", step) logger.record_tabular("episodes", num_episodes) logger.record_tabular( "mean {0} episode reward".format(log_interval), mean_100ep_reward) logger.record_tabular( "{0} episode success".format(log_interval), np.mean(episode_success)) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(step))) logger.dump_tabular() return self def predict(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) with self.sess.as_default(): actions, _, _ = self.step_model.step(observation, deterministic=deterministic) if not vectorized_env: actions = actions[0] return actions, None def predict_q_values(self, observation, state=None, mask=None, deterministic=True): observation = np.array(observation) observation = observation.reshape((-1, ) + self.observation_space.shape) with self.sess.as_default(): _, q_values, _ = self.step_model.step(observation, deterministic=deterministic) return q_values def action_probability(self, observation, state=None, mask=None): observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) actions_proba = self.proba_step(observation, state, mask) if not vectorized_env: if state is not None: raise ValueError( "Error: The environment must be vectorized when using recurrent policies." 
) actions_proba = actions_proba[0] return actions_proba def save(self, save_path): # params data = { "checkpoint_path": self.checkpoint_path, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "checkpoint_freq": self.checkpoint_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "_vectorize_action": self._vectorize_action, "hindsight": self.hindsight, "model_save_path": self.model_save_path, "model_save_freq": self.model_save_freq, } params = self.sess.run(self.params) self._save_to_file(save_path, data=data, params=params) @classmethod def load(cls, load_path, env=None, **kwargs): data, params = cls._load_from_file(load_path) print('loaded data:', data, 'kwargs:', kwargs) model = cls(policy=data["policy"], env=env, _init_setup_model=False) model.__dict__.update(data) model.__dict__.update(kwargs) model.set_env(env) model.setup_model() restores = [] for param, loaded_p in zip(model.params, params): restores.append(param.assign(loaded_p)) model.sess.run(restores) return model
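# --- Editor's illustration (added) -------------------------------------------
# The Boltzmann branch in DQN_HER.learn() samples actions with probability
# proportional to exp ** Q, where exp = 1 / update_eps. Early in training
# (epsilon near 1) this is close to uniform; as epsilon anneals, the
# distribution sharpens toward the greedy action.
import numpy as np

q_values = np.array([1.0, 2.0, 3.0])
for update_eps in (1.0, 0.1):
    exp = 1.0 / update_eps
    probs = exp ** q_values / np.sum(exp ** q_values)
    print(update_eps, probs.round(3))
# update_eps=1.0 -> [0.333 0.333 0.333]; update_eps=0.1 -> [0.009 0.09  0.901]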
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Create the replay buffer if self.prioritized_replay: self.replay_buffer = SimplePrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps * self.beta_fraction self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: # self.replay_buffer = ReplayBuffer(self.buffer_size, gamma=self.gamma, hindsight=self.hindsight, multistep=self.multistep) self.replay_buffer = EpisodeReplayBuffer( self.buffer_size, hindsight=self.hindsight) self.solved_replay_buffer = EpisodeReplayBuffer( self.buffer_size, hindsight=self.hindsight) # self.replay_buffer = SimpleReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_trans = [] episode_replays = [] episode_success = [0] * log_interval episode_finals = [0] * log_interval episode_losses = [] is_in_loop = False loss_accumulator = [0.] * 50 episode_places = set() episode_div = [0] * log_interval full_obs = self.env.reset() part_obs = np.concatenate( (full_obs['observation'], full_obs['desired_goal']), axis=-1) begin_obs = [full_obs] * log_interval reset = True self.episode_reward = np.zeros((1, )) for step in range(total_timesteps): # self.steps_made += 1 # if step >= 7 * 100 * 150: # raise Exception("trigger") # curriculum # curriculum_scrambles = 1 + int(self.steps_made ** (0.50)) // 500 # curriculum_step_limit = min((curriculum_scrambles + 2) * 2, 100) # self.replay_buffer.set_sampling_cut(curriculum_step_limit) # self.env.scrambleSize = curriculum_scrambles # self.env.step_limit = curriculum_step_limit # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(step) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(step) + self.exploration.value(step) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): # Loop breaking if self.loop_breaking and is_in_loop: # update_eps_value = (update_eps + 1.) / 2. update_eps_value = 1. else: update_eps_value = update_eps if self.boltzmann: values = self.predict_q_values(np.array(part_obs))[0] exp = 1. 
/ update_eps_value action = np.random.choice( np.arange(0, values.shape[0]), p=(exp**values) / sum(exp**values)) else: action = self.act(np.array(part_obs)[None], update_eps=update_eps_value, **kwargs)[0] # action = self.env.action_space.sample() env_action = action reset = False new_obs, rew, done, _ = self.env.step(env_action) current_place = None is_in_loop = False try: current_place = tuple(self.env.room_state.flatten()) except AttributeError: current_place = tuple(new_obs['observation'].flatten()) if current_place in episode_places: is_in_loop = True episode_places.add(current_place) # Store transition in the replay buffer. # self.replay_buffer.add(part_obs, action, rew, np.concatenate((new_obs['observation'], new_obs['desired_goal'])), float(done)) episode_replays.append( (full_obs, action, rew, new_obs, float(done))) episode_trans.append((full_obs, action, rew, new_obs)) full_obs = new_obs part_obs = np.concatenate( (full_obs['observation'], full_obs['desired_goal']), axis=-1) if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, step) episode_rewards[-1] += rew if done: if np.array_equal(full_obs['achieved_goal'], full_obs['desired_goal']): episode_success.append(1.) self.solved_episodes.append(episode_replays) else: episode_success.append(0.) episode_success = episode_success[1:] episode_div.append(len(episode_places)) episode_div = episode_div[1:] self.episodes_completed += 1 if self.model_save_freq > 0 and self.episodes_completed % self.model_save_freq == 0: self.save_model_checkpoint() if self.episodes_completed % (200 * 100) == 0: self.dump_solved_episodes() if not isinstance(self.env, VecEnv): full_obs = self.env.reset() # print(full_obs) part_obs = np.concatenate((full_obs['observation'], full_obs['desired_goal']), axis=-1) def postprocess_replays(raw_replays, buffer, prioritized_replay): if not prioritized_replay: buffer.add(raw_replays) return for _ in range(10): for id, (full_obs, action, rew, new_obs, done) in enumerate(raw_replays): offset = np.random.randint( id, len(raw_replays)) target = raw_replays[offset][3][ 'achieved_goal'] obs = np.concatenate( [full_obs['observation'], target], axis=-1) step = np.concatenate( [new_obs['observation'], target], axis=-1) if np.array_equal(new_obs['achieved_goal'], target): rew = 0. done = 1. else: rew = -1. done = 0. buffer.add(obs, action, rew, step, done) postprocess_replays(episode_replays, self.replay_buffer, self.prioritized_replay) begin_obs.append(full_obs) begin_obs = begin_obs[1:] if callback is not None: callback(locals(), globals()) episode_rewards.append(0.0) episode_trans = [] episode_replays = [] episode_places = set() episode_losses = [] reset = True is_in_loop = False if step > self.learning_starts and step % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(step)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience weights /= np.mean(weights) else: if np.random.randint(0, 100) < 100: # always obses_t, actions, rewards, obses_tp1, dones, info = self.replay_buffer.sample( self.batch_size) else: obses_t, actions, rewards, obses_tp1, dones, info = self.solved_replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + step) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % step) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, step) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if not self.prioritized_replay: for (dist, error) in zip(info, td_errors): if len(loss_accumulator) < dist + 1: loss_accumulator += [0.] * ( dist + 1 - len(loss_accumulator)) loss_accumulator[ dist] = loss_accumulator[dist] * 0.99 + huber( 1., error) # if step % 1000 == 0: # print('accumulator', [int(x) for x in loss_accumulator]) # weights_sum = sum(loss_accumulator) # print('normalized ', ['%.2f' % (x / weights_sum) for x in loss_accumulator]) # print('distance ', info) loss = np.mean( np.dot(weights, [huber(1., error) for error in td_errors])) episode_losses.append(loss) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) if step > self.learning_starts and step % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-(log_interval + 1):-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean( episode_rewards[-(log_interval + 1):-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", step) logger.record_tabular("episodes", num_episodes) logger.record_tabular( "mean {0} episode reward".format(log_interval), mean_100ep_reward) logger.record_tabular( "{0} episode success".format(log_interval), np.mean(episode_success)) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(step))) logger.dump_tabular() return self
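# A condensed sketch of the hindsight-style relabelling done inside
# postprocess_replays above (prioritized-replay branch): each transition is
# re-stored with a goal taken from a *later* achieved_goal of the same episode,
# and reward/done are recomputed against that substituted goal. The function
# name relabel_once is hypothetical; the logic mirrors the code above.
import numpy as np

def relabel_once(episode, buffer):
    for idx, (full_obs, action, _, new_obs, _) in enumerate(episode):
        future = np.random.randint(idx, len(episode))      # pick a later transition of this episode
        goal = episode[future][3]['achieved_goal']          # its achieved goal becomes the new target
        obs_g = np.concatenate([full_obs['observation'], goal], axis=-1)
        next_g = np.concatenate([new_obs['observation'], goal], axis=-1)
        hit = np.array_equal(new_obs['achieved_goal'], goal)
        buffer.add(obs_g, action, 0. if hit else -1., next_g, 1. if hit else 0.)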
class QLearning(): """ The tabular Q learning model Parameters ---------- policy: src.tabular.policies.Greedy Policy object env: gym.env The environment to learn from lr_initial: float Initial learning rate of the algorithm (linear schedule) lr_final: float Final learning rate of the algorithm (linear schedule) lr_fraction: float fraction of entire training period over which the learning rate is annealed temp_initial: float Initial temperature for softmax (linear schedule) temp_final: float Final temperature for softmax (linear schedule) temp_fraction: float fraction of entire training period over which the temperature is annealed exploration_epsilon: float Probability of taking a random action policy_kwargs: dict or None Keyword arguments for the policy verbose: int the verbosity level: 0 none, 1 training information """ def __init__(self, policy, env, lr_initial=0.5, lr_final=0.2, lr_fraction=0.5, temp_initial=10, temp_final=1, temp_fraction=.5, exploration_epsilon=0, policy_kwargs=None, verbose=1, _init_setup_model=True): self.policy = policy self.env = env self.verbose = verbose self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs self.observation_space = None self.action_space = None self.n_envs = 1 self.num_timesteps = 0 self.params = None if self.env is not None: self.observation_space = env.observation_space self.action_space = env.action_space # Learning rate and softmax temperature self.eps = exploration_epsilon self.lr_initial = lr_initial self.lr_final = lr_final self.lr_fraction = lr_fraction self.temp_initial = temp_initial self.temp_final = temp_final self.temp_fraction = temp_fraction # Init access points to the underlying policy before it is initialized self.step_model = None self._train_step = None self.act = None self.proba_step = None self.learning_rate = None self.temperature = None self.params = None self.summary = None self.episode_reward = None if _init_setup_model: self.setup_model() def get_env(self): """ returns the current environment (can be None if not defined) """ return self.env def set_env(self, env): """ Checks the validity of the environment, and if it is coherent, set it as the current environment. Parameters ---------- env: (Gym Environment) The environment for learning a policy """ if env is None and self.env is None: if self.verbose >= 1: print( "Loading a model without an environment, " "this model cannot be trained until it has a valid environment." ) return elif env is None: raise ValueError( "Error: trying to replace the current environment with None") # sanity checking the environment assert self.observation_space == env.observation_space, \ "Error: the environment passed must have at least the same observation space as the model was trained on." assert self.action_space == env.action_space, \ "Error: the environment passed must have at least the same action space as the model was trained on."
self.env = env def _init_num_timesteps(self): """ Resets num_timesteps (total timesteps since beginning of training) """ self.num_timesteps = 0 def setup_model(self): """ Create all the functions necessary to train the model """ # Create policy object self.step_model = self.policy(self.observation_space, self.action_space, **self.policy_kwargs) def _act(obs, stochastic=True, update_eps=-1): """ Take an action for a given observation Parameters ---------- obs: stochastic: bool If True do a random move with prob self.eps update_eps: float Updated value for self.eps (unchanged if < 0) Returns ------- """ # Update exploration parameters self.eps = update_eps if update_eps >= 0 else self.eps if stochastic and np.random.rand(1) < self.eps: return self.action_space.sample() # Uniform sampling else: return self.step_model.step( obs, deterministic=False) # Softmax sampling self.act = _act def _train_step(obses_t, actions, rewards, obses_tp1, dones, lr): n_transitions = len(np.atleast_1d(rewards)) if n_transitions == 1: old_Q = self.step_model.get_Q(obses_t, actions) best_Qtp1 = np.max(self.step_model.get_Q(obses_tp1)) update_Q = old_Q * (1 - lr) + lr * (rewards + best_Qtp1) self.step_model.update_Q(obses_t, actions, update_Q) else: raise NotImplementedError self._train_step = _train_step self.proba_step = self.step_model.proba_step self.params = lambda: { 'pi': self.step_model.pi, 'Q': self.step_model.Q } def _setup_learn(self, seed): """ check the environment, set the seed, and set the logger :param seed: (int) the seed value """ if self.env is None: raise ValueError( "Error: cannot train the model without a valid environment, please set an environment with " "set_env(self, env) method.") if seed is not None: set_global_seeds(seed) def get_parameter_list(self): """ Return policy and Q function estimate. """ self.step_model.update_full_policy( ) # Compute all probabilities with latest temperature return self.params() def get_parameters(self): """ return policy and Q function """ self.step_model.update_full_policy( ) # Compute all probabilities with latest temperature return self.params() def pretrain(self, dataset, n_epochs=10, learning_rate=1e-4, val_interval=None): """ Pretrain a model using behavior cloning: supervised learning given an expert dataset. :param dataset: (ExpertDataset) Dataset manager :param n_epochs: (int) Number of iterations on the training set :param learning_rate: (float) Learning rate :param val_interval: (int) Report training and validation losses every n epochs. By default, every 10th of the maximum number of epochs. :return: (QLearning model) the pretrained model """ raise NotImplementedError def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, reset_num_timesteps=True): """ Return a trained model. :param total_timesteps: (int) The total number of samples to train on :param seed: (int) The initial seed for training, if None: keep current seed :param callback: (function (dict, dict)) -> boolean function called at every step with state of the algorithm. It takes the local and global variables. If it returns False, training is aborted. :param log_interval: (int) The number of timesteps before logging.
:param reset_num_timesteps: (bool) whether or not to reset the current timestep number (used in logging) :return: (BaseRLModel) the trained model """ self._setup_learn(seed) self.learning_rate = LinearSchedule(schedule_timesteps=int( self.lr_fraction * total_timesteps), initial_p=self.lr_initial, final_p=self.lr_final) self.temperature = LinearSchedule(schedule_timesteps=int( self.temp_fraction * total_timesteps), initial_p=self.temp_initial, final_p=self.temp_final) # Initialize variables episode_rewards = [0.0] episode_successes = [] obs = self.env.reset() episode_length = 0 for _ in range(total_timesteps): num_episodes = len(episode_rewards) if callback is not None: # Only stop training if return value is False, not when it is None. if callback(locals(), globals()) is False: break # Act if hasattr(self.step_model, 'temperature'): self.step_model.temperature = self.temperature.value( self.num_timesteps) action = self.act(obs, update_eps=self.eps) new_obs, reward, done, info = self.env.step(action) episode_rewards[-1] += reward # Update Q self._train_step(obs, action, reward, new_obs, done, lr=self.learning_rate.value(self.num_timesteps)) obs = new_obs # Restart if necessary if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) obs = self.env.reset() # print(np.mean(episode_rewards), len(episode_rewards)) episode_rewards.append(0.0) episode_length = 0 # Performance in last 100 episodes if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 6) # Logging if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "Softmax temperature", float(self.temperature.value(self.num_timesteps))) logger.record_tabular( "Learning rate", float(self.learning_rate.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 episode_length += 1 return self def predict(self, observation, state=None, mask=None, deterministic=True): """ Get the model's action from an observation """ return self.step_model.step(observation, deterministic=deterministic), None def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): if actions is not None: raise NotImplementedError return self.step_model.proba_step(observation) def save(self, save_path): """ Save the current parameters to file :param save_path: (str or file-like object) the save location """ data = { "lr_initial": self.lr_initial, "lr_final": self.lr_final, "lr_fraction": self.lr_fraction, "temp_initial": self.temp_initial, "temp_final": self.temp_final, "temp_fraction": self.temp_fraction, "exploration_epsilon": self.eps, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "policy_kwargs": self.policy_kwargs } params = self.get_parameters() self._save_to_file(save_path, data=data, params=params) @staticmethod def _save_to_file(save_path, data=None, params=None): if isinstance(save_path, str): _, ext = os.path.splitext(save_path) if ext == "": save_path += ".pkl" with open(save_path, "wb") as file_: cloudpickle.dump((data, params), file_) else: # Here save_path is a
file-like object, not a path cloudpickle.dump((data, params), save_path) def load_parameters(self, load_path_or_dict): """ Load model parameters from a file or a dictionary Dictionary keys should be the parameter names (here ``'pi'`` and ``'Q'``), as returned by the ``get_parameters`` function. This does not load agent's hyper-parameters. .. warning:: This function does not update trainer/optimizer variables (e.g. momentum). As such training after using this function may lead to less-than-optimal results. :param load_path_or_dict: (str or file-like or dict) Save parameter location or dict of parameters as variable.name -> ndarrays to be loaded. """ params = None if isinstance(load_path_or_dict, dict): # Assume `load_path_or_dict` is dict of variable.name -> ndarrays we want to load params = load_path_or_dict else: # Assume a filepath or file-like. # Use existing deserializer to load the parameters _, params = QLearning._load_from_file(load_path_or_dict) if self.step_model is not None: self.step_model.Q = params['Q'] self.step_model.pi = params['pi'] else: raise RuntimeError( 'Trying to load the parameters before policy instantiation') @classmethod def load(cls, load_path, env=None, **kwargs): """ Load the model from file :param load_path: (str or file-like) the saved parameter location :param env: (Gym Environment) the new environment to run the loaded model on (can be None if you only need prediction from a trained model) :param kwargs: extra arguments to change the model when loading """ data, params = cls._load_from_file(load_path) if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data[ 'policy_kwargs']: raise ValueError( "The specified policy kwargs do not equal the stored policy kwargs. " "Stored kwargs: {}, specified kwargs: {}".format( data['policy_kwargs'], kwargs['policy_kwargs'])) model = cls(policy=data["policy"], env=None, _init_setup_model=False) model.__dict__.update(data) model.__dict__.update(kwargs) model.set_env(env) model.setup_model() model.load_parameters(params) return model @staticmethod def _load_from_file(load_path): if isinstance(load_path, str): if not os.path.exists(load_path): if os.path.exists(load_path + ".pkl"): load_path += ".pkl" else: raise ValueError( "Error: the file {} could not be found".format( load_path)) with open(load_path, "rb") as file: data, params = cloudpickle.load(file) else: # Here load_path is a file-like object, not a path data, params = cloudpickle.load(load_path) return data, params @staticmethod def _softmax(x_input): """ An implementation of softmax. :param x_input: (numpy float) input vector :return: (numpy float) output vector """ x_exp = np.exp(x_input.T - np.max(x_input.T, axis=0)) return (x_exp / x_exp.sum(axis=0)).T
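# A small numeric check of the pieces above, with made-up values. The tabular
# update in _train_step is Q_new = (1 - lr) * Q_old + lr * (r + max_a' Q(s', a')),
# and _softmax subtracts the per-row maximum before exponentiating so large Q
# values cannot overflow (softmax(x) == softmax(x - c) for any constant c).
import numpy as np

old_q, reward, best_next_q, lr = 0.5, 1.0, 2.0, 0.1
new_q = old_q * (1 - lr) + lr * (reward + best_next_q)   # 0.45 + 0.30 = 0.75

x = np.array([[1000., 1001., 1002.]])                    # would overflow a naive exp()
x_exp = np.exp(x.T - np.max(x.T, axis=0))
probs = (x_exp / x_exp.sum(axis=0)).T                    # approx. [[0.090, 0.245, 0.665]]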
class TradingDQN(DQN): def __init__(self, policy, env, gamma=0.9, batch_size=32, buffer_size=100000, learning_starts=10000, learning_rate=0.0001, target_network_update_freq=1000, exploration_final_eps=0.02, exploration_fraction=0.1, tensorboard_log=None, _init_setup_model=True): super().__init__(policy=policy, env=env, gamma=gamma, batch_size=batch_size, buffer_size=buffer_size, learning_starts=learning_starts, learning_rate=learning_rate, target_network_update_freq=target_network_update_freq, exploration_final_eps=exploration_final_eps, exploration_fraction=exploration_fraction, tensorboard_log=tensorboard_log, _init_setup_model=_init_setup_model) def setup_model(self): self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/deepq/build_graph.py self.act, self.train_step, self.update_target, self.step_model = deepq.build_train( q_func=self.policy, ob_space=self.env.observation_space, ac_space=self.env.action_space, optimizer=tf.train.AdamOptimizer( learning_rate=self.learning_rate), gamma=self.gamma, # grad_norm_clipping=1, sess=self.sess) self.params = find_trainable_variables('deepq') tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all() def learn(self, total_timesteps, seed=None, tb_log_name='DQN', test_interval=1, reset_num_timesteps=True): if reset_num_timesteps: self.num_timesteps = 0 with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) self.replay_buffer = ReplayBuffer(size=self.buffer_size) self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset(train=True) best_train_score = None best_test_score = None self.reward_curve = [] for _ in range(total_timesteps): update_eps = self.exploration.value(self.num_timesteps) with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps)[0] new_obs, rew, done, _ = self.env.step(action) self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if self.num_timesteps > self.learning_starts: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights = np.ones_like(rewards) if writer is not None: if (1 + self.num_timesteps) % 100 == 0: summary, td_errors = self.train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self.train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.num_timesteps > self.learning_starts and self.num_timesteps % self.target_network_update_freq == 0: self.update_target(sess=self.sess) if done: print('-------------------------------------') print('steps | {}'.format( self.num_timesteps)) print('episodes | {}'.format( len(episode_rewards))) epsilon = int(100 * self.exploration.value(self.num_timesteps)) print('% time spent exploring | {}'.format(epsilon)) print('--') mean_100ep_reward = -np.inf if len( episode_rewards[-16:-1]) == 0 else round( float(np.mean(episode_rewards[-16:-1])), 1) self.reward_curve.append(mean_100ep_reward) print('mean 10 episode reward | {:.1f}'.format( mean_100ep_reward)) journal = self.env.sim.journal print('Total operations | {}'.format( len(self.env.sim.journal))) longs = [x for x 
in journal if x['Type'] == 'LONG'] shorts = [x for x in journal if x['Type'] == 'SHORT'] print('Long/Short | {}/{}'.format( len(longs), len(shorts))) print('Avg duration trades | {:.2f}'.format( np.mean([j['Trade Duration'] for j in journal]))) total_profit = sum([j['Profit'] for j in journal]) print('Total profit | {:.2f}'.format( total_profit)) print('Avg profit per trade | {:.3f}'.format( total_profit / self.env.sim.total_trades)) if epsilon <= self.exploration_final_eps * 100: if best_train_score is None or total_profit > best_train_score: self.save('saves/best_model_train.pkl') best_train_score = total_profit if self.num_timesteps % test_interval == 0: print('--') test_episode_rewards, test_longs, test_shorts, test_ave_profit_per_trade = self.test( ) print('Total profit test > {:.2f}'.format( test_episode_rewards)) print('Long/Short test > {}/{}'.format( test_longs, test_shorts)) print('Avg profit per trade test > {:.3f}'.format( test_ave_profit_per_trade)) if epsilon <= self.exploration_final_eps * 100: if best_test_score is None or test_episode_rewards > best_test_score: self.save('saves/best_model_test.pkl') best_test_score = test_episode_rewards print('-------------------------------------') obs = self.env.reset() episode_rewards.append(0.0) if self.num_timesteps + ( self.num_timesteps / len(episode_rewards)) >= total_timesteps: self.save('saves/final_model.pkl') break self.num_timesteps += 1 return self def test(self): obs = self.env.reset(train=False) done = False while not done: action, _ = self.predict(obs) obs, reward, done, info = self.env.step(action) journal = self.env.sim.journal longs = len([x for x in journal if x['Type'] == 'LONG']) shorts = len([x for x in journal if x['Type'] == 'SHORT']) test_episode_rewards = sum([j['Profit'] for j in journal]) test_ave_profit_per_trade = test_episode_rewards / self.env.sim.total_trades if self.env.sim.total_trades > 0 else -np.inf return test_episode_rewards, longs, shorts, test_ave_profit_per_trade def save(self, save_path): data = { 'batch_size': self.batch_size, 'learning_starts': self.learning_starts, 'learning_rate': self.learning_rate, 'target_network_update_freq': self.target_network_update_freq, 'exploration_final_eps': self.exploration_final_eps, 'exploration_fraction': self.exploration_fraction, 'gamma': self.gamma, 'policy': self.policy, 'journal': self.env.sim.journal, 'reward_curve': self.reward_curve } params = self.sess.run(self.params) self._save_to_file(save_path, data=data, params=params)
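# A short sketch of the epsilon annealing used by the learn() methods above,
# assuming the stable-baselines LinearSchedule semantics and illustrative
# numbers (10000 annealing steps, final epsilon 0.02).
schedule = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
schedule.value(0)       # 1.0  -> fully random actions at the start
schedule.value(5000)    # 0.51 -> halfway through the annealing window
schedule.value(10000)   # 0.02
schedule.value(50000)   # 0.02 -> clamped once the window has passed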
randomBall=my_randomBall, binaryReward=my_binaryReward) # 0.01745*5 # Optional: PPO2 requires a vectorized environment to run # the env is now wrapped automatically when passing it to the constructor # env = DummyVecEnv([lambda: env]) timesteps = 2000000 lr_start = 0.0005 # only starts having an effect at around 0.00014 lr_end = 0.00004 half_life = 0.1 dyn_lr = ExpLearningRate(timesteps=timesteps, lr_start=lr_start, lr_min=lr_end, half_life=half_life) llr = LinearSchedule(timesteps, 0.005, 0.0001) # default: 0.00025 my_learning_rate = dyn_lr.value # 0.000063 # my_learning_rate = scheduler.value # my_learning_rate = 0.00075 # scheduler.value default: 2.5e-4=0.00025 #print_LR = str(lr_start) + "-" + str(lr_end) print_LR = str(my_learning_rate) #static_learning_rate = 0.00014 # my_learning_rate.value #CRAZYDEEP7: #p_quarks = dict(net_arch=[8192, 8192, dict( # vf=[8192, 4096, 4096, 2048], pi=[256, 256, 128])]) #CRAZYDEEP7 Lite: #p_quarks = dict(net_arch=[4096, 4096, dict( # vf=[4096, 2048, 2048, 1024], pi=[256, 256, 128])])
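# ExpLearningRate is not defined in this snippet; the class below is only a
# guessed sketch of an exponential half-life schedule compatible with the call
# above. PPO2 in stable-baselines calls the learning-rate callable with the
# remaining-progress fraction (about 1.0 at the start of training, approaching
# 0.0 at the end), which is what value() assumes here.
class ExpLearningRateSketch:
    def __init__(self, timesteps, lr_start, lr_min, half_life):
        self.timesteps = timesteps
        self.lr_start = lr_start
        self.lr_min = lr_min
        self.half_life = half_life              # fraction of training after which the rate halves

    def value(self, frac_remaining):
        elapsed = 1.0 - frac_remaining          # fraction of training already completed
        lr = self.lr_start * 0.5 ** (elapsed / self.half_life)
        return max(lr, self.lr_min)             # never decay below lr_min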
class DqnAtml(DQN): def setup_model(self): with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrapped in functools.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate) self.act, self._train_step, self.update_target, self.step_model = build_train_atml( q_func=partial(self.policy, **self.policy_kwargs), ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess, full_tensorboard_log=self.full_tensorboard_log) self.proba_step = self.step_model.proba_step self.params = tf_util.get_trainable_vars("deepq") # Initialize the parameters and copy them to the target network. tf_util.initialize(self.sess) self.update_target(sess=self.sess) self.summary = tf.summary.merge_all() def get_actions_vec(self, actions_prims, actions_inputs, actions_mf): with self.sess.as_default(): self.embedd_matrix = self.step_model.embedding.get_weights() invalid_action = np.zeros(self.embedd_matrix[0].shape[1]) - 1 self.embedd_matrix = np.vstack([self.embedd_matrix[0], invalid_action]) embedded_steps = self.embedd_matrix[actions_prims.astype(int)] actions_inputs = actions_inputs.reshape(len(actions_prims), -1) actions_mf = actions_mf.reshape(len(actions_prims), -1) concat_actions = np.concatenate( (embedded_steps, actions_inputs, actions_mf), axis=1) flatten_act = concat_actions.reshape(-1) return flatten_act def process_state_vec(self, obs, state_info): # transform the action representation with embeddings with self.sess.as_default(): self.embedd_matrix = self.step_model.embedding.get_weights() ind1 = state_info['grid_prims_size'] ind2 = ind1 + state_info['relations_size'] ind3 = ind2 + state_info['ff_state_size'] ind4 = ind3 + state_info['action_prims'] ind5 = ind4 + state_info['action_inputs'] ind6 = ind5 + state_info['action_mf'] cells_num = state_info['cells_num'] actions_prims = obs[ind3:ind4] actions_inputs = obs[ind4:ind5] actions_mf = obs[ind5:] flatten_act = self.get_actions_vec(actions_prims, actions_inputs, actions_mf) final_obs = np.concatenate((obs[:ind3], flatten_act)) return final_obs def hierarchical_step(self, obs, ds_rewards, cnt, kwargs, update_eps): register = False while not register: with self.sess.as_default(): action = self.predict(np.array(obs)[None])[0][0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) level = info.get('hier_level') register = info.get('register') self.actions_container.append(env_action) self.actions_weights.append(level) if rew < 0 or register: with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] if rew < 0 and not register: self.actions_container = self.actions_container[:-1] self.actions_weights = self.actions_weights[:-1] rep_action = np.zeros(self.action_space.n) rep_action[action] = 1.0 if register: if rew > 0: ds_rewards.append([cnt, rew]) cnt += 1 self.actions_container = np.array(self.actions_container) self.actions_weights = np.array(
self.actions_weights) / level b = np.zeros( (len(self.actions_container), self.action_space.n)) b[np.arange(len(self.actions_container)), self.actions_container.astype(int)] = 1 act_replay = np.sum((self.actions_weights * b.T).T, axis=0) rep_action = act_replay / np.sum(act_replay) self.actions_container = [] self.actions_weights = [] self.replay_buffer.add(obs, rep_action, rew, new_obs, float(done)) break obs = new_obs obs = new_obs return obs, new_obs, rew, action, done, reset def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, initial_p=1.0): self.actions_weights = [] self.actions_container = [] new_tb_log = self._init_num_timesteps(reset_num_timesteps) cnt = 0 ds_rewards = [[0, 0]] with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=initial_p, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True ''' Hierarchical Step (Start) ''' obs, new_obs, rew, action, done, reset = self.hierarchical_step( obs, ds_rewards, cnt, kwargs, update_eps) ''' Hierarchical Step (End) ''' if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: try: new_priorities = np.array([ abs(x) for x in td_errors.tolist() ]) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) except AssertionError: print(td_errors) if self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 return self, ds_rewards
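# A numeric sketch (made-up values) of the replay target built in
# hierarchical_step above: each low-level action becomes a one-hot row, rows
# are weighted by their hierarchy level divided by the final level, summed and
# renormalised into a distribution over the action space.
import numpy as np

actions = np.array([2, 0, 2])                            # low-level actions taken before `register`
weights = np.array([1., 2., 2.]) / 2.                    # actions_weights / level -> [0.5, 1.0, 1.0]
one_hot = np.zeros((3, 4))                               # 4 = action_space.n in this toy example
one_hot[np.arange(3), actions] = 1
act_replay = np.sum((weights * one_hot.T).T, axis=0)     # [1.0, 0.0, 1.5, 0.0]
rep_action = act_replay / np.sum(act_replay)             # [0.4, 0.0, 0.6, 0.0]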
class MADQN(OffPolicyRLModel): """ The multi-agent DQN (MADQN) model class. DQN paper: https://arxiv.org/abs/1312.5602 Dueling DQN: https://arxiv.org/abs/1511.06581 Double-Q Learning: https://arxiv.org/abs/1509.06461 Prioritized Experience Replay: https://arxiv.org/abs/1511.05952 :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) discount factor :param learning_rate: (float) learning rate for adam optimizer :param buffer_size: (int) size of the replay buffer :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is annealed :param exploration_final_eps: (float) final value of random action probability :param exploration_initial_eps: (float) initial value of random action probability :param train_freq: (int) update the model every `train_freq` steps. :param batch_size: (int) size of a batch sampled from the replay buffer for training :param double_q: (bool) Whether to enable Double-Q learning or not. :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. :param prioritized_replay: (bool) if True prioritized replay buffer will be used. :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer. It determines how much prioritization is used, with alpha=0 corresponding to the uniform case. :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial value to 1.0. If set to None, it equals max_timesteps. :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). If None (default), use random seed. Note that if you want completely deterministic results, you must set `n_cpu_tf_sess` to 1. :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations. If None, the number of CPUs of the current machine will be used. :param num_agents: (int) number of agents trained in the environment (MA-MOD)
""" def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, exploration_initial_eps=1.0, train_freq=1, batch_size=32, double_q=True, learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, n_cpu_tf_sess=None, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, num_agents=1): # MA-MOD # TODO: replay_buffer refactoring super(MADQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) # print("POLICY TYPE", policy) if self.observation_space: obs_sp_low = self.observation_space.low[0, :] obs_sp_high = self.observation_space.high[0, :] self.observation_space = gym.spaces.Box(low=obs_sp_low, high=obs_sp_high) self.param_noise = param_noise self.learning_starts = learning_starts self.train_freq = train_freq self.prioritized_replay = prioritized_replay self.prioritized_replay_eps = prioritized_replay_eps self.batch_size = batch_size self.target_network_update_freq = target_network_update_freq self.prioritized_replay_alpha = prioritized_replay_alpha self.prioritized_replay_beta0 = prioritized_replay_beta0 self.prioritized_replay_beta_iters = prioritized_replay_beta_iters self.exploration_final_eps = exploration_final_eps self.exploration_initial_eps = exploration_initial_eps self.exploration_fraction = exploration_fraction self.buffer_size = buffer_size self.learning_rate = learning_rate self.gamma = gamma self.tensorboard_log = tensorboard_log self.full_tensorboard_log = full_tensorboard_log self.double_q = double_q self.num_agents = num_agents self.graph = None self.sess = None self._train_step = [] # MA-MOD self.step_model = [] # MA-MOD self.update_target = [] # MA-MOD self.act = [] # MA-MOD self.proba_step = [] # MA-MOD self.replay_buffer = None # TODO: Possibly try seperate replay buffer. If everything symmetric, OK for one. # If you have the same Value function, its fine. If you have seperate functions, if you have one replay buffer, they learn from the same data. self.beta_schedule = None self.exploration = None self.params = None self.summary = None if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): assert False, "MAKE SURE THIS FUNCTION ISNT CALLED" policy = self.step_model return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values def setup_model(self): with SetVerbosity(self.verbose): for i in range(self.num_agents): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." # If the policy is wrap in functool.partial (e.g. to disable dueling) # unwrap it to check the class type if isinstance(self.policy, partial): test_policy = self.policy.func else: test_policy = self.policy # print(test_policy.type) assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ "an instance of DQNPolicy." 
self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.params = [] print("AC SPC", self.action_space) for i in range(self.num_agents): with tf.variable_scope("agent" + str(i)): optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate) act, _train_step, update_target, step_model = build_train( q_func=partial(self.policy, **self.policy_kwargs), ob_space=self.observation_space, ac_space=self.action_space, optimizer=optimizer, gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, sess=self.sess, full_tensorboard_log= False, #self.full_tensorboard_log, double_q=self.double_q) self.act.append(act) self._train_step.append(_train_step) self.step_model.append(step_model) self.proba_step.append(step_model.proba_step) self.update_target.append(update_target) self.params.extend( tf_util.get_trainable_vars("agent" + str(i) + "/deepq")) print(self.params) # Initialize the parameters and copy them to the target network. tf_util.initialize( self.sess ) # TODO: copy this file, make two versions of the algorithm. for i in range(self.num_agents): self.update_target[i]( sess=self.sess ) # TODO: Not sure, seems like the best thing to do is try using each agents own target first. # self.summary = tf.summary.merge_all() def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) # callback = self._init_callback(callback) # with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ # as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [[0.0] * self.num_agents] #MA-MOD episode_successes = [] #callback.on_training_start(locals(), globals()) #callback.on_rollout_start() reset = True obs = self.env.reset() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. 
- self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): env_action = [] # MA-MOD for i in range(self.num_agents ): # MA-MOD. This is fine for one policy. action = self.act[i]( np.array(obs[i])[None], update_eps=update_eps, **kwargs )[0] # TODO: Is this the correct way to get the correct agent obs? env_action.append(action) reset = False new_obs, rew, done, info = self.env.step( env_action ) # NOUPDATE - env.step should take a vector of actions ''' Obs: x_me, x_opp --- agent 1. In env: x_1, x_2 Obs: x_me, x_opp -- agent 2. In env: x_2, x_1 Env: (n_agents, state_dim) ''' self.num_timesteps += 1 # Stop training if return value is False # if callback.on_step() is False: # break # Store transition in the replay buffer. # Loop for replay buffer -- either separate or joined. obs[agent_index], action[agent_index], reward[agent_index] # Joey: Does this look right to you? # print(obs, action, rew, new_obs, done) #print("obs",obs[0]) #print(action) #print("ac", action[0]) #print("rew", rew[0]) #print("done", done[0]) for num_agent in range(self.num_agents): self.replay_buffer.add(obs[num_agent], env_action[num_agent], rew[num_agent], new_obs[num_agent], float(done[num_agent])) obs = new_obs # if writer is not None: # ep_rew = np.array([rew]).reshape((1, -1)) # ep_done = np.array([done]).reshape((1, -1)) # tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, # self.num_timesteps) # TODO: current episode_rewards is a list, make it a list of lists where each list is the reward for each agent in all timesteps # append the newest reward to the end of each list for each agent for num_agent in range(self.num_agents): #MA-MOD episode_rewards[-1][num_agent] += rew[num_agent] if done.any(): maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append([0.0] * self.num_agents) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # callback.on_rollout_end() for i in range(self.num_agents): # MA-MOD # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking # if writer is not None: # # run loss backprop with summary, but once every 100 steps save the metadata # # (memory, compute time, ...) 
# if (1 + self.num_timesteps) % 100 == 0: # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) # run_metadata = tf.RunMetadata() # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess, options=run_options, # run_metadata=run_metadata) # writer.add_run_metadata(run_metadata, 'step%d_agent%d' % (self.num_timesteps, i)) # else: # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess) # writer.add_summary(summary, self.num_timesteps) # else: td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: # NOUPDATE - not inside main agent for loop new_priorities = np.abs( td_errors) + self.prioritized_replay_eps # NOUPDATE assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) # callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. for i in range(self.num_agents): self.update_target[i](sess=self.sess) # MA-MOD if len(episode_rewards[-101:-1]) == 0: # MA-MOD mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) #MA-MOD # below is what's logged in terminal. num_episodes = len(episode_rewards) #MA-MOD if self.verbose >= 1 and done.any( ) and log_interval is not None and len( episode_rewards) % log_interval == 0: #MA-MOD logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() return self def predict( self, observation, agent_idx, state=None, mask=None, deterministic=True): # MA-MOD - added `agent_idx` as a parameter observation = np.array(observation) vectorized_env = self._is_vectorized_observation( observation, self.observation_space) observation = observation.reshape((-1, ) + self.observation_space.shape) with self.sess.as_default(): actions, _, _ = self.step_model[agent_idx].step( observation, deterministic=deterministic) if not vectorized_env: actions = actions[0] return actions, None # No one ever calls this, so we don't need it? def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): print("Should not be called") return None ''' observation = np.array(observation) vectorized_env = self._is_vectorized_observation(observation, self.observation_space) observation = observation.reshape((-1,) + self.observation_space.shape) actions_proba = self.proba_step(observation, state, mask) if actions is not None: # comparing the action distribution, to given actions actions = np.array([actions]) assert isinstance(self.action_space, gym.spaces.Discrete) actions = actions.reshape((-1,)) assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations." 
actions_proba = actions_proba[np.arange(actions.shape[0]), actions] # normalize action proba shape actions_proba = actions_proba.reshape((-1, 1)) if logp: actions_proba = np.log(actions_proba) if not vectorized_env: if state is not None: raise ValueError("Error: The environment must be vectorized when using recurrent policies.") actions_proba = actions_proba[0] return actions_proba ''' def get_parameter_list(self): print(self.params) return self.params def save(self, save_path, cloudpickle=False): # params data = { "double_q": self.double_q, "param_noise": self.param_noise, "learning_starts": self.learning_starts, "train_freq": self.train_freq, "prioritized_replay": self.prioritized_replay, "prioritized_replay_eps": self.prioritized_replay_eps, "batch_size": self.batch_size, "target_network_update_freq": self.target_network_update_freq, "prioritized_replay_alpha": self.prioritized_replay_alpha, "prioritized_replay_beta0": self.prioritized_replay_beta0, "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, "exploration_final_eps": self.exploration_final_eps, "exploration_fraction": self.exploration_fraction, "learning_rate": self.learning_rate, "gamma": self.gamma, "verbose": self.verbose, "observation_space": self.observation_space, "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs, "num_agents": self.num_agents } params_to_save = self.get_parameters() # print(params_to_save) self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
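# A minimal evaluation sketch for the multi-agent DQN variant above. `make_multi_agent_env` is a
# hypothetical constructor (not part of this code) whose reset()/step() return one observation,
# reward and done flag per agent, matching the obs[i] layout that learn() assumes. Illustrative only.
import numpy as np

def evaluate_multi_agent(model, make_multi_agent_env, n_episodes=10):
    env = make_multi_agent_env()
    returns = np.zeros((n_episodes, model.num_agents))
    for ep in range(n_episodes):
        obs = env.reset()
        done = np.array([False] * model.num_agents)
        while not done.any():
            # query each agent's own head through the agent_idx argument of predict()
            actions = [model.predict(obs[i], agent_idx=i)[0] for i in range(model.num_agents)]
            obs, rew, done, _ = env.step(actions)
            done = np.asarray(done)
            returns[ep] += rew
    return returns.mean(axis=0)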
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) print("args are", self.kappa, self.phi_grad_update_freq, self.seed, np.random.randint(100)) with SetVerbosity(self.verbose): # Create the replay buffer self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) #self.exploration = PiecewiseSchedule([(0, 1.0), (int(1e6), 0.1), (int(1e7), 0.01)], outside_value=0.01) episode_rewards = [0.0] episode_successes = [] #td_errors_mean = [] #td_phi_errors_mean = [] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1,)) for _ in range(total_timesteps): #if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. # if callback(locals(), globals()) is False: # break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # Use v_phi as zero until buffere is filled if self.num_timesteps <= self.buffer_size: weights = np.zeros_like(rewards) with self.sess.as_default(): #actions_policy = self.act(obses_t) actions_policy_phi = self.act(obses_tp1) _, td_errors = self._train_step(obses_t, actions, actions_policy_phi, actions_policy_phi, rewards, obses_tp1, obses_tp1, obses_t, obses_tp1, obses_tp1, dones, weights, sess=self.sess) #td_errors_mean.append(np.mean(td_errors)) if can_sample and self.kappa != 1.0 and self.num_timesteps >= self.buffer_size and \ self.num_timesteps % (self.phi_grad_update_freq * self.train_freq) == 0: #print("updating vf phi now", self.num_timesteps) #td_phi_err = [] for i in range(self.phi_grad_update_freq): #int(self.phi_grad_update_freq / self.train_freq)): obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size) weights, batch_idxes = np.ones_like(rewards), None with self.sess.as_default(): #actions_policy = self.act(obses_t) actions_policy_phi = self.act(obses_tp1) _, td_phi_errors = self._train_phi_step(obses_t, actions, actions_policy_phi, actions_policy_phi, rewards, obses_tp1, obses_tp1, obses_t, obses_tp1, obses_tp1, dones, weights, sess=self.sess) #_, q_values_st = self.q_value_st(obses_t, actions, actions_policy_phi, actions_policy_phi, rewards, obses_tp1, obses_tp1, obses_t, obses_tp1, obses_tp1, dones, weights, # sess=self.sess) #td_phi_err.append(np.mean(td_phi_errors)) #print("td errors after phi update", np.mean(td_phi_err)) #print("q vals", np.mean(q_values_st)) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: with self.timed("eval time"): if self.test_env is not None and len(episode_rewards) % (10 * log_interval) == 0: eval_return, actual_return = self.evaluate_agent(self.test_env) else: eval_return, actual_return = None, None logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("eval return", eval_return) logger.record_tabular("actual return", actual_return) #logger.record_tabular("td errors", np.mean(td_errors_mean)) #logger.record_tabular("td errors phi", np.mean(td_phi_errors_mean)) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() #td_errors_mean = [] #td_phi_errors_mean = [] if self.checkpoint_path is not None and self.num_timesteps % self.checkpoint_freq == 0: self.save(self.checkpoint_path) self.num_timesteps += 1 return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) # callback = self._init_callback(callback) # with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ # as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [[0.0] * self.num_agents] #MA-MOD episode_successes = [] #callback.on_training_start(locals(), globals()) #callback.on_rollout_start() reset = True obs = self.env.reset() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): env_action = [] # MA-MOD for i in range(self.num_agents ): # MA-MOD. This is fine for one policy. action = self.act[i]( np.array(obs[i])[None], update_eps=update_eps, **kwargs )[0] # TODO: Is this the correct way to get the correct agent obs? env_action.append(action) reset = False new_obs, rew, done, info = self.env.step( env_action ) # NOUPDATE - env.step should take a vector of actions ''' Obs: x_me, x_opp --- agent 1. In env: x_1, x_2 Obs: x_me, x_opp -- agent 2. In env: x_2, x_1 Env: (n_agents, state_dim) ''' self.num_timesteps += 1 # Stop training if return value is False # if callback.on_step() is False: # break # Store transition in the replay buffer. # Loop for replay buffer -- either separate or joined. obs[agent_index], action[agent_index], reward[agent_index] # Joey: Does this look right to you? 
# print(obs, action, rew, new_obs, done) #print("obs",obs[0]) #print(action) #print("ac", action[0]) #print("rew", rew[0]) #print("done", done[0]) for num_agent in range(self.num_agents): self.replay_buffer.add(obs[num_agent], env_action[num_agent], rew[num_agent], new_obs[num_agent], float(done[num_agent])) obs = new_obs # if writer is not None: # ep_rew = np.array([rew]).reshape((1, -1)) # ep_done = np.array([done]).reshape((1, -1)) # tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, # self.num_timesteps) # TODO: current episode_rewards is a list, make it a list of lists where each list is the reward for each agent in all timesteps # append the newest reward to the end of each list for each agent for num_agent in range(self.num_agents): #MA-MOD episode_rewards[-1][num_agent] += rew[num_agent] if done.any(): maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append([0.0] * self.num_agents) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # callback.on_rollout_end() for i in range(self.num_agents): # MA-MOD # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking # if writer is not None: # # run loss backprop with summary, but once every 100 steps save the metadata # # (memory, compute time, ...) # if (1 + self.num_timesteps) % 100 == 0: # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) # run_metadata = tf.RunMetadata() # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess, options=run_options, # run_metadata=run_metadata) # writer.add_run_metadata(run_metadata, 'step%d_agent%d' % (self.num_timesteps, i)) # else: # summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, # dones, weights, sess=self.sess) # writer.add_summary(summary, self.num_timesteps) # else: td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: # NOUPDATE - not inside main agent for loop new_priorities = np.abs( td_errors) + self.prioritized_replay_eps # NOUPDATE assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) # callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
for i in range(self.num_agents): self.update_target[i](sess=self.sess) # MA-MOD if len(episode_rewards[-101:-1]) == 0: # MA-MOD mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) #MA-MOD # below is what's logged in terminal. num_episodes = len(episode_rewards) #MA-MOD if self.verbose >= 1 and done.any( ) and log_interval is not None and len( episode_rewards) % log_interval == 0: #MA-MOD logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() return self
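# The update_param_noise_threshold computed in the loop above follows the referenced Appendix C.1
# of Plappert et al. (2017): under eps-greedy exploration the greedy action is chosen with
# probability 1 - eps + eps/|A|, and the parameter-noise scale is adapted so the perturbed policy
# stays within -log(1 - eps + eps/|A|) of the unperturbed one. A standalone restatement
# (hypothetical helper, not part of the class above):
import numpy as np

def param_noise_threshold(eps, n_actions):
    """Distance threshold equivalent to eps-greedy exploration with rate `eps`."""
    return -np.log(1. - eps + eps / float(n_actions))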
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None, save_interval=None, save_path=None): print('----------------------------------------------') print('| L E A R N |') print('----------------------------------------------') print("num timesteps = " + str(int(total_timesteps / 1000)) + 'k') print("save_interval = " + str(int(save_interval / 1000)) + 'k') print() k = 10 save_interval_st = save_interval new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # upgrade # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_win_rates = [0.0] episode_successes = [] obs, obs_nf = self.env.reset() reset = True self.episode_reward = np.zeros((1,)) self.win_rate = np.zeros((1,)) # print(obs_nf) """ Use pruning (action filtering) during exploration """ prev2s = [None, None] def input_formate(obs): return obs.transpose((1, 2, 0)) for _ in tqdm(range(total_timesteps)): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1.
- self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # tf.summary.scalar('update_eps', update_eps) with self.sess.as_default(): # never explore here; this was originally update_eps=update_eps action = self.act(np.array(input_formate(obs))[None], update_eps=-1, **kwargs)[0] filter_action = random.randint(0, 5) if type(obs_nf) == tuple: obs_nf = obs_nf[0] filter_action = feature_utils.get_modify_act(obs_nf, filter_action, prev2s, nokick=True) filter_action = feature_utils.get_act_abs(obs_nf, filter_action, rang=8) # estimate the filter_action distribution from 100 random samples fil_acts = [] for _ in range(100): rand_act = random.randint(0, 5) fil_act = feature_utils.get_modify_act(obs_nf, rand_act, prev2s, nokick=True) fil_act = feature_utils.get_act_abs(obs_nf, fil_act, rang=8) fil_acts.append(fil_act) # print('fil', fil_acts) # print() fil_acts = np.eye(65)[fil_acts] # print('eye', fil_acts) # print() fil_acts = fil_acts.sum(axis=0) # print('sum', fil_acts) # print() if random.random() < update_eps: action = filter_action env_action = action reset = False new_obs, rew, done, info, new_obs_nf = self.env.step(env_action) # .ntc self.replay_buffer.add(input_formate(obs), action, rew, input_formate(new_obs), float(done), fil_acts) ''' HER ''' self.temp_buffer.append((obs, action, rew, new_obs, float(done), fil_acts)) if len(self.temp_buffer) >= self.temp_size: for t in range(self.temp_size): s, a, r, s_n, d, fa = self.temp_buffer[t] for k in range(self.k): _s = copy.deepcopy(s) _a = a _r = copy.deepcopy(r) _s_n = copy.deepcopy(s_n) future = np.random.randint(t, self.temp_size) s_f, _a_f, _, _, _, _ = self.temp_buffer[future] g_map = s_f[-2] _s[-1] = g_map # print(_s_n[-2][goal]) if (_s_n[-2] == g_map).all() or ( (_s[-2] == _s[-1]).all() and _a_f == a == 64): # check whether _s reached the goal by taking a # if (_s[-2]) or g == 64: # whether the agent stayed in place # print('HER') _r = _r + 0.01 self.replay_buffer.add(input_formate(_s), a, _r, input_formate(_s_n), d, fa) self.temp_buffer.clear() obs = new_obs obs_nf = new_obs_nf if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_win = np.array([info]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) self.win_rate = total_rate_logger(self.win_rate, ep_win, ep_done, writer, self.num_timesteps, name='win_rate') episode_rewards[-1] += rew episode_win_rates[-1] += info if done: maybe_is_success = (rew > 0) # info.get('is_success') # .ntc if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs, obs_nf = self.env.reset() episode_rewards.append(0.0) episode_win_rates.append(0.0) reset = True prev2s = [None, None] # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # print('Sampling ... ...', self.num_timesteps) # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
if self.prioritized_replay: experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones, filter_actions = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # print(rewards.shape) # print(dones.shape) # print(actions.shape) if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) # print("fils", filter_actions) # print("acts", actions) # print(' Training ... ...') if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors, kl_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, filter_actions, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors, kl_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, filter_actions, sess=self.sess) # print('er', pr[0]) # print('kl', pr[1]) # print('x', pr[2]) # print('y', pr[3]) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs(td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities(batch_idxes, new_priorities) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) if len(episode_win_rates[-101:-1]) == 0: mean_100ep_win_rate = -np.inf else: mean_100ep_win_rate = round(float(np.mean(episode_win_rates[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("mean 100 win rate", mean_100ep_win_rate) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() # save interval if self.num_timesteps >= save_interval_st: save_interval_st += save_interval s_path = save_path + '_' + str(int(self.num_timesteps / 1000)) + 'k.zip' self.save(save_path=s_path) self.num_timesteps += 1 return self
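# A simplified sketch of the future-goal relabelling used above (Hindsight Experience Replay,
# Andrychowicz et al., 2017). The names and the dict-based goal layout here are hypothetical;
# the code above instead stores the goal as a channel of the observation map and adds a +0.01
# reward bonus when the relabelled goal is reached.
import copy
import numpy as np

def her_relabel(episode, replay_buffer, k=4, bonus=0.01):
    """episode: list of (obs, action, reward, next_obs, done), where obs['achieved'] and obs['goal'] are arrays."""
    for t, (obs, act, rew, next_obs, done) in enumerate(episode):
        replay_buffer.add(obs, act, rew, next_obs, done)  # original transition
        for _ in range(k):
            future = np.random.randint(t, len(episode))   # a future step supplies the new goal
            new_goal = episode[future][0]['achieved']
            relabeled, relabeled_next = copy.deepcopy(obs), copy.deepcopy(next_obs)
            relabeled['goal'] = new_goal
            relabeled_next['goal'] = new_goal
            reached = np.array_equal(relabeled_next['achieved'], new_goal)
            replay_buffer.add(relabeled, act, rew + bonus if reached else rew, relabeled_next, done)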
from stable_baselines.common.schedules import ConstantSchedule, LinearSchedule my_step_limit = 250 my_step_size = 0.01745*5 my_maxspeed = 1 my_randomBall = True my_binaryReward = True print("CARS_PPO2_DISCRETE.py LESS GO") env = CustomEnv(step_limit=my_step_limit, step_size=my_step_size, maxspeed=my_maxspeed, randomBall=my_randomBall, binaryReward=my_binaryReward) # 0.01745*5 # Optional: PPO2 requires a vectorized environment to run # the env is now wrapped automatically when passing it to the constructor # env = DummyVecEnv([lambda: env]) timesteps = 150000 my_learning_rate = LinearSchedule(timesteps, 0.005, 0.0001) # default: 0.00025 name = "CARS_BNR_fixedShape_newObs_ppo2_LR_" + "LinearSchedule_" + "timesteps_" + str(timesteps) + "ep_length_" + str(my_step_limit) + "turnrate_" + str(my_step_size) + "maxspeed_" + str(my_maxspeed) + "randomBall_" + str(my_randomBall) + "binaryReward_" + str(my_binaryReward) # Configure tensorflow using GPU # Use tensorboard to show reward over time etc model = PPO2(MlpPolicy, env, learning_rate=my_learning_rate.value, verbose=1, tensorboard_log="/home/fritz/Documents/BA/TensorBoardLogs/CARS3") # defaults: learning_rate=2.5e-4, model.learn(total_timesteps=timesteps, tb_log_name=name) model.save("../Models/" + name) try: f = open("../Envparameters/envparameters_" + name, "x") f.write(str([my_step_limit, my_step_size, my_maxspeed, my_randomBall, my_binaryReward])) f.close() except FileExistsError: pass # assumed handling: open(..., "x") raises if the parameter file already exists
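# Note on the schedule above: in stable-baselines 2.x, PPO2 calls a callable learning_rate with the
# remaining-progress fraction (annealing from 1 towards 0), not with a timestep count, so
# LinearSchedule(timesteps, ...).value is only ever evaluated on values in (0, 1] and stays close to
# its initial_p. Elsewhere in this document schedules are built with schedule_timesteps=1 so that the
# progress fraction itself drives the interpolation. If a linear sweep between 5e-3 and 1e-4 over
# training is intended, a progress-based callable (assuming that convention) is one option:
def linear_lr(progress_remaining, start=5e-3, end=1e-4):
    """progress_remaining goes from 1 (start of training) to 0 (end of training)."""
    return end + progress_remaining * (start - end)

# model = PPO2(MlpPolicy, env, learning_rate=linear_lr, verbose=1)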
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() reward_ = self._vec_normalize_env.get_original_reward().squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. 
self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done)) if self.expert_exp is not None: self.add_expert_exp() obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(self.num_timesteps), env=self._vec_normalize_env) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size, env=self._vec_normalize_env) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs(td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities(batch_idxes, new_priorities) callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() callback.on_training_end() return self
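# Prioritized replay as used above (Schaul et al., 2016): priorities are p_i = |td_error_i| + eps,
# transitions are sampled with probability P(i) proportional to p_i**alpha, and the `weights`
# passed to _train_step are importance-sampling corrections (N * P(i))**(-beta), with beta annealed
# towards 1 by beta_schedule. A small numpy illustration (the real computation lives inside
# PrioritizedReplayBuffer):
import numpy as np

def per_probabilities_and_weights(td_errors, alpha=0.6, beta=0.4, eps=1e-6):
    priorities = np.abs(td_errors) + eps
    probs = priorities ** alpha
    probs = probs / probs.sum()
    weights = (len(td_errors) * probs) ** (-beta)
    return probs, weights / weights.max()  # weights normalized by their maximum, as in the paper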
def main(args): """ Train a DQN agent on cartpole env :param args: (Parsed Arguments) the input arguments """ with tf_utils.make_session(8) as sess: # Create the environment env = gym.make("CartPole-v0") # Create all the functions necessary to train the model act, train, update_target, _ = deepq.build_train( q_func=CustomPolicy, ob_space=env.observation_space, ac_space=env.action_space, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), sess=sess) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. tf_utils.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for step in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(step))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) is_solved = step > 100 and mean_100ep_reward >= 200 if args.no_render and step > args.max_timesteps: break if is_solved: if args.no_render: break # Show off the result env.render() else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if step > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( 32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if step % 1000 == 0: update_target() if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", step) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(step))) logger.dump_tabular()
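# main(args) above reads args.no_render and args.max_timesteps; a hypothetical entry point that
# supplies them (argument names inferred from their usage, not taken from the original script):
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train a DQN agent on CartPole-v0")
    parser.add_argument('--no-render', action='store_true', default=False,
                        help="disable rendering and stop after --max-timesteps steps")
    parser.add_argument('--max-timesteps', type=int, default=int(1e5),
                        help="maximum number of environment steps when --no-render is set")
    main(parser.parse_args())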
isEscaping=False, enemy_model=defender_model, enemy_step_limit=defender_step_limit, enemy_step_size=defender_step_size, enemy_maxspeed=defender_maxspeed, enemy_acceleration=defender_acceleration) env = DummyVecEnv([lambda: env]) env2 = DummyVecEnv([lambda: env2]) attacker_model.set_env(env) defender_model.set_env(env2) timesteps2 = 500000 scheduler = LinearSchedule(timesteps2, 0.001, 0.0001) my_learning_rate2 = scheduler.value for i in range(10000): defender_model.learn(total_timesteps=timesteps2, tb_log_name=defender_name + str(i), log_interval=100) attacker_model.learn(total_timesteps=timesteps2 // 2, tb_log_name=attacker_name + str(i), log_interval=100) if (i % 10 == 0): attacker_model.save("../Models/" + attacker_name + str(i)) defender_model.save("../Models/" + defender_name + str(i)) if (False):
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) # Entropy tobe a large in the beginning self.ent_coef_schedule = LinearSchedule( schedule_timesteps=int(1e6), initial_p=0.1, final_p=0.01) runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma) self.episode_reward = np.zeros((self.n_envs, )) t_start = time.time() for update in range(1, total_timesteps // self.n_batch + 1): # true_reward is the reward without discount obs, states, rewards, masks, actions, values, true_reward = runner.run( ) _, value_loss, policy_entropy = self._train_step( update * self.n_batch, obs, states, rewards, masks, actions, values, update, writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, update * (self.n_batch + 1)) if callback is not None: callback(locals(), globals()) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * self.n_batch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) logger.dump_tabular() return self
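# The ent_coef_schedule above is set up to anneal the entropy bonus from 0.1 to 0.01 over the first
# 1e6 timesteps (how it is consumed inside _train_step is not shown in this excerpt).
# stable-baselines' LinearSchedule.value(t) interpolates as
#   initial_p + min(t / schedule_timesteps, 1) * (final_p - initial_p),
# so for example:
from stable_baselines.common.schedules import LinearSchedule

ent_schedule = LinearSchedule(schedule_timesteps=int(1e6), initial_p=0.1, final_p=0.01)
print(ent_schedule.value(0))        # 0.1 at the start of training
print(ent_schedule.value(500000))   # 0.055 halfway through the anneal
print(ent_schedule.value(2000000))  # clipped at 0.01 once the schedule is exhausted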
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for step in range(total_timesteps): if callback is not None: callback(locals(), globals()) # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(step) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(step) + self.exploration.value(step) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, step) episode_rewards[-1] += rew if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True if step > self.learning_starts and step % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(step)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) 
if (1 + step) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % step) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, step) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) if step > self.learning_starts and step % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", step) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(step))) logger.dump_tabular() return self
#define custom network b=64 class CustomPolicy(FeedForwardPolicy): def __init__(self, *args, **kwargs): super(CustomPolicy, self).__init__(*args, **kwargs,net_arch=[dict(pi=[b,b], vf=[b,b])],feature_extraction="mlp") e_c=0.01 #define entropy coeff feedback='Bayes' #'Markov' or 'Bayes' steady=True #if True resets always with steady state conditions N=1 #number of parallel workers LRo=2.5e-4 #learning rate uact=False #if we want to use u as action (only Bayesian) TIMESTEPS=int(50e6) #training steps sched_LR=LinearSchedule(1,LRo,0) #lr schedule LR=sched_LR.value qs=1e-3 #feedback cost (only Bayesian) dirname='Tesi_bayestraj' #directory name title='feed{}_steady{}_lro{}_ts{}M_N{}_ec{}_u{}0.35_1e5_hurw_excss'.format(feedback,steady,LRo,TIMESTEPS/1e6,N,e_c,uact) #make checkpoint callback checkpoint_callback = CheckpointCallback(save_freq=int(100000/N), save_path='/home/fallani/prova/New/Cavity_checkpoint/{}/{}_q{}'.format(dirname,title,qs)) callback = checkpoint_callback #set parameters and start training params={'k':1,'eta':1,'X_kunit':0.35} #if a parameter is set to None it will be sampled from a uniform distribution at every reset args={'feedback':feedback,'q':qs,'uact':uact,'steadyreset':steady,'pow':0.5,'params':params,'plot':False}#i parametri di default son questi: rewfunc=Tools.purity_like_rew,q=1e-4,dt=1e-3,plot=False,pow=0.5 #instantiate environment env = make_vec_env(CavityEnv,n_envs=N,env_kwargs=args) #instantiate model model=PPO2(CustomPolicy,env,n_steps=128,learning_rate=LR,lam=0.95,ent_coef=e_c,verbose=1,nminibatches=4,noptepochs=4,tensorboard_log='/home/fallani/prova/New/TRAIN_Cavity/{}/{}_q{}'.format(dirname,title,qs),seed=1) #train the model
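# The excerpt stops at "#train the model"; the presumed continuation, given the TIMESTEPS value and
# the checkpoint callback defined above, would be along these lines (the final save call and its
# path are assumptions, not from the original script):
model.learn(total_timesteps=TIMESTEPS, callback=callback)
# model.save('{}_q{}_final'.format(title, qs))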