def train(self):
    self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
    while True:
        info = self.agent.step()
        if info['update']:
            logger.logkvs(info['update'])
            logger.dumpkvs()
        if self.agent.rollout.stats['tcount'] > self.num_timesteps:
            break
    self.agent.stop_interaction()
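# A minimal, standalone sketch of the logkvs/dumpkvs pattern used in train() above,
# assuming a baselines-style logger module: logkvs() buffers a whole dict of
# key/value pairs and dumpkvs() flushes the buffer as a single row.
from stable_baselines import logger

def log_update_stats(update_info):
    # `update_info` is assumed to be a flat dict of scalar diagnostics.
    logger.logkvs(update_info)  # buffer all key/value pairs at once
    logger.dumpkvs()            # write one row and clear the buffer

# Illustrative call: log_update_stats({"n_updates": 10, "approx_kl": 0.02})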
def learn(self, obses, actions, learning_rate=None):
    # A shuffled copy of the actions serves as the "peer" samples.
    peer_actions = copy.deepcopy(actions)
    np.random.shuffle(peer_actions)
    feed_dict = {
        self.obs_ph: obses,
        self.actions_ph: actions[:, None],
        self.peer_actions_ph: peer_actions[:, None],
        self.model.learning_rate_ph: learning_rate or self.learning_rate,
        self.peer_ph: self.peer,
    }
    train_loss, peer_loss, _ = self.model.sess.run(
        [self.loss, self.peer_term, self.optim_op], feed_dict)
    logger.logkv("copier loss", train_loss)
    logger.logkv("peer term", peer_loss)
    logger.dumpkvs()
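# A quick NumPy check of the actions[:, None] indexing used in the feed dict above:
# `a[: None]` is just the full slice a[0:None] and keeps shape (3,), while `a[:, None]`
# adds a trailing axis and yields shape (3, 1), matching a (batch, 1) action placeholder.
import numpy as np

a = np.array([1, 2, 3])
assert a[: None].shape == (3,)     # same as a[:]
assert a[:, None].shape == (3, 1)  # column vector, one action per row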
def train(
    self,
    n_epochs: int = 100,
    *,
    on_epoch_end: Callable[[dict], None] = None,
    log_interval: int = 100,
):
    """Train with supervised learning for some number of epochs.

    Here an 'epoch' is just a complete pass through the expert transition dataset.

    Args:
        n_epochs: number of complete passes made through the dataset.
        on_epoch_end: optional callback to run at the end of each epoch. Will
            receive all locals from this function as a dictionary argument (!!).
    """
    assert self.batch_size >= 1
    samples_so_far = 0
    batch_num = 0
    for epoch_num in trange(n_epochs, desc="BC epoch"):
        while samples_so_far < (epoch_num + 1) * self.expert_dataset.size():
            batch_num += 1
            trans = self.expert_dataset.sample(self.batch_size)
            assert len(trans) == self.batch_size
            samples_so_far += self.batch_size
            feed_dict = {
                self._true_acts_ph: trans.acts,
                self.policy.obs_ph: trans.obs,
            }
            _, stats_dict = self.sess.run(
                [self._train_op, self._stats_dict], feed_dict=feed_dict
            )
            stats_dict["epoch_num"] = epoch_num
            stats_dict["n_updates"] = batch_num
            stats_dict["batch_size"] = self.batch_size
            if batch_num % log_interval == 0:
                for k, v in stats_dict.items():
                    logger.logkv(k, v)
                logger.dumpkvs()
        if on_epoch_end is not None:
            on_epoch_end(locals())
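# A minimal sketch of an on_epoch_end callback for the trainer above. Per the docstring,
# the callback receives train()'s locals() as a plain dict, so locals visible in that
# function (e.g. epoch_num, batch_num, stats_dict) can be read from it; relying on any
# particular key is an assumption about that function's internals.
def print_epoch_progress(train_locals: dict) -> None:
    epoch_num = train_locals.get("epoch_num")
    batch_num = train_locals.get("batch_num")
    print(f"finished epoch {epoch_num} after {batch_num} gradient updates")

# Hypothetical usage: bc_trainer.train(n_epochs=10, on_epoch_end=print_epoch_progress)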
def test_no_accum(tmpdir):
    logger.configure(tmpdir, ["csv"])
    sb_logger.logkv("A", 1)
    sb_logger.logkv("B", 1)
    sb_logger.dumpkvs()
    sb_logger.logkv("A", 2)
    sb_logger.dumpkvs()
    sb_logger.logkv("B", 3)
    sb_logger.dumpkvs()
    expect = {"A": [1, 2, ""], "B": [1, "", 3]}
    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect)
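# For reference, the progress.csv the test above expects (column order is illustrative):
# each dumpkvs() call emits one row, and keys not logged before that dump are left blank
# rather than carried over from earlier rows.
#
#   A,B
#   1,1
#   2,
#   ,3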
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) bestscore = 0 new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) fig = plt.figure() ax = fig.add_subplot(111) x, y = [0], [0] with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() episode_stats = EpisodeStats(self.n_steps, self.n_envs) t_first_start = time.time() n_updates = total_timesteps // self.n_batch callback.on_training_start(locals(), globals()) for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0, ( "The number of minibatches (`nminibatches`) " "is not a factor of the total number of samples " "collected per rollout (`n_batch`), " "some samples won't be used.") batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) callback.on_rollout_start() # true_reward is the reward without discount rollout = self.runner.run(callback) # Unpack obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout callback.update_locals(locals()) callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break episode_stats.feed(true_reward, masks) self.ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = max( self.n_batch // self.nminibatches // self.noptepochs, 1) inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = max( self.n_batch // self.nminibatches // self.noptepochs // self.n_steps, 1) assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), 
writer, self.num_timesteps) if self.verbose == 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) #if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: #logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) #logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) logger.logkv("mean_episode_length", episode_stats.mean_length()) logger.logkv("mean_episode_reward", episode_stats.mean_reward()) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if self.verbose == 2 and ( update % log_interval == 0 or update == 1) and episode_stats.mean_reward() > bestscore: bestscore = episode_stats.mean_reward() logger.logkv('time_elapsed', t_start - t_first_start) logger.logkv("mean_episode_reward", bestscore) logger.dumpkvs() x.append(self.num_timesteps) y.append(bestscore) ax.plot(x, y, marker='.', color='b') fig.canvas.draw() ax.set_xlim(left=0, right=total_timesteps) ax.set(title='Street Fighter 2 AI - PPO2 Algorithm', ylabel='Fitness score', xlabel='Timesteps') fig.show() plt.pause(0.001) callback.on_training_end() return self
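# A small sketch of the linear annealing driving lr_now/cliprange_now in the PPO2 learn()
# above: get_schedule_fn is documented to wrap a constant into a callable and leave
# callables untouched, and frac walks from 1.0 down toward 0 over the updates. The two
# helpers below are illustrative stand-ins, not the library code.
def constfn(val):
    return lambda _frac: val

def linear_schedule(base):
    return lambda frac: base * frac

n_updates = 4
fracs = [1.0 - (update - 1.0) / n_updates for update in range(1, n_updates + 1)]
assert fracs == [1.0, 0.75, 0.5, 0.25]
assert constfn(3e-4)(0.1) == 3e-4                 # constant schedule ignores frac
assert linear_schedule(2.5e-4)(fracs[0]) == 2.5e-4  # linear schedule starts at the base value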
def dumpkvs() -> None:
    """Alias for `stable_baselines.logger.dumpkvs`."""
    sb_logger.dumpkvs()
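# Minimal usage sketch of the buffered key/value logging that the alias above forwards
# to, shown against stable_baselines.logger directly; the output folder is illustrative.
from stable_baselines import logger as sb_logger

sb_logger.configure(folder="/tmp/log_demo", format_strs=["stdout", "csv"])
sb_logger.logkv("loss", 0.5)
sb_logger.logkv("epoch", 1)
sb_logger.dumpkvs()  # writes one row containing both keys, then clears the buffer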
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2"):
    # Transform to callable if needed
    self.learning_rate = get_schedule_fn(self.learning_rate)
    self.cliprange = get_schedule_fn(self.cliprange)

    with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
        self._setup_learn()

        runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
        self.episode_reward = np.zeros((self.n_envs,))

        ep_info_buf = deque(maxlen=100)
        t_first_start = time.time()

        n_timesteps = 0
        # nupdates = total_timesteps // self.n_batch
        for timestep in range(1, total_timesteps + 1):
            assert self.n_batch % self.nminibatches == 0
            batch_size = self.n_batch // self.nminibatches
            t_start = time.time()
            frac = 1.0 - timestep / total_timesteps
            lr_now = self.learning_rate(frac)
            cliprangenow = self.cliprange(frac)
            # true_reward is the reward without discount
            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
            n_timesteps += len(obs)
            ep_info_buf.extend(ep_infos)
            mb_loss_vals = []
            if states is None:  # nonrecurrent version
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, batch_size):
                        # timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) //
                        #             batch_size)
                        end = start + batch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
                                                             update=n_timesteps))
            else:  # recurrent version
                assert self.n_envs % self.nminibatches == 0
                env_indices = np.arange(self.n_envs)
                flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                envs_per_batch = batch_size // self.n_steps
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(env_indices)
                    for start in range(0, self.n_envs, envs_per_batch):
                        # timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) //
                        #             envs_per_batch)
                        end = start + envs_per_batch
                        mb_env_inds = env_indices[start:end]
                        mb_flat_inds = flat_indices[mb_env_inds].ravel()
                        slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_states = states[mb_env_inds]
                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=n_timesteps,
                                                             writer=writer, states=mb_states))

            loss_vals = np.mean(mb_loss_vals, axis=0)
            t_now = time.time()
            fps = int(self.n_batch / (t_now - t_start))

            if writer is not None:
                self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
                                                                  masks.reshape((self.n_envs, self.n_steps)),
                                                                  writer, n_timesteps)

            if self.verbose >= 1 and (timestep % log_interval == 0 or timestep == 1):
                explained_var = explained_variance(values, returns)
                logger.logkv("total_timesteps", n_timesteps)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(explained_var))
                logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            if n_timesteps > total_timesteps:
                break

        return self
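# A self-contained NumPy sketch of the minibatch scheme used by the non-recurrent branch
# above: shuffle flat sample indices once per optimisation epoch, then walk them in
# contiguous slices of batch_size. Names here are illustrative.
import numpy as np

def iterate_minibatches(n_samples, batch_size, n_epochs, rng=np.random):
    inds = np.arange(n_samples)
    for _ in range(n_epochs):
        rng.shuffle(inds)
        for start in range(0, n_samples, batch_size):
            yield inds[start:start + batch_size]

# Example: 8 samples, batches of 4, 2 passes -> 4 index slices of length 4.
batches = list(iterate_minibatches(8, 4, 2))
assert len(batches) == 4 and all(len(b) == 4 for b in batches)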
def learn(self, total_timesteps, env_eval, callback=None, seed=None, path=None, dis_path=None, score_path=None, dis_eval_interval=100, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): self.eval_env = env_eval new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] dis_eval_array = [] # (total_step % eval_intervel) x 2 x n_batch self.ep_length = 0 for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() noise = np.zeros(self.noise_dim) else: noise = self.policy_tf.gen_noise(obs[None]).flatten() action = self.policy_tf.step(obs[None],noise[None] ,deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) self.ep_length += 1 # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done), noise) episode_rewards[-1] += reward reset_flag = done or self.ep_length >= self.max_ep_length if reset_flag: if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() episode_rewards.append(0.0) self.ep_length = 0 maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) else: obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr, dis_eval_array, dis_eval_interval, dis_path)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if self.num_timesteps % 2000 == 0: eval_ob = self.eval_env.reset() eval_epi_rewards = 0 eval_epis = 0 eval_performance = [] eval_ep_step = 0 while True: eval_noise = self.policy_tf.gen_noise(eval_ob[None]).flatten() eval_action = self.policy_tf.step(eval_ob[None], eval_noise[None], deterministic=True).flatten() eval_rescaled_action = eval_action * np.abs(self.action_space.low) eval_new_obs, eval_reward, eval_done, eval_info = self.eval_env.step(eval_rescaled_action) eval_epi_rewards += eval_reward eval_ob = eval_new_obs eval_ep_step += 1 if eval_done or eval_ep_step >= self.max_ep_length: eval_ob = self.eval_env.reset() eval_performance.append(eval_epi_rewards) eval_epi_rewards = 0 eval_epis += 1 eval_ep_step = 0 if eval_epis > 5: break with open(score_path, 'a') as f2: f2.write("%i %f\n" % (self.num_timesteps, np.mean(eval_performance))) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and reset_flag and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) with open(path,'a') as f1: f1.write("%f " % step) f1.write("%f " % mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) with open(path,'a') as f1: f1.write("%f " % safe_mean([ep_info['r'] for ep_info in ep_info_buf])) f1.write("%f " % safe_mean([ep_info['l'] for ep_info in 
ep_info_buf])) logger.logkv("n_updates", n_updates) with open(path,'a') as f1: f1.write("%f " % n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
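# A small sketch of the "mean 100 episode reward" statistic logged above: the running
# list always ends with the (possibly unfinished) current episode, so the slice
# [-101:-1] covers up to 100 completed episodes and skips the live one.
import numpy as np

def mean_100_episode_reward(episode_rewards):
    completed = episode_rewards[-101:-1]
    if len(completed) == 0:
        return -np.inf
    return round(float(np.mean(completed)), 1)

assert mean_100_episode_reward([0.0]) == -np.inf         # nothing finished yet
assert mean_100_episode_reward([1.0, 3.0, 0.0]) == 2.0   # mean over the two finished episodes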
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() n_updates = 0 infos_values = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() for step in range(total_timesteps): # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step(obs[None]).flatten() # Add noise to the action, as the policy # is deterministic, this is required for exploration if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. callback.update_locals(locals()) if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, reward # Store transition in the replay buffer. 
self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) # Note: the policy is updated less frequently than the Q functions # this is controlled by the `policy_delay` parameter mb_infos_vals.append( self._train_step(step, writer, current_lr, (step + grad_step) % self.policy_delay == 0)) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) callback.on_rollout_start() episode_rewards[-1] += reward_ if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) # substract 1 as we appended a new term just now num_episodes = len(episode_rewards) - 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'eplenmean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] callback.on_training_end() return self
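# A hedged sketch of the affine [-1, 1] <-> [low, high] mapping that scale_action /
# unscale_action perform in the TD3 loop above. These helpers are a generic
# reimplementation for illustration (different names and signatures), not the library code.
import numpy as np

def to_unit_interval(low, high, unscaled_action):
    """Map an action from [low, high] to [-1, 1]."""
    return 2.0 * (unscaled_action - low) / (high - low) - 1.0

def from_unit_interval(low, high, scaled_action):
    """Map an action from [-1, 1] back to [low, high]."""
    return low + 0.5 * (scaled_action + 1.0) * (high - low)

low, high = np.array([-2.0]), np.array([2.0])
a = np.array([1.0])
assert np.allclose(from_unit_interval(low, high, to_unit_interval(low, high, a)), a)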
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) start_time = time.time() episode_rewards = [0.0] is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() else: obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 infos_values = [] mb_infos_vals = [] for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if step < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) ep_len += 1 if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: print("{} steps".format(ep_len)) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, step) if ep_len > self.train_freq: print("Additional training") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) done = True episode_rewards[-1] += reward if done: if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() print("Episode finished. 
Reward: {:.2f} {} Steps".format( episode_rewards[-1], ep_len)) episode_rewards.append(0.0) ep_len = 0 mb_infos_vals = self.optimize(step, writer, current_lr) # Refresh obs when using TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] if is_teleop_env: self.env.is_training = False # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) return self
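# Sketch of the Monitor-wrapper episode info consumed above: when an episode ends,
# info['episode'] is a dict with the total reward 'r' and length 'l', and a
# safe_mean-style helper (reimplemented here for illustration) averages them over a
# bounded deque to produce ep_rewmean / eplenmean.
from collections import deque
import numpy as np

def safe_mean(arr):
    # mean that returns nan on an empty sequence instead of raising a warning
    return np.nan if len(arr) == 0 else float(np.mean(arr))

ep_info_buf = deque(maxlen=100)
ep_info_buf.extend([{"r": 10.0, "l": 200}, {"r": 20.0, "l": 100}])
assert safe_mean([ep["r"] for ep in ep_info_buf]) == 15.0
assert safe_mean([ep["l"] for ep in ep_info_buf]) == 150.0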
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True, vae=None): # making the learning rate and clip range callable here. self.writer2 = tf.summary.FileWriter('/tmp/ppo/PPO_test', graph=self.graph) self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) new_tb_log = self._init_num_timesteps( reset_num_timesteps=reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch for update in range(1, nupdates + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep)) self.num_timesteps += (self.n_batch * self.noptepochs ) // batch_size * update_fac """Optimize the VAE""" time_start = time.time() vae.optimize() print("Time to optimize the VAE: ", time.time() - time_start) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if update % 2 == 0: self.env.testing = True ob = np.zeros((self.env.num_envs, ) + self.env.observation_space.shape) ob[:] = self.env.reset() total_reward_test = 0 for i in range(1000): print("TESTING") actions = self.step(ob) ob[:], reward, _, _ = self.env.step(actions) total_reward_test += total_reward_test + reward summary2 = tf.Summary(value=[ tf.Summary.Value(tag="episode_reward", simple_value=total_reward_test) ]) self.writer2.add_summary(summary2) else: self.env.testing = False if writer is not None and not self.env.testing: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) 
logger.dumpkvs() return self
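# Minimal sketch of the manual scalar logging done with writer2 above, assuming the
# TensorFlow 1.x API that stable-baselines targets; the log directory is illustrative.
# Passing the step to add_summary gives TensorBoard an x-axis for the points.
import tensorflow as tf  # TF 1.x

demo_writer = tf.summary.FileWriter('/tmp/ppo/PPO_test_demo')
for step, reward in enumerate([1.0, 2.5, 3.0]):
    summary = tf.Summary(value=[tf.Summary.Value(tag="episode_reward", simple_value=reward)])
    demo_writer.add_summary(summary, step)  # one scalar point per call, indexed by step
demo_writer.flush()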
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None,use_action_repeat = False,poisson=False): new_tb_log = self._init_num_timesteps(reset_num_timesteps) self.use_action_repeat=use_action_repeat # self.action_repetition = 0.8 self.running_action_repetition = self.action_repetition self.poisson=poisson self.poisson_action = 4 self.poisson_mean = 4 prev_action = None # self.prob_past = 0.6 #self.env.act_rep-=(21-4)/float(total_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() # if(poisson): # np.concatenate((obs,)) # print(obs) self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] self.num_timesteps=0 for step in range(total_timesteps): if poisson: if(self.poisson_mean<1): self.poisson_mean=1 self.poisson_action = int(np.random.poisson(self.poisson_mean)) self.poisson_mean-=((5)/float(total_timesteps)) if(self.poisson_action<1): self.poisson_action=1 if use_action_repeat: # self.action_repetition-=((0.9)/float(total_timesteps)) amount = ((4)/float(total_timesteps)) self.running_action_repetition -= amount # print("Action repetition is :{}".format(self.action_repetition)) if(self.running_action_repetition<=2 and self.running_action_repetition>1): # if(self.action_repetition==4): # print("Flushing replay buffer 4, {}".format(self.action_repetition)) # self.replay_buffer = ReplayBuffer(self.buffer_size) self.action_repetition=2 if(self.running_action_repetition<=1): # if(self.action_repetition==2): # print("Flushing replay buffer 2, {}".format(self.action_repetition)) # self.replay_buffer = ReplayBuffer(self.buffer_size) self.action_repetition=1 # self.action_repetition = (self.action_repetition*amount +self.action_repetition-amount)/(1-amount+amount*self.action_repetition) # if(self.action_repetition<0): # self.action_repetition=0 # self.env.dec_act_rep((21-4)/float(total_timesteps)) # self.running_action_repetition -= ((6-1)/float(total_timesteps)) # self.action_repetition = int(self.running_action_repetition) # if(self.action_repetition<1): # self.action_repetition=1 if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. 
# Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() else: if poisson: action = self.policy_tf.step(np.concatenate((obs,np.array([self.poisson_action])))[None], deterministic=False).flatten() else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) # if use_action_repeat and prev_action is not None: # if(np.random.uniform(0,1)<self.action_repetition): # rescaled_action=prev_action assert action.shape == self.env.action_space.shape # Add action repetition # print("Action repetition is {}".format(self.action_repetition)) if self.use_action_repeat: repeated_reward = 0 # print("Repeating actions for: {}".format(int(rescaled_action[-1])+4)) for repeat_step in range(int(rescaled_action[-1])+4): prev_action = rescaled_action new_obs, reward, done, info = self.env.step(rescaled_action[:len(rescaled_action)-1]) repeated_reward+=reward buffer_action = action.copy() buffer_action[-1] = (rescaled_action[-1]+4-int(rescaled_action[-1]+4))+repeat_step+1 - 4 # print("Sub actions for: {}".format(buffer_action[-1])) # Add extra supervision # self.replay_buffer.add(obs, action, repeated_reward, new_obs, float(done)) if done: break reward = repeated_reward elif poisson: repeated_reward = 0 # print("Poisson repetition is {}".format(self.poisson_action)) for _ in range(self.poisson_action): # print("Repeating actions for: {}".format(self.action_repetition)) prev_action = rescaled_action new_obs, reward, done, info = self.env.step(rescaled_action) repeated_reward+=reward if done: break reward = repeated_reward else: new_obs, reward, done, info = self.env.step(rescaled_action) # Store transition in the replay buffer. 
if poisson: self.replay_buffer.add(np.concatenate((obs,np.array([self.poisson_action]))), action, reward, np.concatenate((new_obs,np.array([self.poisson_action]))), float(done)) else: self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: prev_action=None if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
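# A hedged sketch of the Poisson action-repetition idea in the loop above, factored into
# a helper: sample a repeat count k >= 1, replay the same action k times (or until done),
# and return the summed reward. `env` is any Gym-style env; the helper is illustrative.
import numpy as np

def step_with_poisson_repeat(env, action, poisson_mean, rng=np.random):
    k = max(1, int(rng.poisson(max(poisson_mean, 1))))
    repeated_reward = 0.0
    for _ in range(k):
        obs, reward, done, info = env.step(action)
        repeated_reward += reward
        if done:
            break
    return obs, repeated_reward, done, info, k

# Hypothetical usage: obs, r, done, info, k = step_with_poisson_repeat(env, action, poisson_mean=4)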
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() # define the victim_model if self.use_explanation: if self.pretrained_mimic: exp_test = GradientExp(self.mimic_model) else: exp_test = None else: exp_test = None nupdates = total_timesteps // self.n_batch obs_list = [] act_list = [] for update in range(1, nupdates + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward, \ obs_oppo, actions_oppo, o_next, o_opp_next, a_opp_next = runner.run() obs_opp_ph = obs_oppo action_oppo_ph = actions_oppo if update % 100 == 0 and self.save_victim_traj: obs_list.append(obs_oppo) act_list.append(actions_oppo) # todo calculate the attention paid on opponent attention = self.calculate_attention(obs_oppo=obs_opp_ph, action_oppo=action_oppo_ph, \ exp_test=exp_test, black_box_att=self.black_box_att, exp_method=self.exp_method) is_stochastic = False ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) slices_hua = (arr[mbinds] for arr in (a_opp_next, o_opp_next, attention)) mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, *slices_hua, is_stochastic=is_stochastic, ratio=self.mix_ratio, writer=writer, update=timestep)) self.num_timesteps += (self.n_batch * self.noptepochs ) // batch_size * update_fac else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) slices_hua = (arr[mb_flat_inds] for arr in (a_opp_next, o_opp_next, is_stochastic, attention)) mb_states = 
states[mb_env_inds] mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, *slices_hua, update=timestep, writer=writer, states=mb_states)) self.num_timesteps += (self.n_envs * self.noptepochs ) // envs_per_batch * update_fac loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) # print the attention weights if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'ep_len_mean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break model_file_name = "{0}agent_{1}.pkl".format( self.model_saved_loc, update * self.n_batch) if self.black_box_att: if update % 1000 == 0: print("Model saved at: {}".format(model_file_name)) self.save(model_file_name) else: if update % 1000 == 0: print("Model saved at: {}".format(model_file_name)) self.save(model_file_name) obs_numpy = np.vstack(obs_list) act_numpy = np.vstack(act_list) with open('../saved/trajectory.pkl', 'ab+') as f: pkl.dump([obs_numpy, act_numpy], f, protocol=2) return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) date = datetime.datetime.now().strftime('%Y-%h-%d-%Hh:%Mmin') logger.configure(folder=("Logs/"+date)) logger.info(datetime.datetime.now().strftime('Starting %Y %h %d %Hh %Mmin')) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam, init=reset_num_timesteps) self.episode_reward = np.zeros((self.n_envs,)) ep_info_buf = deque(maxlen=100) mean_update_time = deque(maxlen=50) l=[] for _ in range(100): l.append({'score': [0,0], 'r': 0, 'l': 0,}) ep_info_buf.extend(l) del l t_first_start = time.time() n_updates = total_timesteps // self.n_batch for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run() self.num_timesteps += self.n_batch ep_info_buf.extend(ep_infos) mb_loss_vals = [] # print(self.ent_coef) if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append(self._train_step(lr_now, self.ent_coef, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append(self._train_step(lr_now, self.ent_coef, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) # print(self.ent_coef) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: all_sum = [] t_masks = masks.reshape((self.n_steps, self.n_envs)).transpose() t_rew = 
true_reward.reshape((self.n_steps, self.n_envs)).transpose() # print(t_masks.shape) # print(t_rew.shape) for i in range(len(t_masks)): t_dones = np.nonzero(t_masks[i]) last_d = 0 for d in t_dones[0]: # print(t_dones) all_sum.append(np.sum(t_rew[i][last_d:(d+1)])) summary = tf.Summary(value=[tf.Summary.Value(tag="episode/reward", simple_value=np.sum(t_rew[i][last_d:d+1]))]) writer.add_summary(summary, self.episodes) self.episodes+=1 last_d = d+1 up_now = self.num_timesteps / self.n_batch if len(all_sum) is 0: for i in range(len(t_masks)): all_sum.append(np.sum(t_rew[i][0:])) # print(all_sum) summary = tf.Summary(value=[tf.Summary.Value(tag="episode/reward_mean", simple_value=np.asarray(all_sum).mean())]) writer.add_summary(summary, up_now) ep_score = np.array([(ep_info['score'][0]-ep_info['score'][1]) for ep_info in ep_info_buf]) summary = tf.Summary(value=[tf.Summary.Value(tag="episode/score_mean", simple_value=ep_score.mean())]) writer.add_summary(summary, up_now) win_rate = np.sum(ep_score > 0)/ep_score.size summary = tf.Summary(value=[tf.Summary.Value(tag="episode/win_rate", simple_value=win_rate)]) writer.add_summary(summary, up_now) draw_rate = np.sum(ep_score == 0)/ep_score.size summary = tf.Summary(value=[tf.Summary.Value(tag="episode/draw_rate", simple_value=draw_rate)]) writer.add_summary(summary, up_now) lose_rate = np.sum(ep_score < 0)/ep_score.size summary = tf.Summary(value=[tf.Summary.Value(tag="episode/lose_rate", simple_value=lose_rate)]) writer.add_summary(summary, up_now) # print(self.ent_coef) if ep_score.mean() > 0.6 and not self.versus_heuristic: del runner.adv_model runner.model.save('adv_ppo2') runner.adv_model = PPO2.load('adv_ppo2') logger.info('Adversary Updated ! ! ! LEVEL UP') # self.episode_reward = total_episode_reward_logger(self.episode_reward, # true_reward.reshape((self.n_envs, self.n_steps)), # masks.reshape((self.n_envs, self.n_steps)), # writer, self.num_timesteps) mean_update_time.append(abs(t_now - t_start)) mean_time = 0 for n in mean_update_time: mean_time += n if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('ep_score_mean', safe_mean([(ep_info['score'][0]-ep_info['score'][1]) for ep_info in ep_info_buf])) logger.logkv('mean_update_time', mean_time/len(mean_update_time)) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break return self
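# A NumPy sketch of the mask-based episode splitting done above for a single environment:
# nonzero entries of the done mask are treated as episode boundaries and rewards are
# summed per segment, mirroring the slicing in the code (whether masks mark episode
# starts or ends depends on the runner's convention). Values are illustrative.
import numpy as np

rewards = np.array([1.0, 0.0, 2.0, 1.0, 1.0])
dones = np.array([0, 0, 1, 0, 1])

episode_returns = []
last_d = 0
for d in np.nonzero(dones)[0]:
    episode_returns.append(rewards[last_d:d + 1].sum())
    last_d = d + 1

assert episode_returns == [3.0, 2.0]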
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO2"): with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) from packing.packing_policy import PackingPolicy if issubclass(self.policy, PackingPolicy): is_packing_env = True else: is_packing_env = False if not is_packing_env: runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam, is_packing_env=is_packing_env) self.episode_reward = np.zeros((self.n_envs,)) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch best_avg_eff_va = 0 best_avg_eff_tr = 0 for update in range(nupdates + 1): ########################################################################### # very temporary fix if update % 20 == 0: pack_file_name_va = [ "pack_va/" + str(i) + "_va" for i in range(0, 50)] pack_file_name_tr = [ "pack_tr/" + str(i) + "_tr" for i in range(0, 50)] avg_reward_va, avg_eff_va, per_succ_va, _ = self.evaluate(pack_file_name_va) avg_reward_tr, avg_eff_tr, per_succ_tr, _ = self.evaluate(pack_file_name_tr) log_path = "{}_{}".format(writer.get_logdir(), "log") with open(log_path, "a+") as log_file: log_file.write("Updata Number: {}\n".format(update)) log_file.write("Validation Average Reward: {}\n".format(avg_reward_va)) log_file.write("Validaition Average Efficiency: {}\n".format(avg_eff_va)) log_file.write("Validation Percentage of Success: {}\n\n".format(per_succ_va)) log_file.write("Training Average Reward: {}\n".format(avg_reward_tr)) log_file.write("Training Average Efficiency: {}\n".format(avg_eff_tr)) log_file.write("Training Percentage of Success: {}\n\n".format(per_succ_tr)) if avg_eff_va > best_avg_eff_va: print("Saving best model on validation ....") best_avg_eff_va = avg_eff_va self.save_weights("{}/model_va".format(writer.get_logdir())) if avg_eff_tr > best_avg_eff_tr: print("Saving best model on training ....") best_avg_eff_tr = avg_eff_tr self.save_weights("{}/model_tr".format(writer.get_logdir())) self.save_weights("{}/model_latest".format(writer.get_logdir())) ############################################################################ assert self.n_batch % self.nminibatches == 0 n_batch_train = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update / (nupdates + 1)) lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # for packing, we start a new env each update step # done so that the code could run smoothly # as otherwise there is memory/resource leakage if is_packing_env and (self.make_env is not None): while True: try: self.env.close() self.env = self.make_env() runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam, is_packing_env=is_packing_env) obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run() self.env.close() except: print("Unable to complete the run.") gc.collect() continue break else: # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run() ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, n_batch_train): timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // n_batch_train) end = start + n_batch_train mbinds = 
inds[start:end] _obs = obs[mbinds] slices = (arr[mbinds] for arr in (returns, masks, actions, values, neglogpacs)) mb_loss_vals.append(self._train_step(lr_now, cliprangenow, _obs, *slices, writer=writer, update=timestep)) else: # recurrent version assert self.n_envs % self.nminibatches == 0 envinds = np.arange(self.n_envs) flatinds = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps) envsperbatch = n_batch_train // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(envinds) for start in range(0, self.n_envs, envsperbatch): timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envsperbatch) end = start + envsperbatch mb_env_inds = envinds[start:end] mb_flat_inds = flatinds[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep, writer=writer, states=mb_states)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, update * (self.n_batch + 1)) if callback is not None: callback(locals(), globals()) if self.verbose >= 1 and ((update + 1) % log_interval//100 == 0 or update == 0): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", (update + 1) * self.n_steps) logger.logkv("nupdates", (update + 1)) logger.logkv("total_timesteps", (update + 1) * self.n_batch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() return self
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="SAC", print_freq=100, save_path=None): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) start_time = time.time() episode_rewards = [0.0] is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() else: obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 infos_values = [] mb_infos_vals = [] model_path = "--Path to model--/myNewModdel.h5" # model_path= None if model_path is not None: cfg = dk.load_config( config_path='--Path to config file inside mycar/config.py') kl = KerasLinear() kl.load(model_path) # vae = self.env.get_vae() self.training_started = False self.start_training = False for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if step < self.learning_starts and not self.training_started: if model_path is not None: try: img_arr = self.env.get_images() # print(img_arr[0].shape) img_arr = np.asarray(img_arr[0]) img_arr = normalize_and_crop(img_arr, cfg) croppedImgH = img_arr.shape[0] croppedImgW = img_arr.shape[1] if img_arr.shape[2] == 3 and cfg.IMAGE_DEPTH == 1: img_arr = dk.utils.rgb2gray(img_arr).reshape( croppedImgH, croppedImgW, 1) steering, throttle = kl.run(img_arr) action = [steering, throttle / 6.0] action = np.asarray(action) # rescaled_action = action * np.abs(self.action_space.low) rescaled_action = action print('Predicted action :', action) except Exception as e: print(e) action = self.env.action_space.sample() rescaled_action = action else: action = self.env.action_space.sample() rescaled_action = action print(action) # No need to rescale when sampling random action elif not self.training_started: self.start_training = True obs = self.env.reset() else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) ep_len += 1 if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: print("{} steps".format(ep_len)) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, step) if ep_len > self.train_freq: print("Additional training") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) done = True episode_rewards[-1] += reward if done or self.start_training: self.start_training = False if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() print("Episode finished. Reward: {:.2f} {} Steps".format( episode_rewards[-1], ep_len)) episode_rewards.append(0.0) ep_len = 0 mb_infos_vals = self.optimize(step, writer, current_lr) # Refresh obs when using TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] if is_teleop_env: self.env.is_training = False # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) plt.figure(1) plt.plot(episode_rewards) plt.title('Episode Rewards') plt.ylabel("Reward") plt.xlabel("Epoch") filename = "training" + str(random.random()) + ".png" plt.savefig(filename) plt.show() return self
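The teleop SAC variant above ends by dumping its episode-reward curve with matplotlib. A self-contained version of that plot helper is sketched below; the x-axis is labelled per episode (the original labels it "Epoch"), and the random-suffix filename only mirrors the original's naming scheme.

import random
import matplotlib
matplotlib.use("Agg")  # render off-screen; the original calls plt.show() instead
import matplotlib.pyplot as plt

def plot_episode_rewards(episode_rewards, out_prefix="training"):
    """Save the episode-reward curve to a PNG and return the filename."""
    plt.figure()
    plt.plot(episode_rewards)
    plt.title("Episode Rewards")
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    filename = "{}_{:.6f}.png".format(out_prefix, random.random())
    plt.savefig(filename)
    plt.close()
    return filename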
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) start_time = time.time() episode_rewards = [0.0] is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() info = {"cte": 0.0} else: obs = self.env.reset() info = {"cte": 0.0} self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 infos_values = [] mb_infos_vals = [] # ---------------------load the trained NN for safety signal tf_obs = tf.placeholder(tf.float32, obs.shape) hidden1 = tf.layers.dense(tf_obs, 64, tf.nn.relu) hidden2 = tf.layers.dense(hidden1, 16, tf.nn.relu) output = tf.layers.dense(hidden2, 2) sess = tf.Session() saver = tf.train.Saver() saver.restore(sess, "./saved_params/param02-level1-linear/safe_layer") # -------------------------------------------------------- fr = open("dump_reward.txt", "w") fv = open("dump_violation.txt", "w") fl = open("dump_lambda.txt", "w") cum_reward = [] num_vio = 0 for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. 
if step < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape # ---------- use trained NN to revise the action #print("-------------------------------") print("h1, rescaled_action ", rescaled_action) proposed_action = rescaled_action.copy() proposed_action = np.asarray(proposed_action).reshape((1, 2)) #print ("h2, proposed_action", proposed_action) print("obs shape: ", obs.shape) corr_v = sess.run(output, {tf_obs: obs}) lambda_v = (info["cte"] + np.dot(corr_v, proposed_action.T) - 1.3) / np.dot(corr_v, corr_v.T) #print (info["cte"], info["cte"] + np.asscalar(np.dot(corr_v, proposed_action.T)) ) print("lambda: ", lambda_v) if lambda_v < 0: lambda_v = 0.0 proposed_action -= lambda_v * corr_v proposed_action *= np.abs(self.action_space.low) #print ("h3 proposed_action: ", proposed_action) #print("h4 rescaled_action: ", rescaled_action) rescaled_action[0] = proposed_action[0][0] rescaled_action[1] = proposed_action[0][1] # ----------------------------------------- print("h5 rescaled_action: ", rescaled_action) new_obs, reward, done, new_info = self.env.step(rescaled_action) ep_len += 1 if (len(cum_reward) == 10): cum_reward.pop(0) cum_reward.append(reward) curr = 0.0 for i in range(len(cum_reward)): idx = len(cum_reward) - i - 1 curr += cum_reward[idx] * (0.99**i) fr.write("%f \n" %(curr)) fv.write("%d \n" %(num_vio)) fl.write("%f \n" %(lambda_v)) if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: print("{} steps".format(ep_len)) # Store transition in the replay buffer. self.replay_buffer.add(obs, rescaled_action, reward, new_obs, float(done)) obs = new_obs info = new_info # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, step) if ep_len > self.train_freq: print("Additional training") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) done = True episode_rewards[-1] += reward if done: num_vio += 1 if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() print("Episode finished. 
Reward: {:.2f} {} Steps".format(episode_rewards[-1], ep_len)) episode_rewards.append(0.0) ep_len = 0 mb_infos_vals = self.optimize(step, writer, current_lr) # Refresh obs when using TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] if is_teleop_env: self.env.is_training = False # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) fr.close() fv.close() fl.close() return self
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] trajectory = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() n_updates = 0 infos_values = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() for step in range(total_timesteps): # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, reward # Store transition in the replay buffer. 
self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done)) trajectory.append( Experience(obs_, action, new_obs_, reward_, float(done))) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) callback.on_rollout_start() episode_rewards[-1] += reward_ if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): self.full_buffer.add_trajectory(trajectory) trajectory = [] obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'eplenmean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) summary = tf.Summary(value=[ tf.Summary.Value(tag='episode_reward/success_rate', simple_value=np.mean( episode_successes[-100:])) ]) writer.add_summary(summary, self.num_timesteps) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] from pathlib import Path base_path = Path(self.tensorboard_log) src_path = str(base_path) dst_path = str(base_path.parent / '{}'.format(base_path.stem + '_copy')) import os os.system("cp -rf {} {}".format(str(base_path), dst_path)) base_path = Path(self.log_dir) / 
'progress.csv' copy_path = str(base_path.parent / '{}{}'.format( base_path.stem + '_copy', base_path.suffix)) import shutil shutil.copy(str(base_path), copy_path) if step % 100000 == 0: self.full_buffer.save(self.buffer_log + 'sub_task_{}.hdf5'.format(0)) callback.on_training_end() self.full_buffer.save(self.buffer_log + 'sub_task_{}.hdf5'.format(0)) return self
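The SAC variant above snapshots its TensorBoard run directory with os.system("cp -rf ...") and its progress.csv with shutil.copy each time it logs. The sketch below does the same backup purely with shutil/pathlib; the paths are placeholders, and copytree with dirs_exist_ok needs Python 3.8+.

import shutil
from pathlib import Path

def backup_logs(tensorboard_log: str, progress_csv: str) -> None:
    """Copy the TensorBoard run directory and the CSV log next to the originals."""
    tb = Path(tensorboard_log)
    shutil.copytree(tb, tb.parent / (tb.stem + "_copy"), dirs_exist_ok=True)
    csv = Path(progress_csv)
    shutil.copy(csv, csv.parent / (csv.stem + "_copy" + csv.suffix))

Using shutil keeps the backup portable and avoids spawning a shell for a simple copy.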
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): """ Just copied from the stable_baselines.ppo2 implementation. Goal is to change some parts of it later. """ # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() t_first_start = time.time() n_updates = total_timesteps // self.n_batch callback.on_training_start(locals(), globals()) for update in range(1, n_updates + 1): minibatch_size = cfg.minibatch_size # self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) callback.on_rollout_start() # try getting rollout 3 times tried_rollouts = 0 while tried_rollouts < 1: try: # true_reward is the reward without discount rollout = self.runner.run(callback) break except BrokenPipeError as bpe: raise BrokenPipeError(f'Catched Broken Pipe Error.') except Exception as ex: # tried_rollouts += 1 # obs, returns, masks, actions, values, neglogpacs, \ # states, ep_infos, true_reward = rollout # log(f'Rollout failed {tried_rollouts} times!', # [f'Catched exception: {ex}', # f'obs.shape: {obs.shape}', # f'ret.shape: {returns.shape}']) traceback.print_exc() # if isinstance(ex, BrokenPipeError): # # copy-pasted from the old blog here: # # http://newbebweb.blogspot.com/2012/02/python-head-ioerror-errno-32-broken.html # from signal import signal, SIGPIPE, SIG_DFL # signal(SIGPIPE, SIG_DFL) # print('Executing fix: Importing signal and disabling BrokenPipeError.') # for _ in range(10000): # print('', end='') # reset count once, rollout was successful tried_rollouts = 0 # Unpack if self.mirror_experiences: obs, returns, masks, actions, values, neglogpacs, \ states, ep_infos, true_reward = mirror_experiences(rollout, self) elif cfg.is_mod(cfg.MOD_EXP_REPLAY): obs, returns, masks, actions, values, neglogpacs, \ states, ep_infos, true_reward = self.exp_replay(rollout) else: obs, returns, masks, actions, values, neglogpacs, \ states, ep_infos, true_reward = rollout self.last_actions = actions if np.random.randint(low=1, high=20) == 7: log(f'Values and Returns of collected experiences: ', [ f'min returns:\t{np.min(returns)}', f'min values:\t\t{np.min(values)}', f'mean returns:\t{np.mean(returns)}', f'mean values:\t{np.mean(values)}', f'max returns:\t{np.max(returns)}', f'max values:\t\t{np.max(values)}' ]) if cfg.is_mod(cfg.MOD_REFS_REPLAY): # load ref experiences and treat them as real experiences obs, actions, returns, masks, values, neglogpacs = \ generate_experiences_from_refs(rollout, self.ref_obs, self.ref_acts) callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) mb_loss_vals = [] self.n_batch = obs.shape[0] self.nminibatches = self.n_batch / minibatch_size if self.n_batch % minibatch_size != 0: log("CAUTION!", [ 'Last minibatch might be too small!', f'Batch Size: \t{self.n_batch}', f'Minibatch Size:\t{minibatch_size}', f'Modulo: \t\t {self.n_batch % minibatch_size}' ]) if states is None: # nonrecurrent version update_fac = self.n_batch // 
self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) n_epochs = self.noptepochs for epoch_num in range(n_epochs): np.random.shuffle(inds) for start in range(0, self.n_batch, minibatch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // minibatch_size) end = start + minibatch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = minibatch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() callback.on_training_end() return self
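Both branches above reduce to the same pattern: shuffle the sample (or environment) indices once per optimization epoch and slice every rollout array with the same minibatch indices. A generic sketch of that iterator follows; it is an illustration of the pattern, not a helper from the original code.

import numpy as np

def iterate_minibatches(minibatch_size, n_epochs, *arrays, rng=None):
    """Yield aligned minibatch slices of `arrays`, reshuffling the indices once per epoch."""
    rng = rng or np.random.default_rng()
    n_samples = len(arrays[0])
    inds = np.arange(n_samples)
    for _ in range(n_epochs):
        rng.shuffle(inds)
        for start in range(0, n_samples, minibatch_size):
            mbinds = inds[start:start + minibatch_size]
            yield tuple(arr[mbinds] for arr in arrays)

# usage: for obs_mb, ret_mb, act_mb in iterate_minibatches(64, 4, obs, returns, actions): ...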
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="CLAC", reset_num_timesteps=True, randomization=0): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] learning_results = pd.DataFrame() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] reward_data = pd.DataFrame() for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if self.num_timesteps < self.learning_starts: if (isinstance(self.env.action_space, Discrete)): action = [] for _ in range(self.env.action_space.n): action.append(1 / self.env.action_space.n) rescaled_action = self.env.action_space.sample() else: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: if (isinstance(self.env.action_space, Discrete)): actions = list(range(self.env.action_space.n)) action = self.policy_tf.step( obs[None], deterministic=False).flatten() rescaled_action = np.random.choice(actions, 1, p=action)[0] else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs( self.action_space.low) if (not isinstance(self.env.action_space, Discrete)): assert action.shape == self.env.action_space.shape # If coinrunner environment # rescaled_action = np.array(rescaled_action, ndmin=1) new_obs, reward, done, info = self.env.step(rescaled_action) act_mu, act_std = self.policy_tf.proba_step(obs[None]) if (len(act_std) == 1): act_std = act_std[0] #print("ACT MU FROM PROBA STEP", act_mu) #print("ACT STD FROM PROBA STEP", act_std) if self.num_timesteps > self.learning_starts: # Only update marginal approximation after learning starts is completed if (self.multivariate_mean is None): self.multivariate_mean = act_mu else: previous_mean = self.multivariate_mean self.multivariate_mean = ( (1 - self.learning_rate_phi) * self.multivariate_mean) + (self.learning_rate_phi * act_mu) if (self.multivariate_cov is None): self.multivariate_cov = np.diag(act_std) else: cov = (self.learning_rate_phi * np.diag(act_std) + (1 - self.learning_rate_phi) * self.multivariate_cov) mom_1 = (self.learning_rate_phi * np.square(np.diag(act_mu))) + ( (1 - self.learning_rate_phi) * np.square(np.diag(previous_mean))) mom_2 = np.square((self.learning_rate_phi * np.diag(act_mu)) + (1 - self.learning_rate_phi) * np.diag(previous_mean)) self.multivariate_cov = cov + mom_1 - mom_2 # Update Beta parameter if coef_schedule is set if (self.coef_schedule is not None and self.mut_inf_coef > 1e-12): # (1 - a) B + a(1/L()) # Loss based update schdule, for later # Currently using linear schedule: self.mut_inf_coef *= (1 - self.coef_schedule) """if(self.num_timesteps % 1000 == 0): print("updated mut_inf_coef: ", self.mut_inf_coef, " at time step ", 
self.num_timesteps)""" # Store transition in the replay buffer. #print("adding action to replay buffer: ", action) self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper # info = info[0] maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: for mb_info_val in mb_infos_vals: for mb_info in mb_info_val: if mb_info is not None: infos_values.append(np.mean(mb_info)) #infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() if (randomization == 1): try: for env in self.env.unwrapped.envs: env.randomize() except: print( "Trying to randomize an environment that is not set up for randomization, check environment file" ) assert (False) if (randomization == 2): try: for env in self.env.unwrapped.envs: env.randomize_extreme() except: print( "Trying to extremely randomize an environment that is not set up for randomization, check environment file" ) assert (False) Model_String = "CLAC" if not self.auto_mut_inf_coef: Model_String = "CLAC " + str(self.mut_inf_coef) env_name = self.env.unwrapped.envs[0].spec.id mut_inf_coef = self.init_mut_inf_coef if (type(self.mut_inf_coef) == tf.Tensor or np.isnan(mut_inf_coef)): mut_inf_coef = "auto" Model_String = "CLAC" + str(mut_inf_coef) d = { 'Episode Reward': episode_rewards[-1], 'Coefficient': mut_inf_coef, 'Timestep': self.num_timesteps, 'Episode Number': len(episode_rewards) - 1, 'Env': env_name, 'Randomization': randomization, 'Model': "CLAC" } learning_results = learning_results.append( d, ignore_index=True) self.tf_logged_reward = episode_rewards[-1] episode_rewards.append(0.0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in 
zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return (self, learning_results)
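The CLAC loop above maintains a running estimate of the marginal action distribution with exponential moving averages (learning_rate_phi as the step size) plus a moment correction for the covariance. A simplified diagonal-Gaussian version of that update is sketched below; it keeps only the plain EMA terms and deliberately omits the original's mom_1 - mom_2 correction.

import numpy as np

def ema_action_marginal(mean, cov_diag, act_mu, act_std, alpha):
    """One EMA step of a diagonal-Gaussian approximation to the policy's action marginal."""
    act_mu = np.asarray(act_mu, dtype=np.float64)
    act_std = np.asarray(act_std, dtype=np.float64)
    if mean is None:  # first call: initialize from the current policy statistics
        return act_mu.copy(), act_std.copy()
    new_mean = (1.0 - alpha) * mean + alpha * act_mu
    new_cov_diag = (1.0 - alpha) * cov_diag + alpha * act_std
    return new_mean, new_cov_diag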
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() t_first_start = time.time() n_updates = total_timesteps // self.n_batch callback.on_training_start(locals(), globals()) for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0, ( "The number of minibatches (`nminibatches`) " "is not a factor of the total number of samples " "collected per rollout (`n_batch`), " "some samples won't be used.") batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) callback.on_rollout_start() # true_reward is the reward without discount rollout = self.runner.run(callback) # Unpack obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, 
returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() callback.on_training_end() return self
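Throughout these PPO2 variants the learning rate and clip range are annealed by calling a schedule with frac = 1 - (update - 1) / n_updates, i.e. the fraction of training still remaining. Below is a minimal stand-in for get_schedule_fn and that annealing loop, covering only constant and linear schedules.

def get_schedule_fn(value_or_fn):
    """Wrap a constant into a schedule; pass callables through unchanged (simplified stand-in)."""
    return value_or_fn if callable(value_or_fn) else (lambda frac: value_or_fn)

def linear_schedule(initial_value):
    """Decay linearly with the remaining-progress fraction handed in by the training loop."""
    return lambda frac: frac * initial_value

n_updates = 100
lr_schedule = get_schedule_fn(linear_schedule(2.5e-4))
for update in range(1, n_updates + 1):
    frac = 1.0 - (update - 1.0) / n_updates
    lr_now = lr_schedule(frac)  # 2.5e-4 at the first update, close to 0 at the last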
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] tra_obs = [] ep_count = 0 selected_goal = None tra_count = 0 for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) ################################################################# # fit density model and update goal proposing model skew_explore_obs = obs.copy() if isinstance(self.env, HERGoalEnvWrapper): skew_explore_obs_dict = self.env.convert_obs_to_dict( skew_explore_obs) skew_explore_obs = np.array( [skew_explore_obs_dict['observation']]) tra_obs.append(skew_explore_obs[0]) if selected_goal is None: selected_goal = np.array( skew_explore_obs_dict['desired_goal']) else: tra_obs.append(skew_explore_obs) self.skew_explore.update_history(skew_explore_obs, [done]) if (step % self.goal_update_frequency == 0 and step != 0) or step == 2000: logging.info('update buffer') self.skew_explore.activate_buffer() ################################################################# # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: self.plot_tra(tra_count, tra_obs, selected_goal) tra_obs = [] selected_goal = None if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() ep_count += 1 episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) tra_count += 1 self.save(self.args.save_path + '/model') if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
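The skew_explore bookkeeping above follows a two-phase pattern: push every (goal-stripped) observation into a history each step, then rebuild the goal-proposing buffer on a fixed schedule. The class and names below are hypothetical stand-ins sketched only to show that control flow, not the original implementation.

class ExplorationHistory:
    """Hypothetical stand-in for the skew_explore object used above."""
    def __init__(self):
        self._pending = []
        self._active = []
    def update_history(self, obs_batch, dones=None):
        self._pending.extend(list(obs_batch))
    def activate_buffer(self):
        # swap the accumulated observations into the buffer the goal sampler reads from
        self._active, self._pending = self._pending, []

history = ExplorationHistory()
goal_update_frequency = 1000  # illustrative value
for step_idx in range(3000):
    obs = [step_idx]  # placeholder observation
    history.update_history([obs], dones=[False])
    if (step_idx % goal_update_frequency == 0 and step_idx != 0) or step_idx == 2000:
        history.activate_buffer()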
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", eval_every_n=5, reset_num_timesteps=True, record_video=False, log_dir=""): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch for update in range(1, nupdates + 1): if update % eval_every_n == 1: print("[RAISIM_GYM] Visualizing in RaiSimOgre") obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = \ runner.run(test_mode=True, record_video=record_video, video_name=log_dir+"/"+str(update-1)+".mp4") print("Average rewards in this test episode ", ep_infos[0]['r']) # tensorboard_log(logger, ep_infos, self.sess) assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep)) self.num_timesteps += (self.n_batch * self.noptepochs ) // batch_size * update_fac else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, update=timestep, writer=writer, states=mb_states)) self.num_timesteps += (self.n_envs * self.noptepochs ) // envs_per_batch * update_fac loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, 
true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'ep_len_mean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break return self
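The RaiSim variant above interleaves a test-mode rollout (optionally recorded to video) every eval_every_n updates with the regular training rollout. The helper below sketches just that scheduling; run_fn is a hypothetical stand-in for runner.run, and the video naming follows the update-index convention used above.

def maybe_run_test_episode(update, run_fn, eval_every_n=5, record_video=False, log_dir="."):
    """Run an evaluation rollout on the first update of every eval_every_n-update block."""
    if update % eval_every_n != 1:
        return None
    video_name = "{}/{}.mp4".format(log_dir, update - 1)
    return run_fn(test_mode=True, record_video=record_video, video_name=video_name)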
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) start_time = time.time() episode_rewards = [0.0] is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() info = {"cte": 0.0} else: obs = self.env.reset() info = {"cte": 0.0} self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 infos_values = [] mb_infos_vals = [] # ---------------------load the trained NN for safety signal tf_obs = tf.placeholder(tf.float32, shape=(1, 104)) hidden1 = tf.layers.dense(tf_obs, 64, tf.nn.relu) hidden2 = tf.layers.dense(hidden1, 16, tf.nn.relu) output1 = tf.layers.dense(hidden2, 2) hidden3 = tf.layers.dense(tf_obs, 64, tf.nn.relu) hidden4 = tf.layers.dense(hidden3, 16, tf.nn.relu) output2 = tf.layers.dense(hidden4, 3) sess = tf.Session() saver = tf.train.Saver() saver.restore(sess, "./saved_params/param03-level1-quad/safe_layer") # -------------------------------------------------------- fr = open("dump_reward.txt", "w") fv = open("dump_violation.txt", "w") cum_reward = [] num_vio = 0 for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. 
if step < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape # ---------- use trained NN to revise the action if action[1] < 0: action[1] *= -1 print("h1, action ", action) proposed_action = action.copy() action_take = action.copy() proposed_action = np.asarray(proposed_action).reshape((1, 2)) #print ("h2, proposed_action", proposed_action) #print("obs shape", obs.shape) v1 = sess.run(output1, {tf_obs: obs.reshape((1, 104))}) v2 = sess.run(output2, {tf_obs: obs.reshape((1, 104))}) q = [v2[0][0], 0.5 * v2[0][1], 0.5 * v2[0][1], v2[0][2]] q = np.reshape(q, (2, 2)) x = cvx.Variable(1, 2) obj = cvx.sum_squares(x - proposed_action) cons = [info["cte"] + v1 * x.T + x * q * x.T <= 4.8, x[1] > 0] prob = cvx.Problem(cvx.Minimize(obj), cons) try: qcqp = QCQP(prob) qcqp.suggest(SDR) f_cd, v_cd = qcqp.improve(COORD_DESCENT) print( "Coordinate descent: objective %.3f, violation %.3f" % (f_cd, v_cd)) if v_cd == 0: new_action = x.value new_action = np.asarray(new_action).reshape((1, 2)) print("h5, action ", new_action) action_take[0] = new_action[0][0] action_take[1] = new_action[0][1] new_obs, reward, done, new_info = self.env.step( action_take) action = action_take else: new_obs, reward, done, new_info = self.env.step(action) except: new_obs, reward, done, new_info = self.env.step(action) # ----------------------------------------- ep_len += 1 if (len(cum_reward) == 10): cum_reward.pop(0) cum_reward.append(reward) curr = 0.0 for i in range(len(cum_reward)): idx = len(cum_reward) - i - 1 curr += cum_reward[idx] * (0.99**i) fr.write("%f \n" % (curr)) fv.write("%d \n" % (num_vio)) if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: print("{} steps".format(ep_len)) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs info = new_info # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, step) if ep_len > self.train_freq: print("Additional training") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) done = True episode_rewards[-1] += reward if done: num_vio += 1 if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() print("Episode finished. 
Reward: {:.2f} {} Steps".format( episode_rewards[-1], ep_len)) episode_rewards.append(0.0) ep_len = 0 mb_infos_vals = self.optimize(step, writer, current_lr) # Refresh obs when using TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] if is_teleop_env: self.env.is_training = False # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] obs = self.env.reset() self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] obs = self.env.reset() for i in range(128): action = self.env.action_space.sample() new_obs, reward, done, info = self.env.step(action) # print(new_obs) # self.env.render() self.iiayn.update_history([obs]) obs = new_obs for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if self.num_timesteps < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action #* np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) print(step, action) # self.env.render() self.iiayn.update_history([obs]) if step % 2048 == 0: self.iiayn.activate_buffer() # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done or step%1024 == 0: obs = self.env.reset() # if not isinstance(self.env, VecEnv): # obs = self.env.reset() episode_rewards.append(0.0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
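Before its main loop, the variant above seeds its intrinsic-motivation history (iiayn) with 128 uniformly random environment steps. A standalone sketch of that warm-up follows; env and history are hypothetical stand-ins, and unlike the original it also resets the environment when an episode ends mid-warm-up.

def warmup_history(env, history, n_steps=128):
    """Take random actions for n_steps, feeding each observation into the history model."""
    obs = env.reset()
    for _ in range(n_steps):
        action = env.action_space.sample()
        new_obs, reward, done, info = env.step(action)
        history.update_history([obs])
        obs = env.reset() if done else new_obs
    return obs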
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True, save_interval=None, save_path=None, gamma=0.99, n_steps=128): print('----------------------------------------------') print('| L E A R N |') print('----------------------------------------------') print("num timesteps = " + str(int(total_timesteps / 1000000)) + 'm') # print("num_envs = ", self.num_envs) print("save_interval = " + str(int(save_interval / 1000)) + 'k') print() save_interval_st = save_interval self.gamma = gamma self.n_steps = n_steps # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # 去掉参数 seed ? runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) hindsight_buffer = HindSightBuffer(self.n_steps, self.gamma, self.lam) self.episode_reward = np.zeros((self.n_envs, )) self.win_rate = np.zeros((self.n_envs, )) self.tie_rate = np.zeros((self.n_envs, )) self.loss_rate = np.zeros((self.n_envs, )) # ep_info_buf = deque(maxlen=100) t_first_start = time.time() n_updates = total_timesteps // self.n_batch # self.n_batch = self.n_envs(8) * self.n_steps(128) for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0 # self.nminibatches == 4 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, true_reward, \ win_rates, tie_rates, loss_rates, obs_nf = runner.run() self.num_timesteps += self.n_batch # ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] 
mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, 2 * self.n_steps)), masks.reshape((self.n_envs, 2 * self.n_steps)), writer, self.num_timesteps) self.win_rate = total_rate_logger( self.win_rate, win_rates.reshape((self.n_envs, self.n_steps)), masks[:5120].reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps, name='win_rate') self.tie_rate = total_rate_logger( self.tie_rate, tie_rates.reshape((self.n_envs, self.n_steps)), masks[:5120].reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps, name='tie_rate') self.loss_rate = total_rate_logger( self.loss_rate, loss_rates.reshape((self.n_envs, self.n_steps)), masks[:5120].reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps, name='loss_rate') if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) # if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: # logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) # logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # save interval if self.num_timesteps >= save_interval_st: save_interval_st += save_interval s_path = save_path + '_' + str( int(self.num_timesteps / 10000)) + 'k.zip' self.save(save_path=s_path) return self
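The non-recurrent branch above shuffles the flat sample indices on every optimisation epoch and slices all rollout arrays with the same index block. A small, self-contained sketch of that minibatching scheme follows; the array names are placeholders, not the model's real buffers.

import numpy as np

n_batch, nminibatches, noptepochs = 16, 4, 2
batch_size = n_batch // nminibatches               # samples per minibatch
obs = np.arange(n_batch * 3, dtype=np.float32).reshape(n_batch, 3)
returns = np.arange(n_batch, dtype=np.float32)

inds = np.arange(n_batch)
for epoch_num in range(noptepochs):
    np.random.shuffle(inds)                        # reshuffle every optimisation epoch
    for start in range(0, n_batch, batch_size):
        mbinds = inds[start:start + batch_size]
        mb_obs, mb_returns = obs[mbinds], returns[mbinds]   # stand-ins for the *slices above
        print(epoch_num, start, mb_returns)        # a real loop would run a train step here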
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO2_SH"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam, visualize=self.visualize, snapshot_details=self.snapshot_details) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch for update in range(nupdates + 1): assert self.n_batch % self.nminibatches == 0 n_batch_train = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update / (nupdates + 1)) lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, n_batch_train): timestep = ( (update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // n_batch_train) end = start + n_batch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep)) else: # recurrent version assert self.n_envs % self.nminibatches == 0 envinds = np.arange(self.n_envs) flatinds = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envsperbatch = n_batch_train // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(envinds) for start in range(0, self.n_envs, envsperbatch): timestep = ( (update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envsperbatch) end = start + envsperbatch mb_env_inds = envinds[start:end] mb_flat_inds = flatinds[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, update=timestep, writer=writer, states=mb_states)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, update * (self.n_batch + 1)) all_env_episode_rewards = calculate_total_episode_reward( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps))) average_episode_reward = safe_mean(all_env_episode_rewards) ep_info = {'r': average_episode_reward, 'l': np.nan} ep_info_buf.append(ep_info) if callback is not None: callback(locals(), globals()) if self.verbose >= 1 and ( (update + 1) % log_interval // 100 == 0 or update == 0): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", (update + 1) * self.n_steps) logger.logkv("nupdates", (update + 1)) logger.logkv("total_timesteps", (update + 1) * self.n_batch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] 
for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() return self
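The recurrent branch in the function above samples whole environments rather than individual timesteps, so each minibatch keeps contiguous per-environment sequences for the recurrent state. A minimal sketch of that index bookkeeping, with toy sizes:

import numpy as np

n_envs, n_steps, envs_per_batch = 4, 5, 2
flat_indices = np.arange(n_envs * n_steps).reshape(n_envs, n_steps)

env_indices = np.arange(n_envs)
np.random.shuffle(env_indices)                     # shuffle environments, not timesteps
for start in range(0, n_envs, envs_per_batch):
    mb_env_inds = env_indices[start:start + envs_per_batch]
    mb_flat_inds = flat_indices[mb_env_inds].ravel()   # each env's steps stay contiguous
    print(mb_env_inds, mb_flat_inds)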
def learn(self, total_timesteps, env, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True, save_file="default"): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = OverideRunner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch print("No of updates: {}".format(nupdates)) print("Total timesteps : {}".format(total_timesteps)) print("Batch size: {}".format(self.n_batch)) for update in range(1, nupdates + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates # frac = 1.0 lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep)) self.num_timesteps += (self.n_batch * self.noptepochs ) // batch_size * update_fac if (update * self.n_batch) % 8192 == 0: self.save(save_file + str(update * self.n_batch)) # plot_policy_and_value_fns(self, update * self.n_batch, save_file.split('ppo2_me')[0] + 'policy_plots/') # total_reward, success_episodes = self.test(env) # env.logger.log_scalar('test/success_episodes', success_episodes, update * self.n_batch) # env.logger.log_scalar('test/total_reward', total_reward, update * self.n_batch) # total_rewards.append(total_reward) # total_successes.append(success_episodes) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, update=timestep, writer=writer, states=mb_states)) self.num_timesteps += (self.n_envs * self.noptepochs ) 
// envs_per_batch * update_fac loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'ep_len_mean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break return self
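The logging above reports explained_variance(values, returns), i.e. 1 - Var(returns - values) / Var(returns): 1.0 means the value function predicts the returns exactly, 0.0 means it does no better than a constant. A sketch of that computation (not necessarily the library's exact implementation):

import numpy as np

def explained_variance(values: np.ndarray, returns: np.ndarray) -> float:
    # 1 - Var(returns - values) / Var(returns); nan when the returns have zero variance.
    var_returns = np.var(returns)
    if var_returns == 0:
        return float("nan")
    return float(1.0 - np.var(returns - values) / var_returns)

print(explained_variance(np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.5, 3.0])))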
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv("episode reward", episode_rewards[-2]) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
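The SAC loop above converts between the policy's tanh-squashed action range [-1, 1] and the environment's own bounds via scale_action / unscale_action. The helpers below only sketch that affine mapping: low and high stand in for self.action_space.low / high, and the signatures differ from the library's helpers.

import numpy as np

def scale_action(low, high, unscaled_action):
    # environment bounds [low, high] -> policy range [-1, 1]
    return 2.0 * (unscaled_action - low) / (high - low) - 1.0

def unscale_action(low, high, scaled_action):
    # policy range [-1, 1] -> environment bounds [low, high]
    return low + 0.5 * (scaled_action + 1.0) * (high - low)

low, high = np.array([-2.0, 0.0]), np.array([2.0, 1.0])
action = np.array([1.0, 0.25])
assert np.allclose(unscale_action(low, high, scale_action(low, high, action)), action)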
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) last_replay_update = 0 if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) if isinstance(self.train_freq, tuple): # TODO: bug with optuna please FIX self.train_freq = self.train_freq[0] self.gradient_steps = self.gradient_steps[0] with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() n_updates = 0 infos_values = [] self.active_sampling = False initial_step = self.num_timesteps episode_data = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() if self.buffer_is_prioritized and \ ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer") or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer")) \ and self.num_timesteps >= self.prioritization_starts: self._set_prioritized_buffer() if self.recurrent_policy: done = False policy_state = self.policy_tf_act.initial_state prev_policy_state = self.policy_tf_act.initial_state # Keep track of this so it doesnt have to be recalculated when saving it to replay buffer for step in range(initial_step, total_timesteps): # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: if self.recurrent_policy: action, policy_state = self.policy_tf_act.step(obs[None], state=policy_state, mask=np.array(done)[None]) action = action.flatten() else: action = self.policy_tf.step(obs[None]).flatten() # Add noise to the action, as the policy # is deterministic, this is required for exploration if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. 
if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() reward_ = self._vec_normalize_env.get_original_reward().squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, reward if self.reward_transformation is not None: reward = self.reward_transformation(reward) # Store transition in the replay buffer. extra_data = {} if self.time_aware: bootstrap = True if done: info_time_limit = info.get("TimeLimit.truncated", None) bootstrap = info.get("termination", None) == "steps" or \ (info_time_limit is not None and info_time_limit) extra_data["bootstrap"] = bootstrap if hasattr(self.policy, "collect_data"): if self.recurrent_policy: extra_data.update(self.policy_tf_act.collect_data(locals(), globals())) if self.policy_tf.save_target_state: extra_data.update({"target_" + state_name: self.target_policy_tf.initial_state[0, :] for state_name in (["state"] if self.target_policy_tf.share_lstm else ["pi_state", "qf1_state", "qf2_state"])}) else: extra_data.update(self.policy_tf.collect_data(locals(), globals())) self.replay_buffer.add(obs, action, reward, new_obs, done, **extra_data) # Extra data must be sent as kwargs to support separate bootstrap and done signals (needed for HER style algorithms) episode_data.append({"obs": obs, "action": action, "reward": reward, "obs_tp1": new_obs, "done": done, **extra_data}) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "RankPrioritizedReplayBuffer")\ or self.replay_buffer.__name__ == "RankPrioritizedReplayBuffer") and \ self.num_timesteps % self.buffer_size == 0: self.replay_buffer.rebalance() # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None and self.num_timesteps >= self.learning_starts: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - self.num_timesteps / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) # Note: the policy is updated less frequently than the Q functions # this is controlled by the `policy_delay` parameter step_writer = writer if grad_step % self.write_freq == 0 else None mb_infos_vals.append( self._train_step(step, step_writer, current_lr, (step + grad_step) % self.policy_delay == 0)) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) callback.on_rollout_start() episode_rewards[-1] += reward if self.recurrent_policy: prev_policy_state = policy_state if done: if isinstance(self.replay_buffer, DiscrepancyReplayBuffer) and n_updates - last_replay_update >= 5000: 
self.replay_buffer.update_priorities() last_replay_update = n_updates if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): if self.active_sampling: sample_obs, sample_state = self.env.get_random_initial_states(25) obs_discrepancies = self.policy_tf.get_q_discrepancy(sample_obs) obs = self.env.reset(**sample_state[np.argmax(obs_discrepancies)]) else: obs = self.env.reset() episode_data = [] episode_rewards.append(0.0) if self.recurrent_policy: prev_policy_state = self.policy_tf_act.initial_state maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 if self.buffer_is_prioritized and \ ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer") or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer"))\ and self.num_timesteps >= self.prioritization_starts: self._set_prioritized_buffer() # Display training infos if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] callback.on_training_end() return self
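All of the off-policy loops above rely on a replay buffer exposing add, can_sample and sample. The class below is a deliberately minimal sketch of that interface, assuming a plain FIFO buffer; it is not the library's buffer and ignores the prioritized and recurrent variants used above.

import random
from collections import deque

class SimpleReplayBuffer:
    # Plain FIFO ring buffer exposing the three calls used by the loops above.
    def __init__(self, size: int):
        self._storage = deque(maxlen=size)

    def add(self, obs, action, reward, next_obs, done, **extra_data):
        self._storage.append((obs, action, reward, next_obs, done, extra_data))

    def can_sample(self, batch_size: int) -> bool:
        return len(self._storage) >= batch_size

    def sample(self, batch_size: int):
        return random.sample(list(self._storage), batch_size)

buf = SimpleReplayBuffer(size=1000)
buf.add([0.0], [0.1], 1.0, [0.1], False)
print(buf.can_sample(1), buf.can_sample(2))   # True False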