def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C"):
    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
        self._setup_learn(seed)

        self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                schedule=self.lr_schedule)

        runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)
        self.episode_reward = np.zeros((self.n_envs,))

        t_start = time.time()
        for update in range(1, total_timesteps // self.n_batch + 1):
            # true_reward is the reward without discount
            obs, states, rewards, masks, actions, values, true_reward = runner.run()
            _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values,
                                                             update, writer)
            n_seconds = time.time() - t_start
            fps = int((update * self.n_batch) / n_seconds)

            if writer is not None:
                self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
                                                                  masks.reshape((self.n_envs, self.n_steps)),
                                                                  writer, update * (self.n_batch + 1))

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, rewards)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", update * self.n_batch)
                logger.record_tabular("fps", fps)
                logger.record_tabular("policy_entropy", float(policy_entropy))
                logger.record_tabular("value_loss", float(value_loss))
                logger.record_tabular("explained_variance", float(explained_var))
                logger.dump_tabular()

    return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C",
          reset_num_timesteps=True):
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)

        self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                schedule=self.lr_schedule)

        runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)
        self.episode_reward = np.zeros((self.n_envs,))
        # Training stats (when using Monitor wrapper)
        ep_info_buf = deque(maxlen=100)

        t_start = time.time()
        for update in range(1, total_timesteps // self.n_batch + 1):
            # true_reward is the reward without discount
            obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run()
            ep_info_buf.extend(ep_infos)
            _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values,
                                                             self.num_timesteps // self.n_batch, writer)
            n_seconds = time.time() - t_start
            fps = int((update * self.n_batch) / n_seconds)

            if writer is not None:
                self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
                                                                  masks.reshape((self.n_envs, self.n_steps)),
                                                                  writer, self.num_timesteps)

            self.num_timesteps += self.n_batch

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, rewards)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", self.num_timesteps)
                logger.record_tabular("fps", fps)
                logger.record_tabular("policy_entropy", float(policy_entropy))
                logger.record_tabular("value_loss", float(value_loss))
                logger.record_tabular("explained_variance", float(explained_var))
                if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                    logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.dump_tabular()

    return self
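# A minimal usage sketch for a learn() method of this shape, assuming the
# stable-baselines A2C class these methods come from; the environment id and
# hyperparameters below are illustrative, not taken from this file.
from stable_baselines import A2C

model = A2C("MlpPolicy", "CartPole-v1", verbose=1, tensorboard_log="./a2c_tb")
model.learn(total_timesteps=25000, log_interval=100, tb_log_name="A2C")
model.save("a2c_cartpole")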
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="SIL_A2C"):
    with SetVerbosity(self.verbose), \
            TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:  # type: tf.summary.FileWriter
        self._setup_learn(seed)
        self.save_directory = Path(writer.get_logdir())

        self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                schedule=self.lr_schedule)

        runner = SuccessorFeatureA2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)
        self.episode_reward = np.zeros((self.n_envs,))

        t_start = time.time()
        for update in range(1, total_timesteps // self.n_batch + 1):
            # true_reward is the reward without discount
            obs, states, rewards, masks, actions, values, true_reward, raw_rewards, features, reward_bonuses = \
                runner.run()
            _, value_loss, policy_entropy, sf_loss = self._train_step(obs, states, rewards, masks, actions,
                                                                      values, update, writer, features=features,
                                                                      rewards_bonuses=reward_bonuses)
            sil_loss, sil_adv, sil_samples, sil_nlogp = self._train_sil()
            n_seconds = time.time() - t_start
            fps = int((update * self.n_batch) / n_seconds)

            if writer is not None:
                self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                  raw_rewards.reshape((self.n_envs, self.n_steps)),
                                                                  masks.reshape((self.n_envs, self.n_steps)),
                                                                  writer, update * (self.n_batch + 1))
                summary = tf.Summary(value=[tf.Summary.Value(tag="episode_reward/best_reward",
                                                             simple_value=self.sil.get_best_reward())])
                writer.add_summary(summary, update * (self.n_batch + 1))

            if callback is not None:
                callback(locals(), globals())

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, rewards)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", update * self.n_batch)
                logger.record_tabular("fps", fps)
                logger.record_tabular("policy_entropy", float(policy_entropy))
                logger.record_tabular("value_loss", float(value_loss))
                logger.record_tabular('sf_loss', float(sf_loss))
                logger.record_tabular("explained_variance", float(explained_var))
                logger.record_tabular("best_episode_reward", float(self.sil.get_best_reward()))
                if self.sil_update > 0:
                    logger.record_tabular("sil_num_episodes", float(self.sil.num_episodes()))
                    logger.record_tabular("sil_valid_samples", float(sil_samples))
                    logger.record_tabular("sil_steps", float(self.sil.num_steps()))
                logger.dump_tabular()

    return self
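# A minimal sketch of the manual TF1 scalar summary written above for
# "episode_reward/best_reward"; the value and global step here are
# illustrative placeholders.
import tensorflow as tf

summary = tf.Summary(value=[tf.Summary.Value(tag="episode_reward/best_reward", simple_value=1.0)])
# writer.add_summary(summary, global_step)  # writer is a tf.summary.FileWriter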
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"):
    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
        self._setup_learn(seed)

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                initial_p=self.prioritized_replay_beta0,
                                                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                          initial_p=1.0,
                                          final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        obs = self.env.reset()
        reset = True
        self.episode_reward = np.zeros((1,))

        for step in range(total_timesteps):
            if callback is not None:
                callback(locals(), globals())
            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(step)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(step) +
                            self.exploration.value(step) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            with self.sess.as_default():
                action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = self.env.step(env_action)
            # Store transition in the replay buffer.
            self.replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done,
                                                                  writer, step)

            episode_rewards[-1] += rew
            if done:
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)
                reset = True

            if step > self.learning_starts and step % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if self.prioritized_replay:
                    experience = self.replay_buffer.sample(self.batch_size,
                                                           beta=self.beta_schedule.value(step))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + step) % 100 == 0:
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess,
                                                              options=run_options, run_metadata=run_metadata)
                        writer.add_run_metadata(run_metadata, 'step%d' % step)
                    else:
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess)
                    writer.add_summary(summary, step)
                else:
                    _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones,
                                                    weights, sess=self.sess)

                if self.prioritized_replay:
                    new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(batch_idxes, new_priorities)

            if step > self.learning_starts and step % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None \
                    and len(episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(step)))
                logger.dump_tabular()

    return self
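# A small sketch of the linear exploration schedule constructed above, assuming
# the stable-baselines LinearSchedule semantics: linear interpolation from
# initial_p to final_p over schedule_timesteps, clamped at final_p afterwards.
from stable_baselines.common.schedules import LinearSchedule

exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
for t in (0, 5000, 10000, 20000):
    print(t, exploration.value(t))  # 1.0, 0.51, 0.02, 0.02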
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO2_SH"):
    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
        self._setup_learn(seed)

        runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam,
                        visualize=self.visualize, snapshot_details=self.snapshot_details)
        self.episode_reward = np.zeros((self.n_envs,))

        ep_info_buf = deque(maxlen=100)
        t_first_start = time.time()

        nupdates = total_timesteps // self.n_batch
        for update in range(nupdates + 1):
            assert self.n_batch % self.nminibatches == 0
            n_batch_train = self.n_batch // self.nminibatches
            t_start = time.time()
            frac = 1.0 - (update / (nupdates + 1))
            lr_now = self.learning_rate(frac)
            cliprangenow = self.cliprange(frac)
            # true_reward is the reward without discount
            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
            ep_info_buf.extend(ep_infos)
            mb_loss_vals = []
            if states is None:  # nonrecurrent version
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, n_batch_train):
                        timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch +
                                     start) // n_batch_train)
                        end = start + n_batch_train
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
                                                             update=timestep))
            else:  # recurrent version
                assert self.n_envs % self.nminibatches == 0
                envinds = np.arange(self.n_envs)
                flatinds = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                envsperbatch = n_batch_train // self.n_steps
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(envinds)
                    for start in range(0, self.n_envs, envsperbatch):
                        timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs +
                                     start) // envsperbatch)
                        end = start + envsperbatch
                        mb_env_inds = envinds[start:end]
                        mb_flat_inds = flatinds[mb_env_inds].ravel()
                        slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_states = states[mb_env_inds]
                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep,
                                                             writer=writer, states=mb_states))

            loss_vals = np.mean(mb_loss_vals, axis=0)
            t_now = time.time()
            fps = int(self.n_batch / (t_now - t_start))

            if writer is not None:
                self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
                                                                  masks.reshape((self.n_envs, self.n_steps)),
                                                                  writer, update * (self.n_batch + 1))
                all_env_episode_rewards = calculate_total_episode_reward(
                    self.episode_reward,
                    true_reward.reshape((self.n_envs, self.n_steps)),
                    masks.reshape((self.n_envs, self.n_steps)))
                average_episode_reward = safe_mean(all_env_episode_rewards)
                ep_info = {'r': average_episode_reward, 'l': np.nan}
                ep_info_buf.append(ep_info)

            if callback is not None:
                callback(locals(), globals())

            if self.verbose >= 1 and ((update + 1) % log_interval == 0 or update == 0):
                explained_var = explained_variance(values, returns)
                logger.logkv("serial_timesteps", (update + 1) * self.n_steps)
                logger.logkv("nupdates", (update + 1))
                logger.logkv("total_timesteps", (update + 1) * self.n_batch)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(explained_var))
                logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

    return self
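# A minimal numpy sketch of the explained_variance statistic logged above,
# assuming the usual definition 1 - Var[y - y_pred] / Var[y]; this mirrors the
# common helper's behavior (nan when Var[y] is zero), not necessarily this
# repo's exact implementation.
import numpy as np

def explained_variance_sketch(y_pred, y_true):
    var_y = np.var(y_true)
    return np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y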
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="TRPO",
          reset_num_timesteps=True):
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)

        with self.sess.as_default():
            seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_batch,
                                             reward_giver=self.reward_giver, gail=self.using_gail)

            episodes_so_far = 0
            timesteps_so_far = 0
            iters_so_far = 0
            t_start = time.time()
            len_buffer = deque(maxlen=40)  # rolling buffer for episode lengths
            reward_buffer = deque(maxlen=40)  # rolling buffer for episode rewards
            self.episode_reward = np.zeros((self.n_envs,))

            true_reward_buffer = None
            if self.using_gail:
                true_reward_buffer = deque(maxlen=40)

                # Initialize dataloader
                batchsize = self.timesteps_per_batch // self.d_step
                self.expert_dataset.init_dataloader(batchsize)

                # Stats not used for now
                # TODO: replace with normal tb logging
                # g_loss_stats = Stats(loss_names)
                # d_loss_stats = Stats(reward_giver.loss_name)
                # ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"])

            while True:
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                if total_timesteps and timesteps_so_far >= total_timesteps:
                    break

                logger.log("********** Iteration %i ************" % iters_so_far)

                def fisher_vector_product(vec):
                    return self.allmean(self.compute_fvp(vec, *fvpargs, sess=self.sess)) + self.cg_damping * vec

                # ------------------ Update G ------------------
                logger.log("Optimizing Policy...")
                # g_step = 1 when not using GAIL
                mean_losses = None
                vpredbefore = None
                tdlamret = None
                observation = None
                action = None
                seg = None
                for k in range(self.g_step):
                    with self.timed("sampling"):
                        seg = seg_gen.__next__()
                    add_vtarg_and_adv(seg, self.gamma, self.lam)
                    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                    observation, action, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
                    vpredbefore = seg["vpred"]  # predicted value function before update
                    atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

                    # true_rew is the reward without discount
                    if writer is not None:
                        self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                          seg["true_rew"].reshape(
                                                                              (self.n_envs, -1)),
                                                                          seg["dones"].reshape(
                                                                              (self.n_envs, -1)),
                                                                          writer, self.num_timesteps)

                    args = seg["ob"], seg["ob"], seg["ac"], atarg
                    fvpargs = [arr[::5] for arr in args]

                    self.assign_old_eq_new(sess=self.sess)

                    with self.timed("computegrad"):
                        steps = self.num_timesteps + (k + 1) * (seg["total_timestep"] / self.g_step)
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata() if self.full_tensorboard_log else None
                        # run loss backprop with summary, and save the metadata (memory, compute time, ...)
                        if writer is not None:
                            summary, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret,
                                                                                  sess=self.sess,
                                                                                  options=run_options,
                                                                                  run_metadata=run_metadata)
                            if self.full_tensorboard_log:
                                writer.add_run_metadata(run_metadata, 'step%d' % steps)
                            writer.add_summary(summary, steps)
                        else:
                            _, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret, sess=self.sess,
                                                                            options=run_options,
                                                                            run_metadata=run_metadata)

                    lossbefore = self.allmean(np.array(lossbefore))
                    grad = self.allmean(grad)
                    if np.allclose(grad, 0):
                        logger.log("Got zero gradient. not updating")
                    else:
                        with self.timed("conjugate_gradient"):
                            stepdir = conjugate_gradient(fisher_vector_product, grad, cg_iters=self.cg_iters,
                                                         verbose=self.rank == 0 and self.verbose >= 1)
                        assert np.isfinite(stepdir).all()
                        shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                        # abs(shs) to avoid taking square root of negative values
                        lagrange_multiplier = np.sqrt(abs(shs) / self.max_kl)
                        # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                        fullstep = stepdir / lagrange_multiplier
                        expectedimprove = grad.dot(fullstep)
                        surrbefore = lossbefore[0]
                        stepsize = 1.0
                        thbefore = self.get_flat()
                        thnew = None
                        for _ in range(10):
                            thnew = thbefore + fullstep * stepsize
                            self.set_from_flat(thnew)
                            mean_losses = surr, kl_loss, *_ = self.allmean(
                                np.array(self.compute_losses(*args, sess=self.sess)))
                            improve = surr - surrbefore
                            logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                            if not np.isfinite(mean_losses).all():
                                logger.log("Got non-finite value of losses -- bad!")
                            elif kl_loss > self.max_kl * 1.5:
                                logger.log("violated KL constraint. shrinking step.")
                            elif improve < 0:
                                logger.log("surrogate didn't improve. shrinking step.")
                            else:
                                logger.log("Stepsize OK!")
                                break
                            stepsize *= .5
                        else:
                            logger.log("couldn't compute a good step")
                            self.set_from_flat(thbefore)
                        if self.nworkers > 1 and iters_so_far % 20 == 0:
                            # list of tuples
                            paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), self.vfadam.getflat().sum()))
                            assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

                    with self.timed("vf"):
                        for _ in range(self.vf_iters):
                            # NOTE: for recurrent policies, use shuffle=False?
                            for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                                     include_final_partial_batch=False,
                                                                     batch_size=128,
                                                                     shuffle=True):
                                grad = self.allmean(self.compute_vflossandgrad(mbob, mbob, mbret,
                                                                               sess=self.sess))
                                self.vfadam.update(grad, self.vf_stepsize)

                for (loss_name, loss_val) in zip(self.loss_names, mean_losses):
                    logger.record_tabular(loss_name, loss_val)

                logger.record_tabular("explained_variance_tdlam_before",
                                      explained_variance(vpredbefore, tdlamret))

                if self.using_gail:
                    # ------------------ Update D ------------------
                    logger.log("Optimizing Discriminator...")
                    logger.log(fmt_row(13, self.reward_giver.loss_name))
                    assert len(observation) == self.timesteps_per_batch
                    batch_size = self.timesteps_per_batch // self.d_step

                    # NOTE: uses only the last g step for observation
                    d_losses = []  # list of tuples, each of which gives the loss for a minibatch
                    # NOTE: for recurrent policies, use shuffle=False?
                    for ob_batch, ac_batch in dataset.iterbatches((observation, action),
                                                                  include_final_partial_batch=False,
                                                                  batch_size=batch_size,
                                                                  shuffle=True):
                        ob_expert, ac_expert = self.expert_dataset.get_next_batch()
                        # update running mean/std for reward_giver
                        if self.reward_giver.normalize:
                            self.reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))

                        # Reshape actions if needed when using discrete actions
                        if isinstance(self.action_space, gym.spaces.Discrete):
                            if len(ac_batch.shape) == 2:
                                ac_batch = ac_batch[:, 0]
                            if len(ac_expert.shape) == 2:
                                ac_expert = ac_expert[:, 0]
                        *newlosses, grad = self.reward_giver.lossandgrad(ob_batch, ac_batch,
                                                                         ob_expert, ac_expert)
                        self.d_adam.update(self.allmean(grad), self.d_stepsize)
                        d_losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

                    # lr: lengths and rewards
                    lr_local = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values
                    list_lr_pairs = MPI.COMM_WORLD.allgather(lr_local)  # list of tuples
                    lens, rews, true_rets = map(flatten_lists, zip(*list_lr_pairs))
                    true_reward_buffer.extend(true_rets)
                else:
                    # lr: lengths and rewards
                    lr_local = (seg["ep_lens"], seg["ep_rets"])  # local values
                    list_lr_pairs = MPI.COMM_WORLD.allgather(lr_local)  # list of tuples
                    lens, rews = map(flatten_lists, zip(*list_lr_pairs))
                len_buffer.extend(lens)
                reward_buffer.extend(rews)

                if len(len_buffer) > 0:
                    logger.record_tabular("EpLenMean", np.mean(len_buffer))
                    logger.record_tabular("EpRewMean", np.mean(reward_buffer))
                if self.using_gail:
                    logger.record_tabular("EpTrueRewMean", np.mean(true_reward_buffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                current_it_timesteps = MPI.COMM_WORLD.allreduce(seg["total_timestep"])
                timesteps_so_far += current_it_timesteps
                self.num_timesteps += current_it_timesteps
                iters_so_far += 1

                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                logger.record_tabular("TimeElapsed", time.time() - t_start)

                if self.verbose >= 1 and self.rank == 0:
                    logger.dump_tabular()

    return self
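# A compact numpy sketch of the conjugate-gradient solve that
# fisher_vector_product feeds above: it approximately solves F x = g using
# only Fisher-vector products, never forming F. Generic CG under the usual
# TRPO assumption that F is symmetric positive-definite; an illustrative
# stand-in, not this repo's conjugate_gradient helper.
import numpy as np

def conjugate_gradient_sketch(fvp, g, cg_iters=10, residual_tol=1e-10):
    x = np.zeros_like(g)
    residual = g.copy()            # residual of F x = g at x = 0
    direction = residual.copy()    # current search direction
    r_dot_r = residual.dot(residual)
    for _ in range(cg_iters):
        fvp_dir = fvp(direction)
        alpha = r_dot_r / direction.dot(fvp_dir)
        x += alpha * direction
        residual -= alpha * fvp_dir
        new_r_dot_r = residual.dot(residual)
        if new_r_dot_r < residual_tol:
            break
        direction = residual + (new_r_dot_r / r_dot_r) * direction
        r_dot_r = new_r_dot_r
    return x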
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2",
          eval_every_n=5, reset_num_timesteps=True, record_video=False, log_dir=""):
    # Transform to callable if needed
    self.learning_rate = get_schedule_fn(self.learning_rate)
    self.cliprange = get_schedule_fn(self.cliprange)

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)

        runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
        self.episode_reward = np.zeros((self.n_envs,))

        ep_info_buf = deque(maxlen=100)
        t_first_start = time.time()

        nupdates = total_timesteps // self.n_batch
        for update in range(1, nupdates + 1):
            if update % eval_every_n == 1:
                print("[RAISIM_GYM] Visualizing in RaiSimOgre")
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = \
                    runner.run(test_mode=True, record_video=record_video,
                               video_name=log_dir + "/" + str(update - 1) + ".mp4")
                print("Average rewards in this test episode ", ep_infos[0]['r'])
                # tensorboard_log(logger, ep_infos, self.sess)

            assert self.n_batch % self.nminibatches == 0
            batch_size = self.n_batch // self.nminibatches
            t_start = time.time()
            frac = 1.0 - (update - 1.0) / nupdates
            lr_now = self.learning_rate(frac)
            cliprangenow = self.cliprange(frac)
            # true_reward is the reward without discount
            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
            ep_info_buf.extend(ep_infos)
            mb_loss_vals = []
            if states is None:  # nonrecurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, batch_size):
                        timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch +
                                                                        epoch_num * self.n_batch + start) //
                                                                       batch_size)
                        end = start + batch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
                                                             update=timestep))
                self.num_timesteps += (self.n_batch * self.noptepochs) // batch_size * update_fac
            else:  # recurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                assert self.n_envs % self.nminibatches == 0
                env_indices = np.arange(self.n_envs)
                flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                envs_per_batch = batch_size // self.n_steps
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(env_indices)
                    for start in range(0, self.n_envs, envs_per_batch):
                        timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs +
                                                                        epoch_num * self.n_envs + start) //
                                                                       envs_per_batch)
                        end = start + envs_per_batch
                        mb_env_inds = env_indices[start:end]
                        mb_flat_inds = flat_indices[mb_env_inds].ravel()
                        slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_states = states[mb_env_inds]
                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep,
                                                             writer=writer, states=mb_states))
                self.num_timesteps += (self.n_envs * self.noptepochs) // envs_per_batch * update_fac

            loss_vals = np.mean(mb_loss_vals, axis=0)
            t_now = time.time()
            fps = int(self.n_batch / (t_now - t_start))

            if writer is not None:
                self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
                                                                  masks.reshape((self.n_envs, self.n_steps)),
                                                                  writer, self.num_timesteps)

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, returns)
                logger.logkv("serial_timesteps", update * self.n_steps)
                logger.logkv("nupdates", update)
                logger.logkv("total_timesteps", self.num_timesteps)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(explained_var))
                if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                    logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

    return self
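# A short sketch of what get_schedule_fn does to learning_rate and cliprange
# above: constants become functions of the remaining-progress fraction, while
# callables pass through unchanged. Mirrors the common stable-baselines
# helper's behavior; this is an illustrative stand-in.
def get_schedule_fn_sketch(value_schedule):
    if callable(value_schedule):
        return value_schedule
    constant = float(value_schedule)
    return lambda _fraction: constant

lr_fn = get_schedule_fn_sketch(2.5e-4)
print(lr_fn(1.0), lr_fn(0.1))  # same constant regardless of progress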
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN",
          reset_num_timesteps=True, replay_wrapper=None, save_interval=None, save_path=None):
    print('----------------------------------------------')
    print('|                L E A R N                   |')
    print('----------------------------------------------')
    print("num timesteps = " + str(int(total_timesteps / 1000)) + 'k')
    print("save_interval = " + str(int(save_interval / 1000)) + 'k')
    print()

    k = 10
    save_interval_st = save_interval

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn()

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                initial_p=self.prioritized_replay_beta0,
                                                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)  # upgrade the buffer (e.g. for HER)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                          initial_p=1.0,
                                          final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        episode_win_rates = [0.0]
        episode_successes = []
        obs, obs_nf = self.env.reset()
        reset = True
        self.episode_reward = np.zeros((1,))
        self.win_rate = np.zeros((1,))
        # print(obs_nf)

        # Exploration uses action pruning
        prev2s = [None, None]

        def input_formate(obs):
            return obs.transpose((1, 2, 0))

        for _ in tqdm(range(total_timesteps)):
            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            # tf.summary.scalar('update_eps', update_eps)
            with self.sess.as_default():
                # Never explore through the network (originally update_eps=update_eps)
                action = self.act(np.array(input_formate(obs))[None], update_eps=-1, **kwargs)[0]

            filter_action = random.randint(0, 5)
            if isinstance(obs_nf, tuple):
                obs_nf = obs_nf[0]
            filter_action = feature_utils.get_modify_act(obs_nf, filter_action, prev2s, nokick=True)
            filter_action = feature_utils.get_act_abs(obs_nf, filter_action, rang=8)

            # Estimate the distribution of filtered actions from 100 random draws
            fil_acts = []
            for _ in range(100):
                rand_act = random.randint(0, 5)
                fil_act = feature_utils.get_modify_act(obs_nf, rand_act, prev2s, nokick=True)
                fil_act = feature_utils.get_act_abs(obs_nf, fil_act, rang=8)
                fil_acts.append(fil_act)
            fil_acts = np.eye(65)[fil_acts]
            fil_acts = fil_acts.sum(axis=0)

            if random.random() < update_eps:
                action = filter_action
            env_action = action
            reset = False
            new_obs, rew, done, info, new_obs_nf = self.env.step(env_action)  # .ntc
            self.replay_buffer.add(input_formate(obs), action, rew, input_formate(new_obs), float(done), fil_acts)

            # HER-style relabeling: replay buffered transitions with goals drawn from future states
            self.temp_buffer.append((obs, action, rew, new_obs, float(done), fil_acts))
            if len(self.temp_buffer) >= self.temp_size:
                for t in range(self.temp_size):
                    s, a, r, s_n, d, fa = self.temp_buffer[t]
                    for k in range(self.k):
                        _s = copy.deepcopy(s)
                        _a = a
                        _r = copy.deepcopy(r)
                        _s_n = copy.deepcopy(s_n)
                        future = np.random.randint(t, self.temp_size)
                        s_f, _a_f, _, _, _, _ = self.temp_buffer[future]
                        g_map = s_f[-2]
                        _s[-1] = g_map
                        # check whether _s reaches the goal via action a,
                        # or stays in place (a == 64) when the goal is already reached
                        if (_s_n[-2] == g_map).all() or ((_s[-2] == _s[-1]).all() and _a_f == a == 64):
                            _r = _r + 0.01
                        self.replay_buffer.add(input_formate(_s), a, _r, input_formate(_s_n), d, fa)
                self.temp_buffer.clear()

            obs = new_obs
            obs_nf = new_obs_nf

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_win = np.array([info]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done,
                                                                  writer, self.num_timesteps)
                self.win_rate = total_rate_logger(self.win_rate, ep_win, ep_done, writer, self.num_timesteps,
                                                  name='win_rate')

            episode_rewards[-1] += rew
            episode_win_rates[-1] += info
            if done:
                maybe_is_success = (rew > 0)  # info.get('is_success')  # .ntc
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                if not isinstance(self.env, VecEnv):
                    obs, obs_nf = self.env.reset()
                episode_rewards.append(0.0)
                episode_win_rates.append(0.0)
                reset = True
                prev2s = [None, None]

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)
            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if self.prioritized_replay:
                    experience = self.replay_buffer.sample(self.batch_size,
                                                           beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones, filter_actions = self.replay_buffer.sample(
                        self.batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + self.num_timesteps) % 100 == 0:
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors, kl_errors = self._train_step(obses_t, actions, rewards, obses_tp1,
                                                                         obses_tp1, dones, weights,
                                                                         filter_actions, sess=self.sess,
                                                                         options=run_options,
                                                                         run_metadata=run_metadata)
                        writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps)
                    else:
                        summary, td_errors, kl_errors = self._train_step(obses_t, actions, rewards, obses_tp1,
                                                                         obses_tp1, dones, weights,
                                                                         filter_actions, sess=self.sess)
                    writer.add_summary(summary, self.num_timesteps)
                else:
                    _, td_errors, kl_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                               dones, weights, filter_actions, sess=self.sess)

                if self.prioritized_replay:
                    new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(batch_idxes, new_priorities)

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)
            if len(episode_win_rates[-101:-1]) == 0:
                mean_100ep_win_rate = -np.inf
            else:
                mean_100ep_win_rate = round(float(np.mean(episode_win_rates[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None \
                    and len(episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate", np.mean(episode_successes[-100:]))
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("mean 100 win rate", mean_100ep_win_rate)
                logger.record_tabular("% time spent exploring",
                                      int(100 * self.exploration.value(self.num_timesteps)))
                logger.dump_tabular()

            # save interval
            if self.num_timesteps >= save_interval_st:
                save_interval_st += save_interval
                s_path = save_path + '_' + str(int(self.num_timesteps / 1000)) + 'k.zip'
                self.save(save_path=s_path)

            self.num_timesteps += 1

    return self
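# A stripped-down sketch of the hindsight relabeling loop above: buffered
# transitions are re-stored with a goal taken from a future state of the same
# episode, plus a small reward bonus when the goal is reached. The field
# layout below (states as plain comparable values, a +0.01 bonus) is an
# illustrative assumption, not the exact Pommerman-specific check used above.
import random

def relabel_with_future_goals(episode, k=4, bonus=0.01):
    """episode: list of (state, action, reward, next_state, done) tuples."""
    relabeled = []
    for t, (s, a, r, s_next, d) in enumerate(episode):
        for _ in range(k):
            future = random.randint(t, len(episode) - 1)
            goal = episode[future][0]  # a future state becomes the new goal
            reached = (s_next == goal)
            relabeled.append((s, a, r + bonus if reached else r, s_next, d))
    return relabeled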
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR",
          reset_num_timesteps=True):
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)
        self.n_batch = self.n_envs * self.n_steps

        self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                schedule=self.lr_schedule)

        # FIFO queue of the q_runner thread is closed at the end of the learn function.
        # As a result, it needs to be redefined at every call
        with self.graph.as_default():
            with tf.variable_scope("kfac_apply", reuse=self.trained,
                                   custom_getter=tf_util.outer_scope_getter("kfac_apply")):
                # Some of the variables are not in a scope when they are created,
                # so we make a note of any previously uninitialized variables
                tf_vars = tf.global_variables()
                is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars])
                old_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f]

                self.train_op, self.q_runner = self.optim.apply_gradients(
                    list(zip(self.grads_check, self.params)))

                # then we check for new uninitialized variables and initialize them
                tf_vars = tf.global_variables()
                is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars])
                new_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized)
                                          if not f and v not in old_uninitialized_vars]

                if len(new_uninitialized_vars) != 0:
                    self.sess.run(tf.variables_initializer(new_uninitialized_vars))

        self.trained = True

        # Use GAE
        if self.gae_lambda is not None:
            runner = PPO2Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma,
                                lam=self.gae_lambda)
        else:
            runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)

        self.episode_reward = np.zeros((self.n_envs,))

        t_start = time.time()
        coord = tf.train.Coordinator()
        if self.q_runner is not None:
            enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True)
        else:
            enqueue_threads = []

        # Training stats (when using Monitor wrapper)
        ep_info_buf = deque(maxlen=100)

        for update in range(1, total_timesteps // self.n_batch + 1):
            # true_reward is the reward without discount
            if isinstance(runner, PPO2Runner):
                # We are using GAE
                obs, returns, masks, actions, values, _, states, ep_infos, true_reward = runner.run()
            else:
                obs, states, returns, masks, actions, values, ep_infos, true_reward = runner.run()

            ep_info_buf.extend(ep_infos)
            policy_loss, value_loss, policy_entropy = self._train_step(
                obs, states, returns, masks, actions, values,
                self.num_timesteps // (self.n_batch + 1), writer)
            n_seconds = time.time() - t_start
            fps = int((update * self.n_batch) / n_seconds)

            if writer is not None:
                self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
                                                                  masks.reshape((self.n_envs, self.n_steps)),
                                                                  writer, self.num_timesteps)

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, returns)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", self.num_timesteps)
                logger.record_tabular("fps", fps)
                logger.record_tabular("policy_entropy", float(policy_entropy))
                logger.record_tabular("policy_loss", float(policy_loss))
                logger.record_tabular("value_loss", float(value_loss))
                logger.record_tabular("explained_variance", float(explained_var))
                if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                    logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.dump_tabular()

            self.num_timesteps += self.n_batch + 1

        coord.request_stop()
        coord.join(enqueue_threads)

    return self
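# A one-line sketch of the safe_mean helper used for the episode stats above:
# it returns nan on empty input instead of triggering numpy's mean-of-empty
# warning (matching the common stable-baselines helper, assumed here).
import numpy as np

def safe_mean_sketch(arr):
    return np.nan if len(arr) == 0 else np.mean(arr)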
def learn(self, total_timesteps, env, callback=None, seed=None, log_interval=1, tb_log_name="PPO2",
          reset_num_timesteps=True, save_file="default"):
    # Transform to callable if needed
    self.learning_rate = get_schedule_fn(self.learning_rate)
    self.cliprange = get_schedule_fn(self.cliprange)

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)

        runner = OverideRunner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
        self.episode_reward = np.zeros((self.n_envs,))

        ep_info_buf = deque(maxlen=100)
        t_first_start = time.time()

        nupdates = total_timesteps // self.n_batch
        print("No of updates: {}".format(nupdates))
        print("Total timesteps : {}".format(total_timesteps))
        print("Batch size: {}".format(self.n_batch))
        for update in range(1, nupdates + 1):
            assert self.n_batch % self.nminibatches == 0
            batch_size = self.n_batch // self.nminibatches
            t_start = time.time()
            frac = 1.0 - (update - 1.0) / nupdates
            # frac = 1.0
            lr_now = self.learning_rate(frac)
            cliprangenow = self.cliprange(frac)
            # true_reward is the reward without discount
            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
            ep_info_buf.extend(ep_infos)
            mb_loss_vals = []
            if states is None:  # nonrecurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, batch_size):
                        timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch +
                                                                        epoch_num * self.n_batch + start) //
                                                                       batch_size)
                        end = start + batch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
                                                             update=timestep))
                self.num_timesteps += (self.n_batch * self.noptepochs) // batch_size * update_fac

                if (update * self.n_batch) % 8192 == 0:
                    self.save(save_file + str(update * self.n_batch))
                    # plot_policy_and_value_fns(self, update * self.n_batch,
                    #                           save_file.split('ppo2_me')[0] + 'policy_plots/')
                    # total_reward, success_episodes = self.test(env)
                    # env.logger.log_scalar('test/success_episodes', success_episodes, update * self.n_batch)
                    # env.logger.log_scalar('test/total_reward', total_reward, update * self.n_batch)
                    # total_rewards.append(total_reward)
                    # total_successes.append(success_episodes)
            else:  # recurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                assert self.n_envs % self.nminibatches == 0
                env_indices = np.arange(self.n_envs)
                flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                envs_per_batch = batch_size // self.n_steps
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(env_indices)
                    for start in range(0, self.n_envs, envs_per_batch):
                        timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs +
                                                                        epoch_num * self.n_envs + start) //
                                                                       envs_per_batch)
                        end = start + envs_per_batch
                        mb_env_inds = env_indices[start:end]
                        mb_flat_inds = flat_indices[mb_env_inds].ravel()
                        slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_states = states[mb_env_inds]
                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep,
                                                             writer=writer, states=mb_states))
                self.num_timesteps += (self.n_envs * self.noptepochs) // envs_per_batch * update_fac

            loss_vals = np.mean(mb_loss_vals, axis=0)
            t_now = time.time()
            fps = int(self.n_batch / (t_now - t_start))

            if writer is not None:
                self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
                                                                  masks.reshape((self.n_envs, self.n_steps)),
                                                                  writer, self.num_timesteps)

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, returns)
                logger.logkv("serial_timesteps", update * self.n_steps)
                logger.logkv("nupdates", update)
                logger.logkv("total_timesteps", self.num_timesteps)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(explained_var))
                if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                    logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

    return self
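# A tiny sketch of the minibatch slicing pattern used in the nonrecurrent
# branch above: flat indices are shuffled once per epoch, then consumed in
# contiguous windows of batch_size; the sizes are illustrative.
import numpy as np

n_batch, batch_size = 2048, 512
inds = np.arange(n_batch)
np.random.shuffle(inds)
minibatches = [inds[start:start + batch_size] for start in range(0, n_batch, batch_size)]
assert sum(len(mb) for mb in minibatches) == n_batch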
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2",
          reset_num_timesteps=True, save_interval=None, save_path=None, gamma=0.99, n_steps=128):
    print('----------------------------------------------')
    print('|                L E A R N                   |')
    print('----------------------------------------------')
    print("num timesteps = " + str(int(total_timesteps / 1000000)) + 'm')
    # print("num_envs = ", self.num_envs)
    print("save_interval = " + str(int(save_interval / 1000)) + 'k')
    print()

    save_interval_st = save_interval
    self.gamma = gamma
    self.n_steps = n_steps

    # Transform to callable if needed
    self.learning_rate = get_schedule_fn(self.learning_rate)
    self.cliprange = get_schedule_fn(self.cliprange)
    cliprange_vf = get_schedule_fn(self.cliprange_vf)

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn()  # seed argument removed?

        runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
        hindsight_buffer = HindSightBuffer(self.n_steps, self.gamma, self.lam)
        self.episode_reward = np.zeros((self.n_envs,))
        self.win_rate = np.zeros((self.n_envs,))
        self.tie_rate = np.zeros((self.n_envs,))
        self.loss_rate = np.zeros((self.n_envs,))

        # ep_info_buf = deque(maxlen=100)
        t_first_start = time.time()

        n_updates = total_timesteps // self.n_batch  # self.n_batch = self.n_envs (8) * self.n_steps (128)
        for update in range(1, n_updates + 1):
            assert self.n_batch % self.nminibatches == 0  # self.nminibatches == 4
            batch_size = self.n_batch // self.nminibatches
            t_start = time.time()
            frac = 1.0 - (update - 1.0) / n_updates
            lr_now = self.learning_rate(frac)
            cliprange_now = self.cliprange(frac)
            cliprange_vf_now = cliprange_vf(frac)
            # true_reward is the reward without discount
            obs, returns, masks, actions, values, neglogpacs, states, true_reward, \
                win_rates, tie_rates, loss_rates, obs_nf = runner.run()
            self.num_timesteps += self.n_batch
            # ep_info_buf.extend(ep_infos)
            mb_loss_vals = []
            if states is None:  # nonrecurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, batch_size):
                        timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch +
                                                                        epoch_num * self.n_batch + start) //
                                                                       batch_size)
                        end = start + batch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, writer=writer,
                                                             update=timestep, cliprange_vf=cliprange_vf_now))
            else:  # recurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                assert self.n_envs % self.nminibatches == 0
                env_indices = np.arange(self.n_envs)
                flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                envs_per_batch = batch_size // self.n_steps
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(env_indices)
                    for start in range(0, self.n_envs, envs_per_batch):
                        timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs +
                                                                        epoch_num * self.n_envs + start) //
                                                                       envs_per_batch)
                        end = start + envs_per_batch
                        mb_env_inds = env_indices[start:end]
                        mb_flat_inds = flat_indices[mb_env_inds].ravel()
                        slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_states = states[mb_env_inds]
                        mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, update=timestep,
                                                             writer=writer, states=mb_states,
                                                             cliprange_vf=cliprange_vf_now))

            loss_vals = np.mean(mb_loss_vals, axis=0)
            t_now = time.time()
            fps = int(self.n_batch / (t_now - t_start))

            if writer is not None:
                self.episode_reward = total_episode_reward_logger(
                    self.episode_reward,
                    true_reward.reshape((self.n_envs, 2 * self.n_steps)),
                    masks.reshape((self.n_envs, 2 * self.n_steps)),
                    writer, self.num_timesteps)
                self.win_rate = total_rate_logger(self.win_rate,
                                                  win_rates.reshape((self.n_envs, self.n_steps)),
                                                  masks[:5120].reshape((self.n_envs, self.n_steps)),
                                                  writer, self.num_timesteps, name='win_rate')
                self.tie_rate = total_rate_logger(self.tie_rate,
                                                  tie_rates.reshape((self.n_envs, self.n_steps)),
                                                  masks[:5120].reshape((self.n_envs, self.n_steps)),
                                                  writer, self.num_timesteps, name='tie_rate')
                self.loss_rate = total_rate_logger(self.loss_rate,
                                                   loss_rates.reshape((self.n_envs, self.n_steps)),
                                                   masks[:5120].reshape((self.n_envs, self.n_steps)),
                                                   writer, self.num_timesteps, name='loss_rate')

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, returns)
                logger.logkv("serial_timesteps", update * self.n_steps)
                logger.logkv("n_updates", update)
                logger.logkv("total_timesteps", self.num_timesteps)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(explained_var))
                # if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                #     logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                #     logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # save interval
            if self.num_timesteps >= save_interval_st:
                save_interval_st += save_interval
                s_path = save_path + '_' + str(int(self.num_timesteps / 1000)) + 'k.zip'
                self.save(save_path=s_path)

    return self
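# The annealing above reduces to a linear ramp on the remaining-progress
# fraction; a two-line sketch of the same arithmetic:
def remaining_fraction(update, n_updates):
    return 1.0 - (update - 1.0) / n_updates  # 1.0 on the first update, 1/n_updates on the last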
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100):
    with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
        self._setup_learn(seed)

        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)

        start_time = time.time()
        episode_rewards = [0.0]
        is_teleop_env = hasattr(self.env, "wait_for_teleop_reset")
        # TeleopEnv
        if is_teleop_env:
            print("Waiting for teleop")
            obs = self.env.wait_for_teleop_reset()
            info = {"cte": 0.0}
        else:
            obs = self.env.reset()
            info = {"cte": 0.0}

        self.episode_reward = np.zeros((1,))
        ep_info_buf = deque(maxlen=100)
        ep_len = 0
        self.n_updates = 0
        infos_values = []
        mb_infos_vals = []

        # --------------- load the trained NN for the safety signal ---------------
        tf_obs = tf.placeholder(tf.float32, shape=(1, 104))
        hidden1 = tf.layers.dense(tf_obs, 64, tf.nn.relu)
        hidden2 = tf.layers.dense(hidden1, 16, tf.nn.relu)
        output1 = tf.layers.dense(hidden2, 2)
        hidden3 = tf.layers.dense(tf_obs, 64, tf.nn.relu)
        hidden4 = tf.layers.dense(hidden3, 16, tf.nn.relu)
        output2 = tf.layers.dense(hidden4, 3)
        sess = tf.Session()
        saver = tf.train.Saver()
        saver.restore(sess, "./saved_params/param03-level1-quad/safe_layer")
        # --------------------------------------------------------------------------

        fr = open("dump_reward.txt", "w")
        fv = open("dump_violation.txt", "w")
        cum_reward = []
        num_vio = 0

        for step in range(total_timesteps):
            # Compute current learning_rate
            frac = 1.0 - step / total_timesteps
            current_lr = self.learning_rate(frac)

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # Before training starts, randomly sample actions
            # from a uniform distribution for better exploration.
            # Afterwards, use the learned policy.
            if step < self.learning_starts:
                action = self.env.action_space.sample()
                # No need to rescale when sampling random action
                rescaled_action = action
            else:
                action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                # Rescale from [-1, 1] to the correct bounds
                rescaled_action = action * np.abs(self.action_space.low)

            assert action.shape == self.env.action_space.shape

            # ---------- use the trained NN to revise the action ----------
            if action[1] < 0:
                action[1] *= -1
            print("h1, action ", action)
            proposed_action = action.copy()
            action_take = action.copy()
            proposed_action = np.asarray(proposed_action).reshape((1, 2))
            # print("h2, proposed_action", proposed_action)
            # print("obs shape", obs.shape)
            v1 = sess.run(output1, {tf_obs: obs.reshape((1, 104))})
            v2 = sess.run(output2, {tf_obs: obs.reshape((1, 104))})
            q = [v2[0][0], 0.5 * v2[0][1], 0.5 * v2[0][1], v2[0][2]]
            q = np.reshape(q, (2, 2))
            x = cvx.Variable(1, 2)
            obj = cvx.sum_squares(x - proposed_action)
            cons = [info["cte"] + v1 * x.T + x * q * x.T <= 4.8, x[1] > 0]
            prob = cvx.Problem(cvx.Minimize(obj), cons)
            try:
                qcqp = QCQP(prob)
                qcqp.suggest(SDR)
                f_cd, v_cd = qcqp.improve(COORD_DESCENT)
                print("Coordinate descent: objective %.3f, violation %.3f" % (f_cd, v_cd))
                if v_cd == 0:
                    new_action = x.value
                    new_action = np.asarray(new_action).reshape((1, 2))
                    print("h5, action ", new_action)
                    action_take[0] = new_action[0][0]
                    action_take[1] = new_action[0][1]
                    new_obs, reward, done, new_info = self.env.step(action_take)
                    action = action_take
                else:
                    new_obs, reward, done, new_info = self.env.step(action)
            except Exception:
                new_obs, reward, done, new_info = self.env.step(action)
            # --------------------------------------------------------------

            ep_len += 1
            if len(cum_reward) == 10:
                cum_reward.pop(0)
            cum_reward.append(reward)
            curr = 0.0
            for i in range(len(cum_reward)):
                idx = len(cum_reward) - i - 1
                curr += cum_reward[idx] * (0.99 ** i)
            fr.write("%f \n" % curr)
            fv.write("%d \n" % num_vio)

            if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0:
                print("{} steps".format(ep_len))

            # Store transition in the replay buffer.
            self.replay_buffer.add(obs, action, reward, new_obs, float(done))
            obs = new_obs
            info = new_info

            # Retrieve reward and episode length if using Monitor wrapper
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_info_buf.extend([maybe_ep_info])

            if writer is not None:
                # Write reward per episode to tensorboard
                ep_reward = np.array([reward]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done,
                                                                  writer, step)

            if ep_len > self.train_freq:
                print("Additional training")
                self.env.reset()
                mb_infos_vals = self.optimize(step, writer, current_lr)
                done = True

            episode_rewards[-1] += reward
            if done:
                num_vio += 1
                if not (isinstance(self.env, VecEnv) or is_teleop_env):
                    obs = self.env.reset()

                print("Episode finished. Reward: {:.2f} {} Steps".format(episode_rewards[-1], ep_len))
                episode_rewards.append(0.0)
                ep_len = 0
                mb_infos_vals = self.optimize(step, writer, current_lr)

                # Refresh obs when using TeleopEnv
                if is_teleop_env:
                    print("Waiting for teleop")
                    obs = self.env.wait_for_teleop_reset()

            # Log losses and entropy, useful for monitor training
            if len(mb_infos_vals) > 0:
                infos_values = np.mean(mb_infos_vals, axis=0)

            if len(episode_rewards[-101:-1]) == 0:
                mean_reward = -np.inf
            else:
                mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None \
                    and len(episode_rewards) % log_interval == 0:
                fps = int(step / (time.time() - start_time))
                logger.logkv("episodes", num_episodes)
                logger.logkv("mean 100 episode reward", mean_reward)
                logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.logkv("n_updates", self.n_updates)
                logger.logkv("current_lr", current_lr)
                logger.logkv("fps", fps)
                logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time))
                if len(infos_values) > 0:
                    for (name, val) in zip(self.infos_names, infos_values):
                        logger.logkv(name, val)
                logger.logkv("total timesteps", step)
                logger.dumpkvs()
                # Reset infos:
                infos_values = []

        if is_teleop_env:
            self.env.is_training = False
            # Use last batch
            print("Final optimization before saving")
            self.env.reset()
            mb_infos_vals = self.optimize(step, writer, current_lr)

    return self
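# A short sketch of the action rescaling used above: the squashed policy
# output in [-1, 1] is scaled by the magnitude of the action-space low bound,
# which assumes symmetric bounds (low == -high) -- exactly what the
# `action * np.abs(self.action_space.low)` line relies on.
import numpy as np

def rescale_action(action, action_space_low):
    return action * np.abs(action_space_low)

print(rescale_action(np.array([0.5, -1.0]), np.array([-2.0, -2.0])))  # [ 1. -2.]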
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] tra_obs = [] ep_count = 0 selected_goal = None tra_count = 0 for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) ################################################################# # fit density model and update goal proposing model skew_explore_obs = obs.copy() if isinstance(self.env, HERGoalEnvWrapper): skew_explore_obs_dict = self.env.convert_obs_to_dict( skew_explore_obs) skew_explore_obs = np.array( [skew_explore_obs_dict['observation']]) tra_obs.append(skew_explore_obs[0]) if selected_goal is None: selected_goal = np.array( skew_explore_obs_dict['desired_goal']) else: tra_obs.append(skew_explore_obs) self.skew_explore.update_history(skew_explore_obs, [done]) if (step % self.goal_update_frequency == 0 and step != 0) or step == 2000: logging.info('update buffer') self.skew_explore.activate_buffer() ################################################################# # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: self.plot_tra(tra_count, tra_obs, selected_goal) tra_obs = [] selected_goal = None if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() ep_count += 1 episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) tra_count += 1 self.save(self.args.save_path + '/model') if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
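# ----------------------------------------------------------------------------
# Sketch of the two-phase history buffer that the loop above drives through
# update_history()/activate_buffer(). This is a guess at a minimal interface,
# not the actual skew_explore implementation: observations accumulate in a
# staging list every step, and activate_buffer() freezes them into a snapshot
# for the density / goal-proposing model to refit on.
# ----------------------------------------------------------------------------
import numpy as np


class HistoryBuffer:
    def __init__(self):
        self._staging = []     # observations seen since the last activation
        self._active = None    # frozen snapshot used by the density model

    def update_history(self, obs_batch, dones):
        # Called every environment step with the latest observation(s)
        self._staging.extend(np.atleast_2d(obs_batch))

    def activate_buffer(self):
        # Called every goal_update_frequency steps: refitting on a stable
        # snapshot avoids chasing a moving target mid-rollout.
        self._active = np.asarray(self._staging)
        # e.g. self.density_model.fit(self._active) would go here
        return self._active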
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="CLAC", reset_num_timesteps=True, randomization=0): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] learning_results = pd.DataFrame() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] reward_data = pd.DataFrame() for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if self.num_timesteps < self.learning_starts: if (isinstance(self.env.action_space, Discrete)): action = [] for _ in range(self.env.action_space.n): action.append(1 / self.env.action_space.n) rescaled_action = self.env.action_space.sample() else: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: if (isinstance(self.env.action_space, Discrete)): actions = list(range(self.env.action_space.n)) action = self.policy_tf.step( obs[None], deterministic=False).flatten() rescaled_action = np.random.choice(actions, 1, p=action)[0] else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs( self.action_space.low) if (not isinstance(self.env.action_space, Discrete)): assert action.shape == self.env.action_space.shape # If coinrunner environment # rescaled_action = np.array(rescaled_action, ndmin=1) new_obs, reward, done, info = self.env.step(rescaled_action) act_mu, act_std = self.policy_tf.proba_step(obs[None]) if (len(act_std) == 1): act_std = act_std[0] #print("ACT MU FROM PROBA STEP", act_mu) #print("ACT STD FROM PROBA STEP", act_std) if self.num_timesteps > self.learning_starts: # Only update marginal approximation after learning starts is completed if (self.multivariate_mean is None): self.multivariate_mean = act_mu else: previous_mean = self.multivariate_mean self.multivariate_mean = ( (1 - self.learning_rate_phi) * self.multivariate_mean) + (self.learning_rate_phi * act_mu) if (self.multivariate_cov is None): self.multivariate_cov = np.diag(act_std) else: cov = (self.learning_rate_phi * np.diag(act_std) + (1 - self.learning_rate_phi) * self.multivariate_cov) mom_1 = (self.learning_rate_phi * np.square(np.diag(act_mu))) + ( (1 - self.learning_rate_phi) * np.square(np.diag(previous_mean))) mom_2 = np.square((self.learning_rate_phi * np.diag(act_mu)) + (1 - self.learning_rate_phi) * np.diag(previous_mean)) self.multivariate_cov = cov + mom_1 - mom_2 # Update Beta parameter if coef_schedule is set if (self.coef_schedule is not None and self.mut_inf_coef > 1e-12): # (1 - a) B + a(1/L()) # Loss based update schdule, for later # Currently using linear schedule: self.mut_inf_coef *= (1 - self.coef_schedule) """if(self.num_timesteps % 1000 == 0): print("updated mut_inf_coef: ", self.mut_inf_coef, " at time step ", 
self.num_timesteps)""" # Store transition in the replay buffer. #print("adding action to replay buffer: ", action) self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper # info = info[0] maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: for mb_info_val in mb_infos_vals: for mb_info in mb_info_val: if mb_info is not None: infos_values.append(np.mean(mb_info)) #infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() if (randomization == 1): try: for env in self.env.unwrapped.envs: env.randomize() except: print( "Trying to randomize an environment that is not set up for randomization, check environment file" ) assert (False) if (randomization == 2): try: for env in self.env.unwrapped.envs: env.randomize_extreme() except: print( "Trying to extremely randomize an environment that is not set up for randomization, check environment file" ) assert (False) Model_String = "CLAC" if not self.auto_mut_inf_coef: Model_String = "CLAC " + str(self.mut_inf_coef) env_name = self.env.unwrapped.envs[0].spec.id mut_inf_coef = self.init_mut_inf_coef if (type(self.mut_inf_coef) == tf.Tensor or np.isnan(mut_inf_coef)): mut_inf_coef = "auto" Model_String = "CLAC" + str(mut_inf_coef) d = { 'Episode Reward': episode_rewards[-1], 'Coefficient': mut_inf_coef, 'Timestep': self.num_timesteps, 'Episode Number': len(episode_rewards) - 1, 'Env': env_name, 'Randomization': randomization, 'Model': "CLAC" } learning_results = learning_results.append( d, ignore_index=True) self.tf_logged_reward = episode_rewards[-1] episode_rewards.append(0.0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in 
zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return (self, learning_results)
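# ----------------------------------------------------------------------------
# The CLAC learn() above maintains an exponential moving average of the
# marginal action distribution via np.diag() terms. The textbook version of
# that update is moment matching on a mixture of Gaussians; this standalone
# helper (our name, diagonal case) states it cleanly. It assumes `var` and
# `sigma2` are variances, whereas the code above feeds act_std directly.
# ----------------------------------------------------------------------------
def ema_gaussian_update(mean, var, mu, sigma2, alpha):
    """Moment-matched EMA of a diagonal Gaussian marginal.

    Treat the running marginal as the mixture (1-alpha)*N(mean, var) +
    alpha*N(mu, sigma2) and return the mean/variance of the matched Gaussian.
    """
    new_mean = (1.0 - alpha) * mean + alpha * mu
    second_moment = (1.0 - alpha) * (var + mean ** 2) + alpha * (sigma2 + mu ** 2)
    new_var = second_moment - new_mean ** 2
    return new_mean, new_var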
    def run(self, total_timesteps, callback=None, seed=None, log_interval=4,
            tb_log_name="CLAC", reset_num_timesteps=True, randomization=0):
        start_time = time.time()
        episode_rewards = [0.0]
        learning_results = pd.DataFrame()
        obs = self.env.reset()
        self.episode_reward = np.zeros((1,))
        ep_info_buf = deque(maxlen=100)
        n_updates = 0
        infos_values = []
        reward_data = pd.DataFrame()
        # No TensorboardWriter is opened in this evaluation-only loop
        # (the original referenced an undefined `writer`).
        writer = None

        for step in range(total_timesteps):
            if isinstance(self.env.action_space, Discrete):
                actions = list(range(self.env.action_space.n))
                action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                rescaled_action = np.random.choice(actions, 1, p=action)[0]
            else:
                action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                # Rescale from [-1, 1] to the correct bounds
                rescaled_action = action * np.abs(self.action_space.low)

            new_obs, reward, done, info = self.env.step(rescaled_action)
            act_mu, act_std = self.policy_tf.proba_step(obs[None])
            obs = new_obs

            # Retrieve reward and episode length if using Monitor wrapper
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_info_buf.extend([maybe_ep_info])

            if writer is not None:
                # Write reward per episode to tensorboard
                ep_reward = np.array([reward]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(
                    self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps)

            episode_rewards[-1] += reward
            if done:
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()

                if randomization == 1:
                    try:
                        for env in self.env.unwrapped.envs:
                            env.randomize()
                    except Exception:
                        raise RuntimeError(
                            "Trying to randomize an environment that is not set up "
                            "for randomization, check environment file")
                if randomization == 2:
                    try:
                        for env in self.env.unwrapped.envs:
                            env.randomize_extreme()
                    except Exception:
                        raise RuntimeError(
                            "Trying to extremely randomize an environment that is "
                            "not set up for randomization, check environment file")

                Model_String = "CLAC"
                if not self.auto_mut_inf_coef:
                    Model_String = "CLAC " + str(self.init_mut_inf_coef)
                env_name = self.env.unwrapped.envs[0].spec.id
                mut_inf_coef = self.init_mut_inf_coef
                if type(self.mut_inf_coef) == tf.Tensor or np.isnan(mut_inf_coef):
                    mut_inf_coef = "auto"
                    Model_String = "CLAC" + str(mut_inf_coef)

                d = {'Episode Reward': episode_rewards[-1],
                     'Coefficient': mut_inf_coef,
                     'Timestep': self.num_timesteps,
                     'Episode Number': len(episode_rewards) - 1,
                     'Env': env_name,
                     'Randomization': randomization,
                     'Model': "CLAC"}
                # DataFrame.append is deprecated; concatenate a one-row frame instead
                learning_results = pd.concat(
                    [learning_results, pd.DataFrame([d])], ignore_index=True)
                self.tf_logged_reward = episode_rewards[-1]

                episode_rewards.append(0.0)

        return self, learning_results
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) start_time = time.time() episode_rewards = [0.0] is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() else: obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 infos_values = [] mb_infos_vals = [] for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if step < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) ep_len += 1 if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: print("{} steps".format(ep_len)) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, step) if ep_len > self.train_freq: print("Additional training") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) done = True episode_rewards[-1] += reward if done: if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() print("Episode finished. 
Reward: {:.2f} {} Steps".format( episode_rewards[-1], ep_len)) episode_rewards.append(0.0) ep_len = 0 mb_infos_vals = self.optimize(step, writer, current_lr) # Refresh obs when using TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] if is_teleop_env: self.env.is_training = False # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) return self
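# ----------------------------------------------------------------------------
# The QCQP teleop variant above maintains `cum_reward`, a sliding window of
# the last 10 rewards, and writes their discounted sum to a file every step.
# A small helper makes the intent of that inline loop explicit;
# `discounted_window_sum` is our name for it.
# ----------------------------------------------------------------------------
def discounted_window_sum(rewards, gamma=0.99):
    """Discounted sum of a reward window, most recent reward undiscounted.

    Matches the inline loop: the last element gets gamma**0, the one before
    it gamma**1, and so on.
    """
    return sum(r * gamma ** i for i, r in enumerate(reversed(rewards)))


assert abs(discounted_window_sum([1.0, 1.0]) - (1.0 + 0.99)) < 1e-12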
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, initial_p=1.0): self.actions_weights = [] self.actions_container = [] new_tb_log = self._init_num_timesteps(reset_num_timesteps) cnt = 0 ds_rewards = [[0, 0]] with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=initial_p, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True ''' Hierarchical Step (Start) ''' obs, new_obs, rew, action, done, reset = self.hierarchical_step( obs, ds_rewards, cnt, kwargs, update_eps) ''' Hierarchical Step (End) ''' if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) 
if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: try: new_priorities = np.array([ abs(x) for x in td_errors.tolist() ]) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) except AssertionError: print(td_errors) if self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 return self, ds_rewards
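# ----------------------------------------------------------------------------
# The hierarchical-DQN loop above rebuilds priorities element by element and
# wraps update_priorities in a try/except. The vectorised equivalent is
# shorter and makes the invariant explicit: priorities must be strictly
# positive, which is why a small epsilon is added to |TD error|.
# ----------------------------------------------------------------------------
import numpy as np


def new_priorities_from_td(td_errors, eps=1e-6):
    td_errors = np.asarray(td_errors, dtype=np.float64)
    priorities = np.abs(td_errors) + eps
    assert np.all(priorities > 0), "priorities must be strictly positive"
    return priorities

# Usage: replay_buffer.update_priorities(batch_idxes, new_priorities_from_td(td_errors))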
def learn(self, total_timesteps, callback=None, seed=None, log_interval=None, tb_log_name="DDPG", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: # a list for tensorboard logging, to prevent logging with the same step number, if it already occured self.tb_seen_steps = [] # rank = MPI.COMM_WORLD.Get_rank() # we assume symmetric actions. assert np.all( np.abs(self.env.action_space.low) == self.env.action_space.high) if self.verbose >= 2: logger.log('Using agent with the following configuration:') logger.log(str(self.__dict__.items())) with self.sess.as_default(), self.graph.as_default(): # Prepare everything. self._reset() obs = self.env.reset() eval_obs = None if self.eval_env is not None: eval_obs = self.eval_env.reset() episode_rewards_deque = deque(maxlen=100) eval_episode_rewards_deque = deque(maxlen=100) self.episode_reward = np.zeros((1, )) episode_successes = [] episode_rewards_all = [] episode_steps_all = [] episode_reward = 0. episode_step = 0 total_steps = 0 step_since_eval = 0 total_episode_num = 0 start_time = time.time() while True: # Perform rollouts. qs_this_rollout_period = [] actions_this_rollout_period = [] while True: if total_steps >= total_timesteps: return self # Predict next action. if total_steps <= 10000: action = self.env.action_space.sample() q_value = 0 else: action, q_value = self._policy(obs, apply_noise=True, compute_q=True) assert action.shape == self.env.action_space.shape rescaled_action = action * np.abs( self.action_space.low) new_obs, reward, done, info = self.env.step( rescaled_action) if writer is not None: ep_rew = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) total_steps += 1 self.num_timesteps += 1 episode_reward += reward episode_step += 1 step_since_eval += 1 # Book-keeping. actions_this_rollout_period.append(action) qs_this_rollout_period.append(q_value) self._store_transition(obs, action, reward, new_obs, done) obs = new_obs if done: # Episode done. episode_rewards_all.append(episode_reward) episode_rewards_deque.append(episode_reward) episode_steps_all.append(episode_step) episode_reward = 0. episode_step = 0 total_episode_num += 1 maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append( float(maybe_is_success)) self._reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() break # Train. actor_losses_this_train_period = [] critic_losses_this_train_period = [] last_episode_step = int(episode_steps_all[-1]) for t_train in range(last_episode_step): # Not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size): break # weird equation to deal with the fact the nb_train_steps will be different # to nb_rollout_steps step = total_steps - last_episode_step + t_train critic_loss, actor_loss = self._train_step( step, writer, do_actor_update=t_train % 2 == 0) critic_losses_this_train_period.append(critic_loss) if actor_loss: actor_losses_this_train_period.append(actor_loss) self._update_target_net() # Evaluate. 
eval_episode_rewards = [] eval_qs = [] if self.eval_env is not None and step_since_eval >= self.eval_freq: step_since_eval %= self.eval_freq eval_episode_reward = 0. eval_episode = 0 while eval_episode < 10: eval_action, eval_q = self._policy( eval_obs, apply_noise=False, compute_q=True) eval_obs, eval_r, eval_done, _ = self.eval_env.step( eval_action * np.abs(self.action_space.low)) eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: if not isinstance(self.env, VecEnv): eval_obs = self.eval_env.reset() eval_episode_rewards.append( eval_episode_reward) eval_episode_rewards_deque.append( eval_episode_reward) eval_episode_reward = 0. eval_episode += 1 if callback is not None: # Only stop training if return value is False, not when it is None. # This is for backwards compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: return self # mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = self._get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = episode_rewards_all[-1] combined_stats['rollout/return_last_100'] = np.mean( episode_rewards_deque) combined_stats[ 'rollout/episode_steps'] = episode_steps_all[-1] combined_stats['debug/actions_mean'] = np.mean( actions_this_rollout_period) combined_stats['debug/actions_std'] = np.std( actions_this_rollout_period) combined_stats['debug/Q_mean'] = np.mean( qs_this_rollout_period) combined_stats['train/loss_actor'] = np.mean( actor_losses_this_train_period) combined_stats['train/loss_critic'] = np.mean( critic_losses_this_train_period) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float( total_steps) / float(duration) # Evaluation statistics. if self.eval_env is not None and eval_episode_rewards: combined_stats['eval/return'] = np.mean( eval_episode_rewards) combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_deque) combined_stats['eval/Q'] = np.mean(eval_qs) combined_stats['eval/episodes'] = len( eval_episode_rewards) def as_scalar(scalar): """ check and return the input if it is a scalar, otherwise raise ValueError :param scalar: (Any) the object to check :return: (Number) the scalar if x is a scalar """ if isinstance(scalar, np.ndarray): assert scalar.size == 1 return scalar[0] elif np.isscalar(scalar): return scalar else: raise ValueError('expected scalar, got %s' % scalar) # combined_stats_sums = MPI.COMM_WORLD.allreduce( # np.array([as_scalar(x) for x in combined_stats.values()])) # combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/episodes'] = total_episode_num combined_stats['total/steps'] = total_steps for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.dump_tabular() logger.info('') logdir = logger.get_dir()
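# ----------------------------------------------------------------------------
# Sketch of the evaluation rollout the DDPG loop above performs inline (10
# noise-free episodes against eval_env). `evaluate_policy_fn` is a
# hypothetical helper: `policy_fn(obs)` stands in for
# self._policy(..., apply_noise=False) and `low` for self.action_space.low
# used to rescale actions.
# ----------------------------------------------------------------------------
import numpy as np


def evaluate_policy_fn(policy_fn, eval_env, low, n_episodes=10):
    returns = []
    obs = eval_env.reset()
    episode_return = 0.0
    while len(returns) < n_episodes:
        action = policy_fn(obs)
        obs, reward, done, _ = eval_env.step(action * np.abs(low))
        episode_return += reward
        if done:
            returns.append(episode_return)
            episode_return = 0.0
            obs = eval_env.reset()
    return np.mean(returns), np.std(returns)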
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=1,
              tb_log_name="PPO2", reset_num_timesteps=True, vae=None):
        # making the learning rate and clip range callable here.
        self.writer2 = tf.summary.FileWriter('/tmp/ppo/PPO_test', graph=self.graph)
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps=reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env, model=self, n_steps=self.n_steps,
                            gamma=self.gamma, lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs,))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch
            for update in range(1, nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / nupdates
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices,
                                                                 writer=writer, update=timestep))
                    self.num_timesteps += (self.n_batch * self.noptepochs) // batch_size * update_fac

                # Optimize the VAE (when one is used to encode observations)
                if vae is not None:
                    time_start = time.time()
                    vae.optimize()
                    print("Time to optimize the VAE: ", time.time() - time_start)

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if update % 2 == 0:
                    self.env.testing = True
                    ob = np.zeros((self.env.num_envs,) + self.env.observation_space.shape)
                    ob[:] = self.env.reset()
                    total_reward_test = 0
                    for i in range(1000):
                        print("TESTING")
                        actions = self.step(ob)
                        ob[:], reward, _, _ = self.env.step(actions)
                        total_reward_test += reward
                    # simple_value must be a scalar, so sum over the vectorised envs
                    summary2 = tf.Summary(value=[tf.Summary.Value(
                        tag="episode_reward",
                        simple_value=float(np.sum(total_reward_test)))])
                    self.writer2.add_summary(summary2, self.num_timesteps)
                else:
                    self.env.testing = False

                if writer is not None and not self.env.testing:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)),
                        writer, self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

            return self
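# ----------------------------------------------------------------------------
# Minimal sketch of the periodic test rollout above: run the current policy in
# "testing" mode for a fixed number of steps and report the accumulated
# reward. `model.step` and `env` mirror the attributes used above; the
# accumulation sums the per-step reward across all vectorised envs.
# ----------------------------------------------------------------------------
import numpy as np


def test_rollout_return(model, env, n_steps=1000):
    obs = env.reset()
    total = 0.0
    for _ in range(n_steps):
        actions = model.step(obs)
        obs, reward, _, _ = env.step(actions)
        total += float(np.sum(reward))  # sum over vectorised envs
    return total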
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100,
              tb_log_name="A2C"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate,
                                                    n_values=total_timesteps,
                                                    schedule=self.lr_schedule)
            # Entropy coefficient starts large and is annealed to a small value
            self.ent_coef_schedule = LinearSchedule(schedule_timesteps=int(1e6),
                                                    initial_p=0.1, final_p=0.01)

            runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)
            self.episode_reward = np.zeros((self.n_envs,))

            t_start = time.time()
            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                obs, states, rewards, masks, actions, values, true_reward = runner.run()
                _, value_loss, policy_entropy = self._train_step(
                    update * self.n_batch, obs, states, rewards, masks, actions,
                    values, update, writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)),
                        writer, update * (self.n_batch + 1))

                if callback is not None:
                    # Only stop training if return value is False, not when it is None.
                    # This is for backwards compatibility with callbacks that have
                    # no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps", update * self.n_batch)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy", float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance", float(explained_var))
                    logger.dump_tabular()

        return self
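# ----------------------------------------------------------------------------
# Usage sketch for the entropy-coefficient schedule created above.
# LinearSchedule interpolates from initial_p to final_p over
# schedule_timesteps and clips past the end, so value() can be queried with
# the raw timestep count (e.g. update * n_batch).
# ----------------------------------------------------------------------------
from stable_baselines.common.schedules import LinearSchedule

ent_schedule = LinearSchedule(schedule_timesteps=int(1e6), initial_p=0.1, final_p=0.01)
for t in (0, 500000, 1000000, 2000000):
    # prints 0.1 at step 0, ~0.055 halfway, and 0.01 from 1e6 onwards
    print(t, ent_schedule.value(t))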
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO1"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \ "an instance of common.policies.ActorCriticPolicy." with self.sess.as_default(): self.adam.sync() # Prepare for rollouts seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 t_start = time.time() # rolling buffer for episode lengths lenbuffer = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer = deque(maxlen=100) self.episode_reward = np.zeros((self.n_envs, )) while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break if self.schedule == 'constant': cur_lrmult = 1.0 elif self.schedule == 'linear': cur_lrmult = max( 1.0 - float(timesteps_so_far) / total_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, self.gamma, self.lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) obs_ph, action_ph, atarg, tdlamret = seg["ob"], seg[ "ac"], seg["adv"], seg["tdlamret"] # true_rew is the reward without discount if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, seg["true_rew"].reshape( (self.n_envs, -1)), seg["dones"].reshape( (self.n_envs, -1)), writer, timesteps_so_far) # predicted value function before udpate vpredbefore = seg["vpred"] # standardized advantage function estimate atarg = (atarg - atarg.mean()) / atarg.std() dataset = Dataset( dict(ob=obs_ph, ac=action_ph, atarg=atarg, vtarg=tdlamret), shuffle=not issubclass(self.policy, LstmPolicy)) optim_batchsize = self.optim_batchsize or obs_ph.shape[0] # set old parameter values to new parameter values self.assign_old_eq_new(sess=self.sess) logger.log("Optimizing...") logger.log(fmt_row(13, self.loss_names)) # Here we do a bunch of optimization epochs over the data for k in range(self.optim_epochs): # list of tuples, each of which gives the loss for a minibatch losses = [] for i, batch in enumerate( dataset.iterate_once(optim_batchsize)): steps = ( timesteps_so_far + k * optim_batchsize + int(i * (optim_batchsize / len(dataset.data_map)))) if writer is not None: # run loss backprop with summary, but once every 10 runs save the metadata # (memory, compute time, ...) 
if (1 + k) % 10 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % steps) else: summary, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) writer.add_summary(summary, steps) else: _, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) self.adam.update(grad, self.optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in dataset.iterate_once(optim_batchsize): newlosses = self.compute_losses(batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) losses.append(newlosses) mean_losses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, mean_losses)) for (loss_val, name) in zipsame(mean_losses, self.loss_names): logger.record_tabular("loss_" + name, loss_val) logger.record_tabular( "ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # local values lrlocal = (seg["ep_lens"], seg["ep_rets"]) # list of tuples listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += MPI.COMM_WORLD.allreduce( seg["total_timestep"]) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - t_start) if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() return self
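# ----------------------------------------------------------------------------
# Sketch of what add_vtarg_and_adv computes on each segment above: Generalized
# Advantage Estimation. delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t)
# - V(s_t), advantages are the (gamma * lam)-discounted sum of deltas, and
# tdlamret = adv + vpred. This standalone version assumes a trailing
# bootstrap value `last_value` for the state after the segment.
# ----------------------------------------------------------------------------
import numpy as np


def compute_gae(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    n = len(rewards)
    adv = np.zeros(n, dtype=np.float64)
    last_gae = 0.0
    for t in reversed(range(n)):
        next_value = last_value if t == n - 1 else values[t + 1]
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        last_gae = delta + gamma * lam * nonterminal * last_gae
        adv[t] = last_gae
    return adv, adv + np.asarray(values)  # (advantages, tdlamret)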
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() t_first_start = time.time() n_updates = total_timesteps // self.n_batch callback.on_training_start(locals(), globals()) for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0, ( "The number of minibatches (`nminibatches`) " "is not a factor of the total number of samples " "collected per rollout (`n_batch`), " "some samples won't be used.") batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) callback.on_rollout_start() # true_reward is the reward without discount rollout = self.runner.run(callback) # Unpack obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, 
returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() # Save the model every 5000 time_step if self.num_timesteps % 5000 == 0: self.save(self.model_dir + "/step_{}".format(self.num_timesteps)) callback.on_training_end() return self
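# ----------------------------------------------------------------------------
# The checkpoint above fires only when num_timesteps is an exact multiple of
# 5000, but num_timesteps advances in jumps larger than 1 per update, so many
# multiples are never hit. A boundary-crossing check is the usual fix;
# `last_save` is our own bookkeeping variable, not an attribute of the model.
# ----------------------------------------------------------------------------
def should_save(num_timesteps, last_save, interval=5000):
    """True whenever training has crossed another `interval` boundary."""
    return num_timesteps - last_save >= interval

# Usage inside the update loop:
#   if should_save(self.num_timesteps, last_save):
#       self.save(self.model_dir + "/step_{}".format(self.num_timesteps))
#       last_save = self.num_timesteps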
def learn(self, total_timesteps, env_eval, callback=None, seed=None, path=None, dis_path=None, score_path=None, dis_eval_interval=100, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): self.eval_env = env_eval new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] dis_eval_array = [] # (total_step % eval_intervel) x 2 x n_batch self.ep_length = 0 for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() noise = np.zeros(self.noise_dim) else: noise = self.policy_tf.gen_noise(obs[None]).flatten() action = self.policy_tf.step(obs[None],noise[None] ,deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) self.ep_length += 1 # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done), noise) episode_rewards[-1] += reward reset_flag = done or self.ep_length >= self.max_ep_length if reset_flag: if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() episode_rewards.append(0.0) self.ep_length = 0 maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) else: obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr, dis_eval_array, dis_eval_interval, dis_path)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if self.num_timesteps % 2000 == 0: eval_ob = self.eval_env.reset() eval_epi_rewards = 0 eval_epis = 0 eval_performance = [] eval_ep_step = 0 while True: eval_noise = self.policy_tf.gen_noise(eval_ob[None]).flatten() eval_action = self.policy_tf.step(eval_ob[None], eval_noise[None], deterministic=True).flatten() eval_rescaled_action = eval_action * np.abs(self.action_space.low) eval_new_obs, eval_reward, eval_done, eval_info = self.eval_env.step(eval_rescaled_action) eval_epi_rewards += eval_reward eval_ob = eval_new_obs eval_ep_step += 1 if eval_done or eval_ep_step >= self.max_ep_length: eval_ob = self.eval_env.reset() eval_performance.append(eval_epi_rewards) eval_epi_rewards = 0 eval_epis += 1 eval_ep_step = 0 if eval_epis > 5: break with open(score_path, 'a') as f2: f2.write("%i %f\n" % (self.num_timesteps, np.mean(eval_performance))) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and reset_flag and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) with open(path,'a') as f1: f1.write("%f " % step) f1.write("%f " % mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) with open(path,'a') as f1: f1.write("%f " % safe_mean([ep_info['r'] for ep_info in ep_info_buf])) f1.write("%f " % safe_mean([ep_info['l'] for ep_info in 
ep_info_buf])) logger.logkv("n_updates", n_updates) with open(path,'a') as f1: f1.write("%f " % n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN",
          reset_num_timesteps=True, replay_wrapper=None):
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn()

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                initial_p=self.prioritized_replay_beta0,
                                                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                          initial_p=self.exploration_initial_eps,
                                          final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        episode_successes = []
        obs = self.env.reset()
        reset = True

        ############################################################
        # MODIFICATION:
        # Track the actions taken each episode. This is
        # intentionally a list rather than a set so that we can
        # use np.isin on it.
        action_list = list()
        ############################################################

        for _ in range(total_timesteps):
            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            with self.sess.as_default():
                ####################################################
                # MODIFICATION:
                # Renamed from the original `action`, since the
                # modified build_act function used to construct the
                # graph now returns an array of actions ranked by
                # Q-value instead of a single action.
                action_arr = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
                ####################################################
                # ORIGINAL:
                # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]

            ########################################################
            # MODIFICATION:
            # Pick the best action that has not yet been taken this
            # episode: np.isin marks already-taken actions True, and
            # np.argmin returns the index of the first False entry.
            action = action_arr[np.argmin(np.isin(action_arr, action_list))]
            # Add this action to the list.
            action_list.append(action)
            ########################################################

            env_action = action
            reset = False
            new_obs, rew, done, info = self.env.step(env_action)
            # Store transition in the replay buffer.
            self.replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps)

            episode_rewards[-1] += rew
            if done:
                ####################################################
                # MODIFICATION:
                # Clear the list of taken actions at episode end.
                action_list.clear()
                ####################################################
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)
                reset = True

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)
            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                # pytype:disable=bad-unpacking
                if self.prioritized_replay:
                    assert self.beta_schedule is not None, \
                        "BUG: should be LinearSchedule when self.prioritized_replay True"
                    experience = self.replay_buffer.sample(self.batch_size,
                                                           beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                # pytype:enable=bad-unpacking

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + self.num_timesteps) % 100 == 0:
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess,
                                                              options=run_options, run_metadata=run_metadata)
                        writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps)
                    else:
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess)
                    writer.add_summary(summary, self.num_timesteps)
                else:
                    _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones,
                                                    weights, sess=self.sess)

                if self.prioritized_replay:
                    new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                    assert isinstance(self.replay_buffer, PrioritizedReplayBuffer)
                    self.replay_buffer.update_priorities(batch_idxes, new_priorities)

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate", np.mean(episode_successes[-100:]))
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * self.exploration.value(self.num_timesteps)))
                logger.dump_tabular()

            self.num_timesteps += 1

    return self
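
# The modified action selection above hinges on np.isin + np.argmin:
# given actions ranked by Q-value, np.isin marks which have already
# been taken this episode, and np.argmin over that boolean array
# returns the index of the first False, i.e. the highest-ranked action
# not yet taken (falling back to index 0 when all have been taken).
# A standalone illustration of just that selection rule (names here
# are illustrative, not from the original code):
import numpy as np

def first_novel_action(ranked_actions, taken):
    """ranked_actions: 1-D array sorted best-first; taken: actions already used."""
    return ranked_actions[np.argmin(np.isin(ranked_actions, taken))]

# e.g. first_novel_action(np.array([3, 1, 0, 2]), [3, 1]) -> 0,
# and if every action was already taken, the best-ranked one is reused.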
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC",
          reset_num_timesteps=True):
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)

        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        # Initial learning rate
        current_lr = self.learning_rate(1)

        start_time = time.time()
        episode_rewards = [0.0]
        self.episode_reward = np.zeros((1,))
        ep_info_buf = deque(maxlen=100)
        n_updates = 0
        infos_values = []

        # Warm up the intrinsic-motivation module (iiayn) with
        # observations gathered under a uniform random policy.
        obs = self.env.reset()
        for _ in range(128):
            action = self.env.action_space.sample()
            new_obs, reward, done, info = self.env.step(action)
            self.iiayn.update_history([obs])
            obs = new_obs

        for step in range(total_timesteps):
            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # Before training starts, randomly sample actions
            # from a uniform distribution for better exploration.
            # Afterwards, use the learned policy.
            if self.num_timesteps < self.learning_starts:
                action = self.env.action_space.sample()
                # No need to rescale when sampling random action
                rescaled_action = action
            else:
                action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                # Rescaling from [-1, 1] to the environment bounds is
                # intentionally disabled here (the original code used
                # action * np.abs(self.action_space.low)).
                rescaled_action = action

            assert action.shape == self.env.action_space.shape

            new_obs, reward, done, info = self.env.step(rescaled_action)
            self.iiayn.update_history([obs])
            if step % 2048 == 0:
                self.iiayn.activate_buffer()

            # Store transition in the replay buffer.
            self.replay_buffer.add(obs, action, reward, new_obs, float(done))
            obs = new_obs

            # Retrieve reward and episode length if using Monitor wrapper
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_info_buf.extend([maybe_ep_info])

            if writer is not None:
                # Write reward per episode to tensorboard
                ep_reward = np.array([reward]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done,
                                                                  writer, self.num_timesteps)

            if step % self.train_freq == 0:
                mb_infos_vals = []
                # Update policy, critics and target networks
                for grad_step in range(self.gradient_steps):
                    if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts:
                        break
                    n_updates += 1
                    # Compute current learning_rate
                    frac = 1.0 - step / total_timesteps
                    current_lr = self.learning_rate(frac)
                    # Update policy and critics (q functions)
                    mb_infos_vals.append(self._train_step(step, writer, current_lr))
                    # Update target network
                    if (step + grad_step) % self.target_update_interval == 0:
                        self.sess.run(self.target_update_op)
                # Log losses and entropy, useful to monitor training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

            episode_rewards[-1] += reward
            # Episodes are also truncated every 1024 steps.
            if done or step % 1024 == 0:
                obs = self.env.reset()
                episode_rewards.append(0.0)

            if len(episode_rewards[-101:-1]) == 0:
                mean_reward = -np.inf
            else:
                mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            self.num_timesteps += 1
            # Display training infos
            if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                fps = int(step / (time.time() - start_time))
                logger.logkv("episodes", num_episodes)
                logger.logkv("mean 100 episode reward", mean_reward)
                logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.logkv("n_updates", n_updates)
                logger.logkv("current_lr", current_lr)
                logger.logkv("fps", fps)
                logger.logkv('time_elapsed', int(time.time() - start_time))
                if len(infos_values) > 0:
                    for (name, val) in zip(self.infos_names, infos_values):
                        logger.logkv(name, val)
                logger.logkv("total timesteps", self.num_timesteps)
                logger.dumpkvs()
                # Reset infos:
                infos_values = []
        return self
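
# The `iiayn` module above is only exercised through two calls:
# `update_history(obs_batch)` on every transition and
# `activate_buffer()` every 2048 steps. Its implementation is not part
# of this document; the class below is a minimal, hypothetical sketch
# of that interface, written only to make the calling convention
# concrete. The staging-then-activation behaviour is an assumption,
# not the author's code.
from collections import deque

class IIAYNHistory(object):
    def __init__(self, maxlen=100000):
        self._staging = []                    # observations since the last activation
        self._active = deque(maxlen=maxlen)   # history actually used by the module

    def update_history(self, obs_batch):
        # Stage a batch (here: a list) of observations.
        self._staging.extend(obs_batch)

    def activate_buffer(self):
        # Fold the staged observations into the active history
        # and clear the staging area.
        self._active.extend(self._staging)
        self._staging = []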
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACER"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) episode_stats = EpisodeStats(self.n_steps, self.n_envs) runner = _Runner(env=self.env, model=self, n_steps=self.n_steps) self.episode_reward = np.zeros((self.n_envs, )) if self.replay_ratio > 0: buffer = Buffer(env=self.env, n_steps=self.n_steps, size=self.buffer_size) else: buffer = None t_start = time.time() # n_batch samples, 1 on_policy call and multiple off-policy calls for steps in range(0, total_timesteps, self.n_batch): enc_obs, obs, actions, rewards, mus, dones, masks = runner.run( ) episode_stats.feed(rewards, dones) if buffer is not None: buffer.put(enc_obs, actions, rewards, mus, dones, masks) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, rewards.reshape((self.n_envs, self.n_steps)), dones.reshape((self.n_envs, self.n_steps)), writer, steps) # reshape stuff correctly obs = obs.reshape(runner.batch_ob_shape) actions = actions.reshape([runner.n_batch]) rewards = rewards.reshape([runner.n_batch]) mus = mus.reshape([runner.n_batch, runner.n_act]) dones = dones.reshape([runner.n_batch]) masks = masks.reshape([runner.batch_ob_shape[0]]) names_ops, values_ops = self._train_step( obs, actions, rewards, dones, mus, self.initial_state, masks, steps, writer) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) == False: break if self.verbose >= 1 and (int(steps / runner.n_batch) % log_interval == 0): logger.record_tabular("total_timesteps", steps) logger.record_tabular("fps", int(steps / (time.time() - t_start))) # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, # not just at the terminal state. Thus, this is mean until end of life, not end of episode. # For true episode rewards, see the monitor files in the log folder. logger.record_tabular("mean_episode_length", episode_stats.mean_length()) logger.record_tabular("mean_episode_reward", episode_stats.mean_reward()) for name, val in zip(names_ops, values_ops): logger.record_tabular(name, float(val)) logger.dump_tabular() if self.replay_ratio > 0 and buffer.has_atleast( self.replay_start): samples_number = np.random.poisson(self.replay_ratio) for _ in range(samples_number): # get obs, actions, rewards, mus, dones from buffer. obs, actions, rewards, mus, dones, masks = buffer.get() # reshape stuff correctly obs = obs.reshape(runner.batch_ob_shape) actions = actions.reshape([runner.n_batch]) rewards = rewards.reshape([runner.n_batch]) mus = mus.reshape([runner.n_batch, runner.n_act]) dones = dones.reshape([runner.n_batch]) masks = masks.reshape([runner.batch_ob_shape[0]]) self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks, steps) return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DDPG", \ reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # a list for tensorboard logging, to prevent logging with the same step number, if it already occured self.tb_seen_steps = [] rank = MPI.COMM_WORLD.Get_rank() # we assume symmetric actions. assert np.all( np.abs(self.env.action_space.low) == self.env.action_space.high) if self.verbose >= 2: logger.log('Using agent with the following configuration:') logger.log(str(self.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) self.episode_reward = np.zeros((1, )) episode_successes = [] with self.sess.as_default(), self.graph.as_default(): # Prepare everything. self._reset() obs = self.env.reset() eval_obs = None if self.eval_env is not None: eval_obs = self.eval_env.reset() episode_reward = 0. episode_step = 0 episodes = 0 step = 0 total_steps = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] eval_episode_rewards = [] eval_qs = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 epoch = 0 while True: for _ in range(log_interval): # Perform rollouts. for _ in range(self.nb_rollout_steps): if total_steps >= total_timesteps: return self # Predict next action. action, q_value = self._policy(obs, apply_noise=True, compute_q=True) assert action.shape == self.env.action_space.shape # Execute next action. if rank == 0 and self.render: self.env.render() # Randomly sample actions from a uniform distribution # with a probabilty self.random_exploration (used in HER + DDPG) if np.random.rand() < self.random_exploration: rescaled_action = action = self.action_space.sample( ) else: rescaled_action = action * np.abs( self.action_space.low) rescaled_action = np.where(action)[0][0] new_obs, reward, done, info = self.env.step( rescaled_action) if writer is not None: ep_rew = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) step += 1 total_steps += 1 self.num_timesteps += 1 if rank == 0 and self.render: self.env.render() episode_reward += reward episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q_value) self._store_transition(obs, action, reward, new_obs, done) obs = new_obs if callback is not None: # Only stop training if return value is False, not when it is None. # This is for backwards compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: return self if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append( float(maybe_is_success)) self._reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() # Train. 
epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(self.nb_train_steps): # Not enough samples in the replay buffer if not self.replay_buffer.can_sample( self.batch_size): break # Adapt param noise, if necessary. if len(self.replay_buffer) >= self.batch_size and \ t_train % self.param_noise_adaption_interval == 0: distance = self._adapt_param_noise() epoch_adaptive_distances.append(distance) # weird equation to deal with the fact the nb_train_steps will be different # to nb_rollout_steps step = (int(t_train * (self.nb_rollout_steps / self.nb_train_steps)) + self.num_timesteps - self.nb_rollout_steps) critic_loss, actor_loss = self._train_step( step, writer, log=t_train == 0) epoch_critic_losses.append(critic_loss) epoch_actor_losses.append(actor_loss) self._update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if self.eval_env is not None: eval_episode_reward = 0. for _ in range(self.nb_eval_steps): if total_steps >= total_timesteps: return self eval_action, eval_q = self._policy( eval_obs, apply_noise=False, compute_q=True) eval_obs, eval_r, eval_done, _ = self.eval_env.step( eval_action * np.abs(self.action_space.low)) if self.render_eval: self.eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: if not isinstance(self.env, VecEnv): eval_obs = self.eval_env.reset() eval_episode_rewards.append( eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = self._get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean( epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean( epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean( epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean( epoch_critic_losses) if len(epoch_adaptive_distances) != 0: combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float( step) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std( epoch_actions) # Evaluation statistics. if self.eval_env is not None: combined_stats['eval/return'] = np.mean( eval_episode_rewards) combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = np.mean(eval_qs) combined_stats['eval/episodes'] = len( eval_episode_rewards) def as_scalar(scalar): """ check and return the input if it is a scalar, otherwise raise ValueError :param scalar: (Any) the object to check :return: (Number) the scalar if x is a scalar """ if isinstance(scalar, np.ndarray): assert scalar.size == 1 return scalar[0] elif np.isscalar(scalar): return scalar else: raise ValueError('expected scalar, got %s' % scalar) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. 
combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = step for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(self.env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler: pickle.dump(self.env.get_state(), file_handler) if self.eval_env and hasattr(self.eval_env, 'get_state'): with open( os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler: pickle.dump(self.eval_env.get_state(), file_handler)
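
# The MPI block above averages per-worker statistics: every rank packs
# its scalar stats into an array, allreduce (default op: sum) adds the
# arrays element-wise across ranks, and dividing by the world size
# turns the sums into means. A minimal standalone sketch of that
# pattern using mpi4py (the function name is illustrative, not from
# the original code):
from mpi4py import MPI
import numpy as np

def average_stats_across_workers(stats):
    """stats: dict mapping stat name -> scalar value on this rank."""
    comm = MPI.COMM_WORLD
    local = np.array([float(v) for v in stats.values()])
    summed = comm.allreduce(local)  # element-wise sum over all ranks
    return {k: v / comm.Get_size() for k, v in zip(stats.keys(), summed)}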
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2",
          reset_num_timesteps=True):
    # Transform to callable if needed
    self.learning_rate = get_schedule_fn(self.learning_rate)
    self.cliprange = get_schedule_fn(self.cliprange)

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)

        runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
        self.episode_reward = np.zeros((self.n_envs,))

        ep_info_buf = deque(maxlen=100)
        t_first_start = time.time()

        # define the victim_model
        if self.use_explanation:
            if self.pretrained_mimic:
                exp_test = GradientExp(self.mimic_model)
            else:
                exp_test = None
        else:
            exp_test = None

        nupdates = total_timesteps // self.n_batch
        obs_list = []
        act_list = []

        for update in range(1, nupdates + 1):
            assert self.n_batch % self.nminibatches == 0
            batch_size = self.n_batch // self.nminibatches
            t_start = time.time()
            frac = 1.0 - (update - 1.0) / nupdates
            lr_now = self.learning_rate(frac)
            cliprangenow = self.cliprange(frac)
            # true_reward is the reward without discount
            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward, \
                obs_oppo, actions_oppo, o_next, o_opp_next, a_opp_next = runner.run()
            obs_opp_ph = obs_oppo
            action_oppo_ph = actions_oppo
            if update % 100 == 0 and self.save_victim_traj:
                obs_list.append(obs_oppo)
                act_list.append(actions_oppo)

            # todo: calculate the attention paid on the opponent
            attention = self.calculate_attention(obs_oppo=obs_opp_ph, action_oppo=action_oppo_ph,
                                                 exp_test=exp_test, black_box_att=self.black_box_att,
                                                 exp_method=self.exp_method)
            is_stochastic = False
            ep_info_buf.extend(ep_infos)
            mb_loss_vals = []
            if states is None:  # nonrecurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, batch_size):
                        timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch +
                                                                        epoch_num * self.n_batch + start) //
                                                                       batch_size)
                        end = start + batch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        slices_hua = (arr[mbinds] for arr in (a_opp_next, o_opp_next, attention))
                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, *slices_hua,
                                                             is_stochastic=is_stochastic, ratio=self.mix_ratio,
                                                             writer=writer, update=timestep))
                self.num_timesteps += (self.n_batch * self.noptepochs) // batch_size * update_fac
            else:  # recurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                assert self.n_envs % self.nminibatches == 0
                env_indices = np.arange(self.n_envs)
                flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                envs_per_batch = batch_size // self.n_steps
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(env_indices)
                    for start in range(0, self.n_envs, envs_per_batch):
                        timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs +
                                                                        epoch_num * self.n_envs + start) //
                                                                       envs_per_batch)
                        end = start + envs_per_batch
                        mb_env_inds = env_indices[start:end]
                        mb_flat_inds = flat_indices[mb_env_inds].ravel()
                        slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        # NOTE: the original included the boolean `is_stochastic` inside this
                        # tuple, which would fail on indexing; it is passed as a keyword
                        # instead, mirroring the nonrecurrent branch.
                        slices_hua = (arr[mb_flat_inds] for arr in (a_opp_next, o_opp_next, attention))
                        mb_states = states[mb_env_inds]
                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, *slices_hua,
                                                             is_stochastic=is_stochastic, ratio=self.mix_ratio,
                                                             update=timestep, writer=writer, states=mb_states))
                self.num_timesteps += (self.n_envs * self.noptepochs) // envs_per_batch * update_fac

            loss_vals = np.mean(mb_loss_vals, axis=0)
            t_now = time.time()
            fps = int(self.n_batch / (t_now - t_start))

            if writer is not None:
                self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
                                                                  masks.reshape((self.n_envs, self.n_steps)),
                                                                  writer, self.num_timesteps)

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, returns)
                logger.logkv("serial_timesteps", update * self.n_steps)
                logger.logkv("nupdates", update)
                logger.logkv("total_timesteps", self.num_timesteps)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(explained_var))
                # print the attention weights
                if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                    logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # Periodically checkpoint the model. (The original duplicated this
            # save in identical if/else branches on self.black_box_att.)
            if update % 1000 == 0:
                model_file_name = "{0}agent_{1}.pkl".format(self.model_saved_loc, update * self.n_batch)
                print("Model saved at: {}".format(model_file_name))
                self.save(model_file_name)

        # Dump the collected victim trajectories, if any were recorded.
        if self.save_victim_traj and obs_list:
            obs_numpy = np.vstack(obs_list)
            act_numpy = np.vstack(act_list)
            with open('../saved/trajectory.pkl', 'ab+') as f:
                pkl.dump([obs_numpy, act_numpy], f, protocol=2)

        return self
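
# In the recurrent branch above, minibatches must keep whole
# environments together so that LSTM states stay aligned with their
# trajectories: environments (not individual samples) are shuffled, and
# flat_indices maps each chosen environment row to its contiguous block
# of n_steps samples in the flattened rollout. A small numpy
# illustration (shapes chosen arbitrarily for the example):
import numpy as np

n_envs, n_steps = 4, 3
flat_indices = np.arange(n_envs * n_steps).reshape(n_envs, n_steps)
mb_env_inds = np.array([2, 0])               # a shuffled pick of 2 envs
mb_flat_inds = flat_indices[mb_env_inds].ravel()
# -> array([6, 7, 8, 0, 1, 2]): env 2's steps, then env 0's steps, each
# kept contiguous so recurrent state matches its own trajectory.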
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC",
          reset_num_timesteps=True, replay_wrapper=None):
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    if replay_wrapper is not None:
        self.replay_buffer = replay_wrapper(self.replay_buffer)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn()

        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        # Initial learning rate
        current_lr = self.learning_rate(1)

        start_time = time.time()
        episode_rewards = [0.0]
        episode_successes = []
        if self.action_noise is not None:
            self.action_noise.reset()
        obs = self.env.reset()
        self.episode_reward = np.zeros((1,))
        ep_info_buf = deque(maxlen=100)
        n_updates = 0
        infos_values = []

        for step in range(total_timesteps):
            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # Before training starts, randomly sample actions
            # from a uniform distribution for better exploration.
            # Afterwards, use the learned policy
            # if random_exploration is set to 0 (normal setting)
            if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
                # actions sampled from action space are from range specific to the environment
                # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                unscaled_action = self.env.action_space.sample()
                action = scale_action(self.action_space, unscaled_action)
            else:
                action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                # Add noise to the action (improve exploration,
                # not needed in general)
                if self.action_noise is not None:
                    action = np.clip(action + self.action_noise(), -1, 1)
                # inferred actions need to be transformed to environment action_space before stepping
                unscaled_action = unscale_action(self.action_space, action)

            assert action.shape == self.env.action_space.shape

            new_obs, reward, done, info = self.env.step(unscaled_action)

            # Store transition in the replay buffer.
            self.replay_buffer.add(obs, action, reward, new_obs, float(done))
            obs = new_obs

            # Retrieve reward and episode length if using Monitor wrapper
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_info_buf.extend([maybe_ep_info])

            if writer is not None:
                # Write reward per episode to tensorboard
                ep_reward = np.array([reward]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done,
                                                                  writer, self.num_timesteps)

            if step % self.train_freq == 0:
                mb_infos_vals = []
                # Update policy, critics and target networks
                for grad_step in range(self.gradient_steps):
                    # Break if the warmup phase is not over
                    # or if there are not enough samples in the replay buffer
                    if not self.replay_buffer.can_sample(self.batch_size) \
                            or self.num_timesteps < self.learning_starts:
                        break
                    n_updates += 1
                    # Compute current learning_rate
                    frac = 1.0 - step / total_timesteps
                    current_lr = self.learning_rate(frac)
                    # Update policy and critics (q functions)
                    mb_infos_vals.append(self._train_step(step, writer, current_lr))
                    # Update target network
                    if (step + grad_step) % self.target_update_interval == 0:
                        self.sess.run(self.target_update_op)
                # Log losses and entropy, useful to monitor training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

            episode_rewards[-1] += reward
            if done:
                if self.action_noise is not None:
                    self.action_noise.reset()
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)

                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))

            if len(episode_rewards[-101:-1]) == 0:
                mean_reward = -np.inf
            else:
                mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            self.num_timesteps += 1
            # Display training infos
            if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                fps = int(step / (time.time() - start_time))
                logger.logkv("episodes", num_episodes)
                logger.logkv("mean 100 episode reward", mean_reward)
                logger.logkv("episode reward", episode_rewards[-2])
                if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.logkv("n_updates", n_updates)
                logger.logkv("current_lr", current_lr)
                logger.logkv("fps", fps)
                logger.logkv('time_elapsed', int(time.time() - start_time))
                if len(episode_successes) > 0:
                    logger.logkv("success rate", np.mean(episode_successes[-100:]))
                if len(infos_values) > 0:
                    for (name, val) in zip(self.infos_names, infos_values):
                        logger.logkv(name, val)
                logger.logkv("total timesteps", self.num_timesteps)
                logger.dumpkvs()
                # Reset infos:
                infos_values = []
        return self
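
# scale_action / unscale_action above convert between the environment's
# native Box bounds and the tanh-squashed [-1, 1] range SAC operates in.
# A minimal sketch of that affine mapping (assuming a gym.spaces.Box;
# this restates the helpers' contract rather than quoting library code):
import numpy as np

def scale_action_sketch(action_space, action):
    # env bounds [low, high] -> [-1, 1]
    low, high = action_space.low, action_space.high
    return 2.0 * ((action - low) / (high - low)) - 1.0

def unscale_action_sketch(action_space, scaled_action):
    # [-1, 1] -> env bounds [low, high]; inverse of the function above
    low, high = action_space.low, action_space.high
    return low + 0.5 * (scaled_action + 1.0) * (high - low)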
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2",
          reset_num_timesteps=True):
    # Transform to callable if needed
    self.learning_rate = get_schedule_fn(self.learning_rate)
    self.cliprange = get_schedule_fn(self.cliprange)
    cliprange_vf = get_schedule_fn(self.cliprange_vf)

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn()

        t_first_start = time.time()
        n_updates = total_timesteps // self.n_batch
        counter = 0

        for update in range(1, n_updates + 1):
            assert self.n_batch % self.nminibatches == 0, ("The number of minibatches (`nminibatches`) "
                                                           "is not a factor of the total number of samples "
                                                           "collected per rollout (`n_batch`), "
                                                           "some samples won't be used.")
            batch_size = self.n_batch // self.nminibatches
            t_start = time.time()
            frac = 1.0 - (update - 1.0) / n_updates
            lr_now = self.learning_rate(frac)
            cliprange_now = self.cliprange(frac)
            cliprange_vf_now = cliprange_vf(frac)
            # true_reward is the reward without discount
            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = self.runner.run()

            # *************************************************
            # MODIFICATION: value adversary. After a warmup of 1000
            # updates, grow an adversarial coefficient with the update
            # counter, cap it at 1, and use it to perturb (eventually
            # sign-flip) the value estimates.
            # NOTE: the original never incremented `counter`, so the
            # adversary stayed disabled; incrementing it here matches
            # the apparent intent of the inline comments.
            counter += 1
            if counter < 1000:
                self_adversary = 0  # no adversary
            else:
                self_adversary = .0001
                self_adversary = self_adversary * counter  # increment adversary
            if self_adversary >= 1:  # don't become too nasty now, ya hear?
                self_adversary = 1
            values = values - ((values * self_adversary) * 2)
            # *************************************************

            self.num_timesteps += self.n_batch
            self.ep_info_buf.extend(ep_infos)
            mb_loss_vals = []
            if states is None:  # nonrecurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, batch_size):
                        timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch +
                                                                        epoch_num * self.n_batch + start) //
                                                                       batch_size)
                        end = start + batch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, writer=writer,
                                                             update=timestep, cliprange_vf=cliprange_vf_now))
            else:  # recurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                assert self.n_envs % self.nminibatches == 0
                env_indices = np.arange(self.n_envs)
                flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                envs_per_batch = batch_size // self.n_steps
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(env_indices)
                    for start in range(0, self.n_envs, envs_per_batch):
                        timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs +
                                                                        epoch_num * self.n_envs + start) //
                                                                       envs_per_batch)
                        end = start + envs_per_batch
                        mb_env_inds = env_indices[start:end]
                        mb_flat_inds = flat_indices[mb_env_inds].ravel()
                        slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_states = states[mb_env_inds]
                        mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, update=timestep,
                                                             writer=writer, states=mb_states,
                                                             cliprange_vf=cliprange_vf_now))

            loss_vals = np.mean(mb_loss_vals, axis=0)
            t_now = time.time()
            fps = int(self.n_batch / (t_now - t_start))

            if writer is not None:
                total_episode_reward_logger(self.episode_reward,
                                            true_reward.reshape((self.n_envs, self.n_steps)),
                                            masks.reshape((self.n_envs, self.n_steps)),
                                            writer, self.num_timesteps)

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, returns)
                logger.logkv("serial_timesteps", update * self.n_steps)
                logger.logkv("n_updates", update)
                logger.logkv("total_timesteps", self.num_timesteps)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(explained_var))
                if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                    logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                    logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

        return self
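
# The value-adversary schedule above is easier to see in isolation:
# zero for the first 1000 updates, then 0.0001 * counter capped at 1,
# and the perturbed values equal values * (1 - 2 * coeff), so the value
# estimates are fully sign-flipped once the coefficient saturates.
# A standalone restatement of that logic (function names are
# illustrative, not from the original code):
import numpy as np

def adversary_coefficient(counter, warmup=1000, rate=1e-4):
    if counter < warmup:
        return 0.0
    return min(rate * counter, 1.0)

def perturb_values(values, counter):
    coeff = adversary_coefficient(counter)
    # values - 2 * coeff * values == values * (1 - 2 * coeff)
    return values * (1.0 - 2.0 * coeff)

# e.g. perturb_values(np.array([1.0, -0.5]), counter=20000)
# -> array([-1. ,  0.5]) once the coefficient has saturated at 1.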