def learn(policy, env, seed, ob_space, ac_space, save_name, nsteps=5, total_timesteps=int(80e6),
          vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear',
          epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
    set_global_seeds(seed)
    nenvs = env.num_envs
    # ob_space = env.observation_space
    # ac_space = env.action_space
    save_dir = './model/' + save_name + '.ckt'
    summary_dir = './summary/' + save_name

    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr,
                  alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps,
                  lrschedule=lrschedule, summary_dir=summary_dir)
    runner = Runner(env, model, ob_space=ob_space, nsteps=nsteps, gamma=gamma)

    nbatch = nenvs * nsteps
    tstart = time.time()
    train_writer = model.train_writer
    episode_stats = EpisodeStats(nsteps, nenvs)

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, raw_rewards = runner.run()
        episode_stats.feed(raw_rewards, masks)
        mean_reward = episode_stats.mean_reward()
        mean_reward = np.asarray(mean_reward, dtype=np.float32)
        policy_loss, value_loss, policy_entropy, summary = model.train(
            obs, states, mean_reward, rewards, masks, actions, values)
        train_writer.add_summary(summary, update * nbatch)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("episode_reward", episode_stats.mean_reward())
            logger.record_tabular("episode_length", episode_stats.mean_length())
            logger.dump_tabular()
            model.save(save_dir)
    env.close()
    return model
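
# --- Hedged illustration, not part of the original code ---------------------
# A minimal sketch of what a baselines-style `explained_variance(values, rewards)`
# diagnostic (logged above) computes: 1 - Var[returns - predicted values] / Var[returns].
# A value near 1.0 means the critic tracks the returns well; 0 or below means it does
# no better than a constant predictor. The name and sample arrays below are placeholders.
import numpy as np

def explained_variance_sketch(ypred, y):
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary

# Example: a critic whose predictions are close to the empirical returns.
returns = np.array([1.0, 2.0, 3.0, 4.0])
predicted = np.array([1.1, 1.9, 3.2, 3.8])
print(explained_variance_sketch(predicted, returns))  # ~0.98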
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2",
          reset_num_timesteps=True):
    # Transform to callable if needed
    self.learning_rate = get_schedule_fn(self.learning_rate)
    self.cliprange = get_schedule_fn(self.cliprange)
    cliprange_vf = get_schedule_fn(self.cliprange_vf)
    bestscore = 0

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)
    callback = self._init_callback(callback)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    x, y = [0], [0]

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log,
                                                       tb_log_name, new_tb_log) as writer:
        self._setup_learn()
        episode_stats = EpisodeStats(self.n_steps, self.n_envs)

        t_first_start = time.time()
        n_updates = total_timesteps // self.n_batch

        callback.on_training_start(locals(), globals())

        for update in range(1, n_updates + 1):
            assert self.n_batch % self.nminibatches == 0, (
                "The number of minibatches (`nminibatches`) "
                "is not a factor of the total number of samples "
                "collected per rollout (`n_batch`), "
                "some samples won't be used.")
            batch_size = self.n_batch // self.nminibatches
            t_start = time.time()
            frac = 1.0 - (update - 1.0) / n_updates
            lr_now = self.learning_rate(frac)
            cliprange_now = self.cliprange(frac)
            cliprange_vf_now = cliprange_vf(frac)

            callback.on_rollout_start()
            # true_reward is the reward without discount
            rollout = self.runner.run(callback)
            # Unpack
            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout

            callback.update_locals(locals())
            callback.on_rollout_end()

            # Early stopping due to the callback
            if not self.runner.continue_training:
                break

            episode_stats.feed(true_reward, masks)
            self.ep_info_buf.extend(ep_infos)
            mb_loss_vals = []
            if states is None:  # nonrecurrent version
                update_fac = max(self.n_batch // self.nminibatches // self.noptepochs, 1)
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, batch_size):
                        timestep = self.num_timesteps // update_fac + (
                            (epoch_num * self.n_batch + start) // batch_size)
                        end = start + batch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices,
                                                             writer=writer, update=timestep,
                                                             cliprange_vf=cliprange_vf_now))
            else:  # recurrent version
                update_fac = max(self.n_batch // self.nminibatches // self.noptepochs // self.n_steps, 1)
                assert self.n_envs % self.nminibatches == 0
                env_indices = np.arange(self.n_envs)
                flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                envs_per_batch = batch_size // self.n_steps
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(env_indices)
                    for start in range(0, self.n_envs, envs_per_batch):
                        timestep = self.num_timesteps // update_fac + (
                            (epoch_num * self.n_envs + start) // envs_per_batch)
                        end = start + envs_per_batch
                        mb_env_inds = env_indices[start:end]
                        mb_flat_inds = flat_indices[mb_env_inds].ravel()
                        slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_states = states[mb_env_inds]
                        mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices,
                                                             update=timestep, writer=writer,
                                                             states=mb_states,
                                                             cliprange_vf=cliprange_vf_now))

            loss_vals = np.mean(mb_loss_vals, axis=0)
            t_now = time.time()
            fps = int(self.n_batch / (t_now - t_start))

            if writer is not None:
                total_episode_reward_logger(self.episode_reward,
                                            true_reward.reshape((self.n_envs, self.n_steps)),
                                            masks.reshape((self.n_envs, self.n_steps)),
                                            writer, self.num_timesteps)

            if self.verbose == 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, returns)
                logger.logkv("serial_timesteps", update * self.n_steps)
                logger.logkv("n_updates", update)
                logger.logkv("total_timesteps", self.num_timesteps)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(explained_var))
                # if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                #     logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                #     logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                logger.logkv("mean_episode_length", episode_stats.mean_length())
                logger.logkv("mean_episode_reward", episode_stats.mean_reward())
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

            if self.verbose == 2 and (update % log_interval == 0 or update == 1) \
                    and episode_stats.mean_reward() > bestscore:
                bestscore = episode_stats.mean_reward()
                logger.logkv('time_elapsed', t_start - t_first_start)
                logger.logkv("mean_episode_reward", bestscore)
                logger.dumpkvs()
                x.append(self.num_timesteps)
                y.append(bestscore)
                ax.plot(x, y, marker='.', color='b')
                fig.canvas.draw()
                ax.set_xlim(left=0, right=total_timesteps)
                ax.set(title='Street Fighter 2 AI - PPO2 Algorithm',
                       ylabel='Fitness score', xlabel='Timesteps')
                fig.show()
                plt.pause(0.001)

        callback.on_training_end()
        return self
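
# --- Hedged illustration, not part of the original code ---------------------
# A minimal sketch of the schedule machinery the PPO2 loop above relies on, under the
# assumption (stable-baselines-style) that `get_schedule_fn` wraps a plain number into a
# constant function and passes callables through unchanged. The hyperparameter values and
# the `get_schedule_fn_sketch` name are placeholders, not taken from the original code.
def get_schedule_fn_sketch(value_schedule):
    if isinstance(value_schedule, (float, int)):
        value = float(value_schedule)
        return lambda _frac: value          # constant schedule
    assert callable(value_schedule)
    return value_schedule                   # e.g. a user-supplied linear decay

learning_rate = get_schedule_fn_sketch(lambda frac: 2.5e-4 * frac)  # linearly decayed
cliprange = get_schedule_fn_sketch(0.2)                             # constant clip range

n_updates = 100
for update in range(1, n_updates + 1):
    # `frac` runs from 1.0 on the first update down toward 0.0 on the last, which is
    # exactly how lr_now / cliprange_now are computed at the top of the loop above.
    frac = 1.0 - (update - 1.0) / n_updates
    lr_now, cliprange_now = learning_rate(frac), cliprange(frac)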
def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_coef=0.5,
          ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5,
          alpha=0.99, gamma=0.99, log_interval=100, max_episode_length=None, optimizer=None):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, optimizer=optimizer)
    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
    stats = EpisodeStats(nsteps, nenvs, maxlen=100)

    nbatch = nenvs * nsteps
    tstart = time.time()
    # Start counting at 1 so the first logged fps/total_timesteps are non-zero
    # and the `update == 1` logging branch behaves as intended.
    for update in itertools.count(1):
        obs, states, rewards, masks, actions, values = runner.run()
        total_loss, policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        stats.feed(rewards, masks)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("total_loss", float(total_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("mean_episode_length", stats.mean_length())
            logger.record_tabular("mean_episode_reward", stats.mean_reward())
            logger.dump_tabular()
        if max_episode_length and stats.mean_length() >= max_episode_length:
            break
    env.close()
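
# --- Hedged illustration, not part of the original code ---------------------
# A minimal sketch of an EpisodeStats-like tracker, to show what the
# `stats.feed(rewards, masks)` / `mean_reward()` / `mean_length()` calls above rely on.
# This is NOT the library's implementation; it assumes `rewards` and `masks` arrive
# flattened as (nenvs * nsteps,) with masks marking the last step of an episode.
from collections import deque
import numpy as np

class SimpleEpisodeStats:
    def __init__(self, nsteps, nenvs, maxlen=100):
        self.nsteps, self.nenvs = nsteps, nenvs
        self.episode_rewards = deque(maxlen=maxlen)
        self.episode_lengths = deque(maxlen=maxlen)
        self._ret = np.zeros(nenvs)             # running return per env
        self._len = np.zeros(nenvs, dtype=int)  # running episode length per env

    def feed(self, rewards, masks):
        rewards = np.asarray(rewards).reshape(self.nenvs, self.nsteps)
        masks = np.asarray(masks).reshape(self.nenvs, self.nsteps)
        for env in range(self.nenvs):
            for step in range(self.nsteps):
                self._ret[env] += rewards[env, step]
                self._len[env] += 1
                if masks[env, step]:            # episode finished in this env
                    self.episode_rewards.append(self._ret[env])
                    self.episode_lengths.append(self._len[env])
                    self._ret[env] = 0.0
                    self._len[env] = 0

    def mean_reward(self):
        return float(np.mean(self.episode_rewards)) if self.episode_rewards else 0.0

    def mean_length(self):
        return float(np.mean(self.episode_lengths)) if self.episode_lengths else 0.0

# Usage mirroring the loop above (shapes and values are placeholders):
stats = SimpleEpisodeStats(nsteps=5, nenvs=2)
stats.feed(rewards=np.ones(10), masks=np.array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0]))
print(stats.mean_reward(), stats.mean_length())  # 4.0 4.0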