def train(env_id, num_timesteps, seed):
    set_global_seeds(seed)
    env = gym.make(env_id)
    logger_path = None if logger.get_dir() is None else os.path.join(
        logger.get_dir(), str(0))
    env = Monitor(env, logger_path, allow_early_resets=True)
    env.seed(seed)

    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
              timesteps_per_batch=2500, desired_kl=0.002,
              num_timesteps=num_timesteps, animate=False)

        env.close()
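# A minimal, hypothetical driver for the train() helper above. The argparse
# flag names and defaults below are illustrative assumptions, not taken from
# the source.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Reacher-v2')
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--seed', type=int, default=0)
    cli_args = parser.parse_args()
    train(cli_args.env, num_timesteps=cli_args.num_timesteps, seed=cli_args.seed)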
def _thunk():
    env = make_atari(args.env_name, args.max_episode_steps)
    env.seed(args.seed + rank)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=True)
    return wrap_deepmind(env)
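# A hedged usage sketch: closures like _thunk are typically produced by a
# small factory that binds `rank`, and the resulting list of closures is
# handed to a vectorized environment so each worker process builds its own
# seeded, monitored Atari env. `make_env_thunk` and `num_workers` are
# illustrative names, not from the source.
def make_env_thunk(rank):
    def _thunk():
        env = make_atari(args.env_name, args.max_episode_steps)
        env.seed(args.seed + rank)
        env = Monitor(env,
                      logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                      allow_early_resets=True)
        return wrap_deepmind(env)
    return _thunk

# envs = SubprocVecEnv([make_env_thunk(rank) for rank in range(num_workers)])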
def main():
    args = mujoco_arg_parser()
    logger.configure(dir=args.logdir)

    nenv = 16
    envs = []
    for i in range(nenv):
        e = gym.make(args.env)
        e.seed(args.seed + 1000 * i)  # for repeatability
        e = Monitor(e, logger.get_dir(), allow_early_resets=True)
        envs.append(e)
    envs = DummyVecEnv(envs)
    envs = VecNormalize(envs)
    set_global_seeds(args.seed)  # for repeatability

    agent = MlpAgent(envs.observation_space.shape[0],
                     envs.action_space.shape[0])
    if args.checkpoint:
        agent.load_state_dict(torch.load(args.checkpoint))

    agent = train(agent, envs, N_steps=5, N_updates=args.updates,
                  batch_size=128, lam=0.95, gamma=0.99, epsilon=1e-5,
                  N_train_sample_epochs=10, log_interval=10,
                  ent_coef=0.01, vf_coef=0.5, lr=1e-5, cliprange=0.2,
                  save_interval=500)

    if args.play:
        logger.log("Running trained model")
        obs = np.zeros((envs.num_envs,) + envs.observation_space.shape)
        obs[:] = envs.reset()
        while True:
            actions = agent.step(obs)[0]
            obs[:] = envs.step(actions)[0]
            envs.render()
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir)

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv(
            [make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
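# A usage sketch for make_vec_env (the environment id and worker count are
# illustrative, not from the source): eight seeded Atari workers behind one
# vectorized interface, stepped once with a batch of random actions.
if __name__ == '__main__':
    import numpy as np
    venv = make_vec_env('BreakoutNoFrameskip-v4', 'atari', num_env=8, seed=0)
    obs = venv.reset()
    actions = np.array([venv.action_space.sample() for _ in range(venv.num_envs)])
    obs, rewards, dones, infos = venv.step(actions)
    venv.close()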
def train(agent, env, N_steps, N_updates, ent_coef, lr,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, batch_size=4, N_train_sample_epochs=4,
          cliprange=0.2, save_interval=0):
    # lr and cliprange may be constants or schedules (callables of the
    # remaining-progress fraction).
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)

    runner = Runner(env, agent, nsteps=N_steps, gamma=gamma, lam=lam)
    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    for update in range(1, N_updates + 1):
        tstart = time.time()
        obs, returns, dones, actions, values, neglogpacs, states, epinfos = runner.run()  # pylint: disable=E0632
        epinfobuf.extend(epinfos)

        # Anneal learning rate and clip range as training progresses.
        frac = 1.0 - (update - 1.0) / N_updates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        optimizer = optim.Adam(agent.parameters(), lr=lrnow)

        mblossnames = ['policy_loss', 'value_loss', 'entropy', 'approxkl', 'clipfrac']
        mblossvals = []
        N_sample_steps = obs.shape[0]
        inds = np.arange(N_sample_steps)
        agent.train()
        for _ in range(N_train_sample_epochs):
            np.random.shuffle(inds)
            for start in range(0, N_sample_steps, batch_size):
                end = start + batch_size
                mbinds = inds[start:end]
                obs_ = torch.tensor(obs[mbinds], requires_grad=True).float()
                returns_ = torch.tensor(returns[mbinds]).float()
                actions_ = torch.tensor(actions[mbinds]).float()
                values_ = torch.tensor(values[mbinds]).float()
                neglogpacs_ = torch.tensor(neglogpacs[mbinds]).float()

                # Normalize advantages within the minibatch.
                advs_ = returns_ - values_
                advs_ = (advs_ - advs_.mean()) / (advs_.std() + 1e-8)

                optimizer.zero_grad()
                neglogp, entropy, vpred = agent.statistics(obs_, actions_)
                entropy = torch.mean(entropy)

                # Clipped value loss.
                vpred_clip = values_ + torch.clamp(vpred - values_,
                                                   -cliprangenow, cliprangenow)
                vf_loss = torch.max((vpred - returns_) ** 2,
                                    (vpred_clip - returns_) ** 2)
                vf_loss = 0.5 * torch.mean(vf_loss)

                # Clipped surrogate policy loss.
                ratio = torch.exp(neglogpacs_ - neglogp)
                pg_loss = torch.max(-advs_ * ratio,
                                    -advs_ * torch.clamp(ratio,
                                                         1.0 - cliprangenow,
                                                         1.0 + cliprangenow))
                pg_loss = torch.mean(pg_loss)

                approxkl = .5 * torch.mean((neglogp - neglogpacs_) ** 2)
                clipfrac = torch.mean(
                    (torch.abs(ratio - 1.0) > torch.tensor(cliprangenow)).float())

                loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
                loss.backward()
                optimizer.step()
                mblossvals.append([pg_loss.item(), vf_loss.item(), entropy.item(),
                                   approxkl.item(), clipfrac.item()])

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(N_sample_steps / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * N_steps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * N_sample_steps)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, mblossnames):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = os.path.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = os.path.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            torch.save(agent.state_dict(), savepath)
    env.close()
    return agent
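# The training loop above relies on two small helpers, constfn and safemean,
# that are not shown in this file. The definitions below are a sketch matching
# their conventional baselines-style implementations (an assumption, not taken
# from this source).
import numpy as np

def constfn(val):
    # Wrap a constant so it can be called like a schedule of the progress fraction.
    def f(_):
        return val
    return f

def safemean(xs):
    # Avoid a RuntimeWarning when the episode-info buffer is still empty.
    return np.nan if len(xs) == 0 else np.mean(xs)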
def train(self):
    self.net_mode(train=True)
    tfirststart = time.time()
    epoch_episode_rewards = deque(maxlen=1)
    epoch_episode_steps = deque(maxlen=1)
    total_rollout_steps = 0

    for epoch in range(self.global_step, self.num_iters):
        episode_reward = 0
        episode_step = 0
        self.action_noise.reset()
        obs = self.env.reset()
        obs = obs[0]
        epoch_actor_losses = []
        epoch_critic_losses = []
        if self.use_her:
            ep_experi = {
                'obs': [],
                'act': [],
                'reward': [],
                'new_obs': [],
                'ach_goals': [],
                'done': []
            }
        for t_rollout in range(self.rollout_steps):
            total_rollout_steps += 1
            # Act randomly during warmup (unless pretrained) or with probability random_prob.
            ran = np.random.random(1)[0]
            if self.pretrain_dir is None and epoch < self.warmup_iter or \
                    ran < self.random_prob:
                act = self.random_action().flatten()
            else:
                act = self.policy(obs).flatten()
            new_obs, r, done, info = self.env.step(act)
            ach_goals = new_obs[1].copy()
            new_obs = new_obs[0].copy()
            episode_reward += r
            episode_step += 1
            self.memory.append(obs, act, r * self.reward_scale,
                               new_obs, ach_goals, done)
            if self.use_her:
                ep_experi['obs'].append(obs)
                ep_experi['act'].append(act)
                ep_experi['reward'].append(r * self.reward_scale)
                ep_experi['new_obs'].append(new_obs)
                ep_experi['ach_goals'].append(ach_goals)
                ep_experi['done'].append(done)
            if self.ob_norm:
                self.obs_oms.update(new_obs)
            obs = new_obs
        epoch_episode_rewards.append(episode_reward)
        epoch_episode_steps.append(episode_step)

        if self.use_her:
            # Hindsight relabeling: for each stored step, resample goals from
            # goals achieved at the same or later steps of the episode and
            # store the relabeled transitions in the replay memory.
            for t in range(episode_step - self.k_future):
                ob = ep_experi['obs'][t]
                act = ep_experi['act'][t]
                new_ob = ep_experi['new_obs'][t]
                ach_goal = ep_experi['ach_goals'][t]
                k_futures = np.random.choice(np.arange(t + 1, episode_step),
                                             self.k_future - 1, replace=False)
                k_futures = np.concatenate((np.array([t]), k_futures))
                for future in k_futures:
                    new_goal = ep_experi['ach_goals'][future]
                    her_ob = np.concatenate(
                        (ob[:-self.goal_dim], new_goal), axis=0)
                    her_new_ob = np.concatenate(
                        (new_ob[:-self.goal_dim], new_goal), axis=0)
                    res = self.env.cal_reward(ach_goal.copy(), new_goal, act)
                    her_reward, _, done = res
                    self.memory.append(her_ob, act,
                                       her_reward * self.reward_scale,
                                       her_new_ob, ach_goal.copy(), done)

        self.global_step += 1
        if epoch >= self.warmup_iter:
            for t_train in range(self.train_steps):
                act_loss, cri_loss = self.train_net()
                epoch_critic_losses.append(cri_loss)
                epoch_actor_losses.append(act_loss)

        if epoch % self.log_interval == 0:
            tnow = time.time()
            stats = {}
            if self.ob_norm:
                stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy())
                stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy())
            stats['total_rollout_steps'] = total_rollout_steps
            stats['rollout/return'] = safemean(
                [rew for rew in epoch_episode_rewards])
            stats['rollout/ep_steps'] = safemean(
                [l for l in epoch_episode_steps])
            if epoch >= self.warmup_iter:
                stats['actor_loss'] = np.mean(epoch_actor_losses)
                stats['critic_loss'] = np.mean(epoch_critic_losses)
            stats['epoch'] = epoch
            stats['actor_lr'] = self.actor_optim.param_groups[0]['lr']
            stats['critic_lr'] = self.critic_optim.param_groups[0]['lr']
            stats['time_elapsed'] = tnow - tfirststart
            for name, value in stats.items():
                logger.logkv(name, value)
            logger.dumpkvs()

        if (epoch == 0 or epoch >= self.warmup_iter) and \
                self.save_interval and \
                epoch % self.save_interval == 0 and \
                logger.get_dir():
            mean_final_dist, succ_rate = self.rollout()
            logger.logkv('epoch', epoch)
            logger.logkv('test/total_rollout_steps', total_rollout_steps)
            logger.logkv('test/mean_final_dist', mean_final_dist)
            logger.logkv('test/succ_rate', succ_rate)
            tra_mean_dist, tra_succ_rate = self.rollout(train_test=True)
            logger.logkv('train/mean_final_dist', tra_mean_dist)
            logger.logkv('train/succ_rate', tra_succ_rate)
            # self.log_model_weights()
            logger.dumpkvs()
            if mean_final_dist < self.closest_dist:
                self.closest_dist = mean_final_dist
                is_best = True
            else:
                is_best = False
            self.save_model(is_best=is_best, step=self.global_step)
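# A small, self-contained sketch of the future-goal index sampling used in the
# HER block above (the numeric values are illustrative): for step t, one
# relabeled copy uses the goal achieved at t itself, and k_future - 1 copies
# use goals achieved at uniformly sampled later steps of the same episode.
import numpy as np

t, episode_step, k_future = 3, 20, 4
future_idx = np.random.choice(np.arange(t + 1, episode_step),
                              k_future - 1, replace=False)
future_idx = np.concatenate((np.array([t]), future_idx))
print(future_idx)  # e.g. [ 3 11  7 18]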
def train(agent, env, N_steps, N_updates, ent_coef, lr,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          epsilon=1e-5, alpha=0.95, log_interval=10, batch_size=4,
          N_train_sample_epochs=4, cliprange=0.2, save_interval=0):
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)

    runner = Runner(env, agent, nsteps=N_steps, gamma=gamma, lam=lam)
    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    for update in range(1, N_updates + 1):
        obs, returns, dones, actions, values, neglogpacs, states, epinfos = runner.run()  # pylint: disable=E0632
        epinfobuf.extend(epinfos)

        frac = 1.0 - (update - 1.0) / N_updates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        # `alpha` is the RMSprop smoothing constant; passing it as weight_decay
        # (as in the original) would apply a very large L2 penalty instead.
        optimizer = optim.RMSprop(agent.parameters(), lr=lrnow,
                                  alpha=alpha, eps=epsilon)

        mblossnames = ['policy_loss', 'value_loss', 'entropy']
        mblossvals = []
        agent.train()

        obs_ = torch.tensor(obs, requires_grad=True).float()
        returns_ = torch.tensor(returns).float()
        actions_ = torch.tensor(actions).float()
        values_ = torch.tensor(values).float()
        neglogpacs_ = torch.tensor(neglogpacs).float()
        advs_ = returns_ - values_

        optimizer.zero_grad()
        neglogp, entropy, vpred = agent.statistics(obs_, actions_)
        entropy = torch.mean(entropy)
        vf_loss = torch.mean(0.5 * (vpred - returns_) ** 2)
        pg_loss = torch.mean(advs_ * neglogp)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        loss.backward()
        optimizer.step()
        mblossvals.append([pg_loss.item(), vf_loss.item(), entropy.item()])

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        N_sample_steps = obs.shape[0]
        fps = int(update * N_sample_steps / (tnow - tfirststart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * N_sample_steps)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, mblossnames):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = os.path.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = os.path.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            torch.save(agent.state_dict(), savepath)
    env.close()
    return agent
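# Because lr and cliprange accept callables of the remaining-progress fraction
# `frac` (1.0 at the first update, approaching 0 at the last), a linear-decay
# schedule can be passed in place of a float. A sketch with an illustrative
# initial value; the other arguments are whatever the caller already uses:
linear_lr = lambda frac: 7e-4 * frac
# agent = train(agent, envs, N_steps=5, N_updates=10000, ent_coef=0.01, lr=linear_lr)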
def train(self):
    epinfobuf = deque(maxlen=20)
    tfirststart = time.time()

    for update in range(self.num_iters):
        tstart = time.time()
        res = self.runner.run()
        obs, returns, dones, actions, values, acts_neglog, epinfos = res
        if self.ob_rms:
            self.model.ob_rms.update(obs)
        epinfobuf.extend(epinfos)

        lossvals = {
            'policy_loss': [],
            'value_loss': [],
            'policy_entropy': [],
            'approxkl': [],
            'clipfrac': []
        }
        inds = np.arange(self.nbatch)
        for _ in range(self.noptepochs):
            np.random.shuffle(inds)
            for start in range(0, self.nbatch, self.nbatch_train):
                end = start + self.nbatch_train
                mbinds = inds[start:end]
                slices = (arr[mbinds] for arr in
                          (obs, actions, returns, acts_neglog, values))
                info = self.model.train(*slices)
                lossvals['policy_loss'].append(info['pg_loss'])
                lossvals['value_loss'].append(info['vf_loss'])
                lossvals['policy_entropy'].append(info['entropy'])
                lossvals['approxkl'].append(info['approxkl'])
                lossvals['clipfrac'].append(info['clipfrac'])

        tnow = time.time()
        fps = int(self.nbatch / (tnow - tstart))
        if update % self.log_interval == 0:
            ev = explained_variance(values, returns)
            logger.logkv("Learning rate",
                         self.model.optimizer.param_groups[0]['lr'])
            logger.logkv("serial_timesteps", update * self.nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * self.nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['reward'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['steps'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for name, value in lossvals.items():
                logger.logkv(name, np.mean(value))
            logger.dumpkvs()

        if self.save_interval and \
                update % self.save_interval == 0 and \
                logger.get_dir():
            self.model.log_model_weights()
            avg_steps, avg_reward = self.runner.test()
            logger.logkv("nupdates", update)
            logger.logkv("test/total_timesteps", update * self.nbatch)
            logger.logkv('test/step', avg_steps)
            logger.logkv('test/reward', avg_reward)
            if not self.with_embed:
                res = self.runner.test(train=True)
                train_avg_steps, train_avg_reward = res
                logger.logkv('train/step', train_avg_steps)
                logger.logkv('train/reward', train_avg_reward)
            logger.dumpkvs()
            if avg_reward > self.model.best_rewards:
                self.model.best_rewards = avg_reward
                is_best = True
            else:
                is_best = False
            self.model.save_model(is_best=is_best, step=update)
    self.env.close()
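# Sketch of the explained_variance diagnostic logged by the loops above,
# matching its conventional baselines-style definition (an assumption, not
# shown in this source): 1 - Var[y - y_pred] / Var[y], where 1 means the value
# function predicts the returns perfectly and 0 means it is no better than a
# constant predictor.
import numpy as np

def explained_variance(ypred, y):
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary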