class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, num_env):
        self.make_env = make_env
        self.hps = hps
        self.num_env = num_env
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(
            hps=hps, scope='pol', ob_space=self.ob_space,
            ac_space=self.ac_space, hidsize=512, feat_dim=512,
            ob_mean=self.ob_mean, ob_std=self.ob_std,
            layernormalize=False, nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "rnd": RandomNetworkDistillation,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['policy']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy, features_shared_with_policy=False,
            feat_dim=512, layernormalize=hps['layernorm'])

        if hps['policy'] == 'rnd':
            self.dynamics = RNDDyn
        elif hps['policy'] == 'pix2pix':
            self.dynamics = UNet
        else:
            self.dynamics = Dynamics
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512)

        self.agent = PpoOptimizer(
            hps=hps, scope='ppo', ob_space=self.ob_space,
            ac_space=self.ac_space, stochpol=self.policy,
            use_news=hps['use_news'], gamma=hps['gamma'],
            gamma_ext=hps['gamma_ext'], lam=hps["lambda"],
            nepochs=hps['nepochs'], nminibatches=hps['nminibatches'],
            lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_env'],
            nsegs_per_env=hps['nsegs_per_env'], ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'],
            dynamics=self.dynamics)

        if hps['policy'] != 'rnd':
            self.agent.to_report['aux'] = tf.reduce_mean(
                self.feature_extractor.loss)
            self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env, nsteps=10000)
        del env
        self.envs = [
            functools.partial(self.make_env, i) for i in range(self.num_env)
        ]

    def train(self, args):
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        sess = tf.get_default_session()
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        checkdir = osp.join(logger.get_dir(), 'checkpoints',
                            args['env'] + '-' + args['policy'])
        os.makedirs(checkdir, exist_ok=True)
        load_weights = args['load_weights']
        start_nupdates = 0
        if load_weights is not None:
            load_path = osp.join(checkdir, load_weights)
            start_nupdates = int(load_weights)
            print('Loading checkpoint from %s ' % load_weights)
            self.load(load_path)
        while True:
            info = self.agent.step()
            if info['update']:
                print('task = ', args['env'], ' num_env = ', args['num_env'],
                      ' policy = ', args['policy'])
                info['update']['n_updates'] += start_nupdates
                info['update']['tcount'] += (start_nupdates *
                                             args['nsteps_per_env'] *
                                             args['num_env'])
                logger.logkvs(info['update'])
                logger.dumpkvs()
                print('Time elapsed ' + str(
                    datetime.timedelta(seconds=info['update']['total_secs'])))
                if (info['update']['n_updates'] % 10 == 0
                        or info['update']['n_updates'] == 1):
                    weights_index = info['update']['n_updates']
                    savepath = osp.join(checkdir, '%.5i' % weights_index)
                    print('Saving to', savepath)
                    self.save(savepath)
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
        self.agent.stop_interaction()
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
            hidsize=512, feat_dim=512, ob_mean=self.ob_mean,
            ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy, features_shared_with_policy=False,
            feat_dim=512, layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space,
            stochpol=self.policy, use_news=hps['use_news'],
            gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'], ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'],
            dynamics=self.dynamics)

        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [functools.partial(self.make_env, i)
                     for i in range(self.envs_per_process)]

    def train(self):
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        while True:
            info = self.agent.step()
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
        self.agent.stop_interaction()
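# --- Usage sketch (illustrative only, not part of the original source) ---
# A minimal sketch of how the baseline Trainer variant above might be
# constructed and run. The env factory `make_my_env`, the hyperparameter
# values, and the session handling are assumptions made for illustration;
# the hps keys simply mirror the ones Trainer.__init__ and train() read.
import gym
import tensorflow as tf


def make_my_env(rank, add_monitor=True):
    # Placeholder factory: the real repository wraps Atari envs with
    # preprocessing, frame stacking, and monitoring. Any pixel-observation
    # Gym env is enough for the purpose of this sketch.
    return gym.make('BreakoutNoFrameskip-v4')


if __name__ == '__main__':
    hps = {
        'feat_learning': 'none', 'layernorm': False, 'dyn_from_pixels': False,
        'use_news': False, 'gamma': 0.99, 'lambda': 0.95, 'nepochs': 3,
        'nminibatches': 8, 'lr': 1e-4, 'nsteps_per_seg': 128,
        'nsegs_per_env': 1, 'ent_coeff': 0.001, 'norm_rew': True,
        'norm_adv': True, 'ext_coeff': 0.0, 'int_coeff': 1.0, 'nlumps': 1,
    }
    # Building the Trainer constructs the TF graph; training is driven
    # through a default session, which start_interaction() is assumed to use.
    trainer = Trainer(make_env=make_my_env, hps=hps,
                      num_timesteps=int(1e8), envs_per_process=128)
    with tf.Session():
        trainer.train()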
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope="pol", ob_space=self.ob_space, ac_space=self.ac_space,
            hidsize=512, feat_dim=512, ob_mean=self.ob_mean,
            ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels,
        }[hps["feat_learning"]]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy, features_shared_with_policy=False,
            feat_dim=512, layernormalize=hps["layernorm"])

        self.dynamics = Dynamics if hps["feat_learning"] != "pix2pix" else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps["dyn_from_pixels"], feat_dim=512,
            ama=hps["ama"], uncertainty_penalty=hps["uncertainty_penalty"],
            clip_ama=hps["clip_ama"], clip_val=hps["clip_val"],
            reward_scaling=hps["reward_scaling"], abs_ama=hps["abs_ama"])

        self.agent = PpoOptimizer(
            scope="ppo", ob_space=self.ob_space, ac_space=self.ac_space,
            stochpol=self.policy, use_news=hps["use_news"],
            gamma=hps["gamma"], lam=hps["lambda"], nepochs=hps["nepochs"],
            nminibatches=hps["nminibatches"], lr=hps["lr"], cliprange=0.1,
            nsteps_per_seg=hps["nsteps_per_seg"],
            nsegs_per_env=hps["nsegs_per_env"], ent_coef=hps["ent_coeff"],
            normrew=hps["norm_rew"], normadv=hps["norm_adv"],
            ext_coeff=hps["ext_coeff"], int_coeff=hps["int_coeff"],
            dynamics=self.dynamics, args=hps)

        self.agent.to_report["aux"] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report["aux"]
        self.agent.to_report["dyn_loss"] = tf.reduce_mean(self.dynamics.loss[0])
        self.agent.total_loss += self.agent.to_report["dyn_loss"]
        self.agent.to_report["feat_var"] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [functools.partial(self.make_env, i)
                     for i in range(self.envs_per_process)]

    def train(self):
        import random
        self.agent.start_interaction(self.envs, nlump=self.hps["nlumps"],
                                     dynamics=self.dynamics)
        count = 0
        while True:
            count += 1
            info = self.agent.step()
            if info["update"]:
                logger.logkvs(info["update"])
                logger.dumpkvs()
                if self.hps["feat_learning"] == "pix2pix":
                    making_video = random.choice(99 * [False] + [True])
                else:
                    making_video = False
                self.agent.rollout.making_video = making_video
                for a_key in info.keys():
                    wandb.log(info[a_key])
                # going to have to log it here
                wandb.log({"average_sigma": np.mean(self.agent.rollout.buf_sigmas)})
            if self.agent.rollout.stats["tcount"] > self.num_timesteps:
                break
        self.agent.stop_interaction()
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self.save_interval = hps['save_interval']
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
            hidsize=512, feat_dim=512, ob_mean=self.ob_mean,
            ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu)
        # add policy to collections
        tf.add_to_collection('policy', self.policy)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy, features_shared_with_policy=False,
            feat_dim=512, layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space,
            stochpol=self.policy, use_news=hps['use_news'],
            gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'], ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'],
            dynamics=self.dynamics, n_eval_steps=hps['n_eval_steps'])
        # add agent to collections
        tf.add_to_collection('agent', self.agent)

        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [functools.partial(self.make_env, i)
                     for i in range(self.envs_per_process)]

    def train(self, saver, sess, restore=False):
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        write_meta_graph = False
        saves = 0
        loops = 0
        while True:
            info = self.agent.step(eval=False)
            if info is not None:
                if info['update'] and not restore:
                    logger.logkvs(info['update'])
                    logger.dumpkvs()
                steps = self.agent.rollout.stats['tcount']
                if loops % 10 == 0:
                    filename = args.saved_model_dir + 'model.ckpt'
                    saver.save(sess, filename, global_step=int(saves),
                               write_meta_graph=False)
                    saves += 1
                loops += 1
                if steps > self.num_timesteps:
                    break
        self.agent.stop_interaction()

    def test(self, saver, sess):
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        print('loading model')
        saver.restore(sess, args.saved_model_dir + args.model_name)
        print('loaded model,', args.saved_model_dir + args.model_name)
        include_images = args.include_images and eval
        info = self.agent.step(eval=True, include_images=include_images)
        if info['update']:
            logger.logkvs(info['update'])
            logger.dumpkvs()
        # save actions, news, and / or images
        np.save(args.env + '_data.npy', info)
        print('EVALUATION COMPLETED')
        print('SAVED DATA IN CURRENT DIRECTORY')
        print('FILENAME', args.env + '_data.npy')
        self.agent.stop_interaction()
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        # Initialize ob_space, ac_space, ob_mean, ob_std and build self.envs,
        # which holds the per-process environment constructors.
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
            hidsize=512, feat_dim=512, ob_mean=self.ob_mean,
            ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu)

        # Before building the dynamics model, initialize the feature
        # extractor (defined in auxiliary_task.py). "pix2pix" amounts to
        # using raw pixels, i.e. no learned features.
        self.feature_extractor = {
            "none": FeatureExtractor,  # default
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]  # the hps flag selects the feature extractor
        self.feature_extractor = self.feature_extractor(
            policy=self.policy, features_shared_with_policy=False,
            feat_dim=512, layernormalize=hps['layernorm'])

        # Initialize the dynamics model; the feature extractor defined above
        # is passed in as its auxiliary task.
        self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor,
                                     reward_type=hps['reward_type'],
                                     sample_seeds=hps['sample_seeds'])

        self.agent = PpoOptimizer(
            scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space,
            stochpol=self.policy, use_news=hps['use_news'],
            gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'], ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,  # dynamics model object
            nepochs_dvae=hps["nepochs_dvae"]  # extra training epochs for the dynamics model
        )

        # Agent loss: actor, critic and entropy terms; add the loss from
        # feature learning.
        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']

        # Dynamics loss.
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        # add bai. Keep a separate handle on the DVAE loss, since the DVAE may
        # be trained multiple times.
        self.agent.dynamics_loss = self.agent.to_report['dyn_loss']

        # Variance of the features produced by the auxiliary task,
        # shape=(512,); tf.reduce_mean below turns it into a scalar.
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        """
        This env is created only to initialize ob_space, ac_space, ob_mean
        and ob_std, and is deleted afterwards. self.envs_per_process envs are
        then set up.
        """
        env = self.make_env(0, add_monitor=False)
        # ob_space.shape=(84, 84, 4), ac_space=Discrete(4)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        # A random agent interacts with the env to estimate the observation
        # mean and std. ob_mean.shape=(84, 84, 4) with values in [0, 255];
        # ob_std is a scalar, about 1.8 on Breakout.
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [functools.partial(self.make_env, i)
                     for i in range(self.envs_per_process)]

    def train(self, saver, logger_dir):
        # Build the compute graph and initialize the rollout object.
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        previous_saved_tcount = 0
        while True:
            # Interact with the env for one rollout, collect samples, compute
            # intrinsic rewards, and train.
            info = self.agent.step()
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.hps["save_period"] and (
                    int(self.agent.rollout.stats['tcount'] /
                        self.hps["save_freq"]) > previous_saved_tcount):
                previous_saved_tcount += 1
                save_path = saver.save(
                    tf.get_default_session(),
                    os.path.join(logger_dir,
                                 "model_" + str(previous_saved_tcount) + ".ckpt"))
                print("Periodically model saved in path:", save_path)
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                save_path = saver.save(
                    tf.get_default_session(),
                    os.path.join(logger_dir, "model_last.ckpt"))
                print("Model saved in path:", save_path)
                break
        self.agent.stop_interaction()
class Trainer(object):
    from baselines import logger

    def __init__(self, make_env, hps, num_timesteps, envs_per_process,
                 exp_name=None, env_name=None, policy=None, feat_ext=None,
                 dyn=None, agent_num=None, restore_name=None):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.depth_pred = hps['depth_pred']
        self.aux_input = hps['aux_input']
        self.num_timesteps = num_timesteps
        self._set_env_vars()
        if exp_name:
            self.exp_name = exp_name
        else:
            self.exp_name = hps['exp_name']
        if env_name:
            self.env_name = env_name
        else:
            self.env_name = hps['env']

        if policy is None:
            if hps['lstm']:
                self.policy = LSTMPolicy(
                    scope='pol', ob_space=self.ob_space,
                    ac_space=self.ac_space, hidsize=512,
                    batchsize=hps['envs_per_process'], feat_dim=512,
                    ob_mean=self.ob_mean, ob_std=self.ob_std,
                    lstm1_size=hps['lstm1_size'],
                    lstm2_size=hps['lstm2_size'], layernormalize=False,
                    nl=tf.nn.leaky_relu, depth_pred=hps['depth_pred'],
                    aux_input=hps['aux_input'])
            else:
                self.policy = CnnPolicy(
                    scope='pol', ob_space=self.ob_space,
                    ac_space=self.ac_space, hidsize=512, feat_dim=512,
                    ob_mean=self.ob_mean, ob_std=self.ob_std,
                    layernormalize=False, nl=tf.nn.leaky_relu)
        else:
            self.policy = policy
            self.policy.restore()

        if feat_ext:
            self.feature_extractor = feat_ext
            self.feature_extractor.restore()
        else:
            self.feature_extractor = {
                "none": FeatureExtractor,
                "idf": InverseDynamics,
                "vaesph": partial(VAE, spherical_obs=True),
                "vaenonsph": partial(VAE, spherical_obs=False),
                "pix2pix": JustPixels
            }[hps['feat_learning']]
            self.feature_extractor = self.feature_extractor(
                policy=self.policy,
                features_shared_with_policy=hps['feat_share'],
                feat_dim=hps['dyn_feat_dim'],
                layernormalize=hps['layernorm'])

        if dyn:
            self.dynamics = dyn
            self.dynamics.restore()
        else:
            self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
            self.dynamics = self.dynamics(
                auxiliary_task=self.feature_extractor,
                predict_from_pixels=hps['dyn_from_pixels'],
                feat_dim=hps['dyn_feat_dim'])

        self.agent = PpoOptimizer(
            hps=hps, scope='ppo', ob_space=self.ob_space,
            env_ob_space=self.env_ob_space, ac_space=self.ac_space,
            stochpol=self.policy, use_news=hps['use_news'],
            gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'], ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'],
            dynamics=self.dynamics, exp_name=self.exp_name,
            env_name=self.env_name, video_log_freq=hps['video_log_freq'],
            model_save_freq=hps['model_save_freq'],
            use_apples=hps['use_apples'], agent_num=agent_num,
            restore_name=restore_name, multi_envs=hps['multi_train_envs'],
            lstm=hps['lstm'], lstm1_size=hps['lstm1_size'],
            lstm2_size=hps['lstm2_size'], depth_pred=hps['depth_pred'],
            aux_input=hps['aux_input'], beta_d=hps['beta'],
            early_stop=hps['early_stop'], optim=hps['optim'],
            decay=hps['decay'], grad_clip=hps['grad_clip'],
            log_grads=hps['log_grads'], logdir=hps['logdir'])

        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
        if hps['curiosity']:
            self.agent.total_loss += hps['aux_loss_coeff'] * self.agent.to_report['aux']
            self.agent.total_loss += hps['dyn_loss_coeff'] * self.agent.to_report['dyn_loss']

    def _set_env_vars(self):
        import numpy as np
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.env_ob_space = env.observation_space
        if self.depth_pred:
            self.ob_space = gym.spaces.Box(0, 255, shape=(84, 84, 3),
                                           dtype=np.uint8)
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(
            env, depth_pred=self.hps['depth_pred'])
        del env
        self.envs = [functools.partial(self.make_env, i)
                     for i in range(self.envs_per_process)]

    def train(self, saver, sess, restore=False):
        from baselines import logger
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        if restore:
            print("Restoring model for training")
            saver.restore(sess, "models/" + self.hps['restore_model'] + ".ckpt")
            print("Loaded model", self.hps['restore_model'])
        write_meta_graph = False
        while True:
            info = self.agent.step()
            if info['update']:
                if info['update']['recent_best_ext_ret'] is None:
                    info['update']['recent_best_ext_ret'] = 0
                wandb.log(info['update'])
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
        if self.hps['tune_env']:
            filename = ("models/" + self.hps['restore_model'] + "_tune_on_" +
                        self.hps['tune_env'] + "_final.ckpt")
        else:
            filename = "models/" + self.hps['exp_name'] + "_final.ckpt"
        saver.save(sess, filename, write_meta_graph=False)
        self.policy.save_model(self.hps['exp_name'], 'final')
        self.agent.stop_interaction()
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self.save_checkpoint = hps['save_checkpoint']
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
            hidsize=512, feat_dim=512, ob_mean=self.ob_mean,
            ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels,
            "flowS": OpticalFlowFeatureExtractor,
            "flowC": OpticalFlowFeatureExtractor
        }[hps['feat_learning']]

        if 'flow' in hps['feat_learning']:
            self.feature_extractor = self.feature_extractor(
                policy=self.policy, FICM_type=hps['feat_learning'],
                fix_features=hps['fix_features'])
            self.dynamics = FlowDynamics(auxiliary_task=self.feature_extractor,
                                         FICM_type=hps['feat_learning'])
        else:
            self.feature_extractor = self.feature_extractor(
                policy=self.policy, features_shared_with_policy=False,
                feat_dim=512, layernormalize=hps['layernorm'])
            self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
            self.dynamics = self.dynamics(
                auxiliary_task=self.feature_extractor,
                predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space,
            stochpol=self.policy, use_news=hps['use_news'],
            gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'], ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'],
            dynamics=self.dynamics, flow_lr=hps['flow_lr'],
            update_periods=hps['update_periods'])

        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        if 'flow' not in hps['feat_learning']:
            self.agent.to_report['feat_var'] = tf.reduce_mean(
                tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [functools.partial(self.make_env, i)
                     for i in range(self.envs_per_process)]

    def save(self, saver, save_dir, step):
        model_name = 'model.ckpt'
        checkpoint_path = os.path.join(save_dir, model_name)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver.save(tf.get_default_session(), checkpoint_path, global_step=step)
        print('The checkpoint has been created, step: {}'.format(step))

    def train(self):
        if self.save_checkpoint:
            params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            saver = tf.train.Saver(var_list=params,
                                   max_to_keep=self.num_timesteps // 1000000 + 1)
            periods = list(range(0, self.num_timesteps + 1, 1000000))
            idx = 0
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        while True:
            info = self.agent.step()
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.save_checkpoint:
                if self.agent.rollout.stats['tcount'] >= periods[idx]:
                    self.save(saver, logger.get_dir() + '/checkpoint/',
                              periods[idx])
                    idx += 1
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
        self.agent.stop_interaction()
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
            hidsize=512, feat_dim=512, ob_mean=self.ob_mean,
            ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy, features_shared_with_policy=False,
            feat_dim=512, layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space,
            stochpol=self.policy, use_news=hps['use_news'],
            gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'], ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'],
            dynamics=self.dynamics)

        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        from time import sleep
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        env.close()
        print("Waiting for 1 minute to make sure socket is closed on Linux")
        sleep(60)
        del env
        self.envs = [functools.partial(self.make_env, i)
                     for i in range(self.envs_per_process)]

    def train(self):
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        save_path = 'models'
        tf_sess = tf.get_default_session()
        # Create a saver.
        saver = tf.train.Saver(save_relative_paths=True)
        # if self.hps['restore_latest_checkpoint']:  # Restore latest checkpoint if set in arguments
        #     saver.restore(tf_sess, tf.train.latest_checkpoint(save_path))
        while True:
            info = self.agent.step()
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
                if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                    break
                # Saving the model every 1,000 updates.
                if info['update']['n_updates'] % 1000 == 0:
                    # Append the step number to the checkpoint name:
                    saver.save(tf_sess, save_path + '/obstacle_tower',
                               global_step=int(self.agent.rollout.stats['tcount']))
        # Append the step number to the last checkpoint name:
        saver.save(tf_sess, save_path + '/obstacle_tower',
                   global_step=int(self.agent.rollout.stats['tcount']))
        self.agent.stop_interaction()
class Scorer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        # self._set_env_vars()
        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = \
            random_agent_ob_mean_std(None, hps['env'], nsteps=1, load=True)
        # env = self.make_env(256, add_monitor=False, sleep_multiple=1./32)
        # self.ob_space, self.ac_space = env.observation_space, env.action_space
        # env.close()
        # del env
        self.envs = [functools.partial(self.make_env, i + 256 + 1)
                     for i in range(envs_per_process)]

        self.policy = CnnPolicy(
            scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
            hidsize=512, feat_dim=512, ob_mean=self.ob_mean,
            ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy, features_shared_with_policy=False,
            feat_dim=512, layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space,
            stochpol=self.policy, use_news=hps['use_news'],
            gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'], ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'],
            dynamics=self.dynamics, load=hps['load'],
            exp_name=hps['exp_name'])

        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def score(self):
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        from time import sleep
        sleep(2)
        episode_reward = 0
        episode_rewards = []
        total_episodes = 0
        max_level = 0
        max_levels = []
        while True:
            # info = self.agent.step()
            # self.agent.rollout.collect_rollout()
            obs, prevrews, news, infos = self.agent.rollout.env_get(0)
            if prevrews is not None:
                episode_reward += prevrews
                if prevrews == 1:
                    max_level += 1
                if news:
                    episode_rewards.append(episode_reward)
                    ave_reward = sum(episode_rewards) / len(episode_rewards)
                    total_episodes += 1
                    max_levels.append(max_level)
                    ave_level = sum(max_levels) / len(max_levels)
                    ave_level = np.around(ave_level, 2)
                    ave_reward = np.around(ave_reward, 2)
                    print('ep:', total_episodes, 'level:', max_level,
                          'ave_level:', ave_level, 'episode_reward:',
                          episode_reward, 'ave_reward', ave_reward)
                    episode_reward = 0
                    max_level = 0
                    if total_episodes >= 25:
                        break
            # acs, vpreds, nlps = self.agent.rollout.policy.get_ac_value_nlp(obs)
            # self.agent.rollout.env_step(0, acs)
            acs, vpreds, nlps = self.policy.get_ac_value_nlp(obs)
            self.agent.rollout.env_step(0, acs)
            self.agent.rollout.step_count += 1
        self.agent.stop_interaction()