Example #1
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope='pol',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            hidsize=512,
            feat_dim=512,
            ob_mean=self.ob_mean,
            ob_std=self.ob_std,
            layernormalize=False,
            nl=tf.nn.leaky_relu)

        self.feature_extractor = {"none": FeatureExtractor,
                                  "idf": InverseDynamics,
                                  "vaesph": partial(VAE, spherical_obs=True),
                                  "vaenonsph": partial(VAE, spherical_obs=False),
                                  "pix2pix": JustPixels}[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(policy=self.policy,
                                                        features_shared_with_policy=False,
                                                        feat_dim=512,
                                                        layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                      predict_from_pixels=hps['dyn_from_pixels'],
                                      feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics
        )

        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
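
The last three lines above fold the auxiliary (feature-learning) loss and the dynamics loss into the PPO objective and log the per-feature variance of the extracted features. As a rough illustration of what tf.nn.moments(features, [0, 1])[1] followed by tf.reduce_mean computes, here is a minimal NumPy sketch with hypothetical shapes (8 envs, 128 steps, 512 features):

import numpy as np

# Hypothetical feature batch: (n_envs, n_steps, feat_dim)
features = np.random.randn(8, 128, 512).astype(np.float32)

# tf.nn.moments(features, [0, 1]) returns (mean, variance) taken over the
# environment and time axes, i.e. one value per feature -> shape (512,)
feat_var_per_dim = features.var(axis=(0, 1))

# tf.reduce_mean then collapses the per-feature variances into one scalar
feat_var = feat_var_per_dim.mean()
print(feat_var_per_dim.shape, feat_var)
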
Example #2
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=torch.nn.LeakyReLU)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            # If we use a VAE, 'features_shared_with_policy' should be set to False:
            # VAE.get_features outputs features of shape feat_dim * 2 (means and stds),
            # whereas policy.get_features outputs feat_dim features;
            # only the means are exposed to the dynamics model as features.
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)

        self.agent = PpoOptimizer(scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_seg'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  dynamics=self.dynamics)
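
A recurring pattern in these examples is selecting the auxiliary task through a dict that maps the hps['feat_learning'] string to a class, with functools.partial used to pre-bind constructor arguments such as spherical_obs. A self-contained sketch of that dispatch pattern (the class below is a dummy stand-in, not the real FeatureExtractor):

from functools import partial

class DummyExtractor:
    # Hypothetical stand-in for FeatureExtractor / VAE / JustPixels
    def __init__(self, policy=None, feat_dim=512, spherical_obs=None):
        self.policy, self.feat_dim, self.spherical_obs = policy, feat_dim, spherical_obs

# partial pre-binds keyword arguments, so one dict entry can carry both the
# class and a fixed piece of its configuration
registry = {
    "none": DummyExtractor,
    "vaesph": partial(DummyExtractor, spherical_obs=True),
    "vaenonsph": partial(DummyExtractor, spherical_obs=False),
}

cls = registry["vaesph"]                      # chosen by hps['feat_learning']
extractor = cls(policy="pol", feat_dim=512)   # remaining kwargs supplied here
print(type(extractor).__name__, extractor.spherical_obs, extractor.feat_dim)
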
Example #3
File: play.py  Project: Baichenjia/VDM
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()    # initialize ob_space, ac_space, ob_mean, ob_std; initialize self.envs with the environment constructors

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = FeatureExtractor(policy=self.policy,
                                                  features_shared_with_policy=False,
                                                  feat_dim=512,
                                                  layernormalize=hps['layernorm'])

        # Instantiate the environment (dynamics) model; the feature_extractor defined above is passed in as an argument
        self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor,
                                     reward_type=hps['reward_type'])

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            nepochs_dvae=0
        )

        # Agent loss: actor, critic and entropy terms; now add the loss from feature learning as well
        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']

        # Dynamics loss: sum up all of the dynamics losses
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']

        # Variance of the features produced by the auxiliary task, shape=(512,); tf.reduce_mean below reduces it to a scalar
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
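
The comments above describe how each extra loss term is both logged under a name and added to the optimizer's total loss. A tiny TF-free sketch of that to_report / total_loss bookkeeping, using hypothetical scalar values in place of the tensors:

# Hypothetical scalar losses standing in for the TF tensors above
ppo_loss, aux_loss, dyn_loss = 0.35, 0.12, 0.08

to_report = {}
total_loss = ppo_loss

to_report['aux'] = aux_loss          # logged under a name ...
total_loss += to_report['aux']       # ... and added to the training objective
to_report['dyn_loss'] = dyn_loss
total_loss += to_report['dyn_loss']

print(to_report, round(total_loss, 2))
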
Example #4
    def create_agent(self, exp_name, hps):
        # graph = tf.Graph()
        # graph.as_default()
        agent = PpoOptimizer(
            scope=exp_name,
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            load=hps['load'],
            exp_name=exp_name,
        )

        agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        agent.total_loss += agent.to_report['aux']
        agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        agent.total_loss += agent.to_report['dyn_loss']
        agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

        # agent.graph = graph
        # tf.reset_default_graph()

        return agent
Example #5
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()  # initialize ob_space, ac_space, ob_mean, ob_std; initialize self.envs with the environment constructors

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        # Before building the environment model, initialize the feature extractor (defined in auxiliary_task.py); pix2pix amounts to not extracting features at all.
        self.feature_extractor = {
            "none": FeatureExtractor,  # 默认是none
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]  # select a feature extractor via the hps setting
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        # Instantiate the environment (dynamics) model; the feature_extractor defined above is passed in as an argument
        self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor,
                                     reward_type=hps['reward_type'],
                                     sample_seeds=hps['sample_seeds'])

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,  # the dynamics object
            nepochs_dvae=hps["nepochs_dvae"]  # number of extra epochs for training the dynamics model
        )

        # Agent loss: actor, critic and entropy terms; now add the loss from feature learning as well
        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']

        # Dynamics loss
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']

        # add bai. Record the DVAE loss separately, since the DVAE may be trained multiple times
        self.agent.dynamics_loss = self.agent.to_report['dyn_loss']

        # Variance of the features produced by the auxiliary task, shape=(512,); tf.reduce_mean below reduces it to a scalar
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        """
            This env exists only to initialize ob_space, ac_space, ob_mean and ob_std, so it is deleted once they are computed.
            self.envs_per_process envs are then set up.
        """
        env = self.make_env(0, add_monitor=False)
        # ob_space.shape=(84, 84, 4)     ac_space=Discrete(4)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        # A random agent interacts with the env to compute the observation mean and std. ob_mean.shape=(84,84,4) with values in 0-255; ob_std is a scalar (1.8 for Breakout).
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [
            functools.partial(self.make_env, i)
            for i in range(self.envs_per_process)
        ]

    def train(self, saver, logger_dir):
        # Build the computation graph and initialize the rollout object
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        previous_saved_tcount = 0
        while True:
            info = self.agent.step()  # interact with the envs for one cycle, collect samples, compute intrinsic rewards, and train
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.hps["save_period"] and (int(
                    self.agent.rollout.stats['tcount'] / self.hps["save_freq"])
                                            > previous_saved_tcount):
                previous_saved_tcount += 1
                save_path = saver.save(
                    tf.get_default_session(),
                    os.path.join(
                        logger_dir,
                        "model_" + str(previous_saved_tcount) + ".ckpt"))
                print("Periodically model saved in path:", save_path)
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                save_path = saver.save(
                    tf.get_default_session(),
                    os.path.join(logger_dir, "model_last.ckpt"))
                print("Model saved in path:", save_path)
                break

        self.agent.stop_interaction()
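
The training loop in this example (gated on hps["save_period"]) saves a checkpoint whenever the environment-step counter crosses another multiple of the save frequency, tracked with previous_saved_tcount. A minimal sketch of that crossing test with made-up numbers, no TF session or saver involved:

save_freq = 10000        # hypothetical value of hps["save_freq"]
previous_saved_tcount = 0

def should_save(tcount, previous_saved_tcount):
    # same test as above: has tcount crossed another multiple of save_freq?
    return int(tcount / save_freq) > previous_saved_tcount

for tcount in (2500, 9999, 10001, 15000, 30002):
    if should_save(tcount, previous_saved_tcount):
        previous_saved_tcount += 1
        print("save checkpoint #%d at tcount=%d" % (previous_saved_tcount, tcount))
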
Example #6
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope='pol',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            hidsize=512,
            feat_dim=512,
            ob_mean=self.ob_mean,
            ob_std=self.ob_std,
            layernormalize=False,
            nl=tf.nn.leaky_relu)

        self.feature_extractor = {"none": FeatureExtractor,
                                  "idf": InverseDynamics,
                                  "vaesph": partial(VAE, spherical_obs=True),
                                  "vaenonsph": partial(VAE, spherical_obs=False),
                                  "pix2pix": JustPixels}[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(policy=self.policy,
                                                        features_shared_with_policy=False,
                                                        feat_dim=512,
                                                        layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                      predict_from_pixels=hps['dyn_from_pixels'],
                                      feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics
        )

        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [functools.partial(self.make_env, i) for i in range(self.envs_per_process)]

    def train(self):
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
        while True:
            info = self.agent.step()
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break

        self.agent.stop_interaction()
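
_set_env_vars in these Trainer classes builds a single throwaway env just to read the observation/action spaces and the random-agent observation statistics, then stores functools.partial(self.make_env, i) thunks so each worker env is only constructed when the rollout actually starts. A minimal sketch of that deferred-construction pattern with a dummy make_env (hypothetical, not the real factory):

import functools

def make_env(rank, add_monitor=True):
    # Hypothetical stand-in for the real environment factory
    return {'rank': rank, 'monitored': add_monitor}

# one env is built eagerly only to inspect spaces / statistics, then dropped
probe = make_env(0, add_monitor=False)
del probe

# the rest are stored as zero-argument thunks and built lazily later
envs_per_process = 4
envs = [functools.partial(make_env, i) for i in range(envs_per_process)]
print([thunk() for thunk in envs])
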
Example #7
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, num_env):
        self.make_env = make_env
        self.hps = hps
        self.num_env = num_env
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(hps=hps,
                                scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "rnd": RandomNetworkDistillation,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['policy']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        if hps['policy'] == 'rnd':
            self.dynamics = RNDDyn
        elif hps['policy'] == 'pix2pix':
            self.dynamics = UNet
        else:
            self.dynamics = Dynamics

        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)

        self.agent = PpoOptimizer(hps=hps,
                                  scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  gamma_ext=hps['gamma_ext'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_env'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  dynamics=self.dynamics)

        if hps['policy'] != 'rnd':
            self.agent.to_report['aux'] = tf.reduce_mean(
                self.feature_extractor.loss)
            self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env, nsteps=10000)
        del env
        self.envs = [
            functools.partial(self.make_env, i) for i in range(self.num_env)
        ]

    def train(self, args):
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        sess = tf.get_default_session()
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        checkdir = osp.join(logger.get_dir(), 'checkpoints',
                            args['env'] + '-' + args['policy'])
        os.makedirs(checkdir, exist_ok=True)
        load_weights = args['load_weights']
        start_nupdates = 0
        if load_weights is not None:
            load_path = osp.join(checkdir, load_weights)
            start_nupdates = int(load_weights)
            print('Loading checkpoint from %s ' % load_weights)
            self.load(load_path)

        while True:
            info = self.agent.step()
            if info['update']:
                print('task = ', args['env'], ' num_env = ', args['num_env'],
                      ' policy = ', args['policy'])
                info['update']['n_updates'] += start_nupdates
                info['update']['tcount'] += start_nupdates * args[
                    'nsteps_per_env'] * args['num_env']
                logger.logkvs(info['update'])
                logger.dumpkvs()
                print('Time elapsed ' + str(
                    datetime.timedelta(seconds=info['update']['total_secs'])))
                if info['update']['n_updates'] % 10 == 0 or info['update'][
                        'n_updates'] == 1:
                    weights_index = info['update']['n_updates']
                    savepath = osp.join(checkdir, '%.5i' % weights_index)
                    print('Saving to', savepath)
                    self.save(savepath)

            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
        self.agent.stop_interaction()
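
Example #7 resumes from checkpoints whose filenames are the zero-padded update index ('%.5i'), so after loading it shifts both the logged update count and the environment-step count by the corresponding offset. A small sketch of that bookkeeping with hypothetical values for nsteps_per_env and num_env:

nsteps_per_env = 128   # hypothetical args['nsteps_per_env']
num_env = 32           # hypothetical args['num_env']

def offset_update_info(update_info, load_weights):
    # load_weights is the checkpoint filename, e.g. '00120' -> 120 updates done
    start_nupdates = int(load_weights) if load_weights is not None else 0
    update_info['n_updates'] += start_nupdates
    update_info['tcount'] += start_nupdates * nsteps_per_env * num_env
    return update_info

print(offset_update_info({'n_updates': 3, 'tcount': 3 * 128 * 32}, '00120'))
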
Example #8
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self.save_interval = hps['save_interval']
        self._set_env_vars()

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)
        # add policy to collections
        tf.add_to_collection('policy', self.policy)
        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)

        self.agent = PpoOptimizer(scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_seg'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  dynamics=self.dynamics,
                                  n_eval_steps=hps['n_eval_steps'])
        # add policy to collections
        tf.add_to_collection('agent', self.agent)

        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [
            functools.partial(self.make_env, i)
            for i in range(self.envs_per_process)
        ]

    def train(self, saver, sess, restore=False):

        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        write_meta_graph = False
        saves = 0
        loops = 0
        while True:

            info = self.agent.step(eval=False)

            if info is not None:
                if info['update'] and not restore:
                    logger.logkvs(info['update'])
                    logger.dumpkvs()

            steps = self.agent.rollout.stats['tcount']

            if loops % 10 == 0:
                filename = args.saved_model_dir + 'model.ckpt'
                saver.save(sess,
                           filename,
                           global_step=int(saves),
                           write_meta_graph=False)
                saves += 1
            loops += 1

            if steps > self.num_timesteps:
                break

        self.agent.stop_interaction()

    def test(self, saver, sess):
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        print('loading model')
        saver.restore(sess, args.saved_model_dir + args.model_name)
        print('loaded model,', args.saved_model_dir + args.model_name)

        include_images = args.include_images and eval
        info = self.agent.step(eval=True, include_images=include_images)

        if info['update']:
            logger.logkvs(info['update'])
            logger.dumpkvs()

        # save actions, news, and / or images
        np.save(args.env + '_data.npy', info)

        print('EVALUATION COMPLETED')
        print('SAVED DATA IN CURRENT DIRECTORY')
        print('FILENAME', args.env + '_data.npy')

        self.agent.stop_interaction()
Example #9
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope="pol",
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            hidsize=512,
            feat_dim=512,
            ob_mean=self.ob_mean,
            ob_std=self.ob_std,
            layernormalize=False,
            nl=tf.nn.leaky_relu,
        )

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels,
        }[hps["feat_learning"]]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps["layernorm"],
        )

        self.dynamics = Dynamics if hps["feat_learning"] != "pix2pix" else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps["dyn_from_pixels"],
            feat_dim=512,
            ama=hps["ama"],
            uncertainty_penalty=hps["uncertainty_penalty"],
            clip_ama=hps["clip_ama"],
            clip_val=hps["clip_val"],
            reward_scaling=hps["reward_scaling"],
            abs_ama=hps["abs_ama"])
        self.agent = PpoOptimizer(
            scope="ppo",
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps["use_news"],
            gamma=hps["gamma"],
            lam=hps["lambda"],
            nepochs=hps["nepochs"],
            nminibatches=hps["nminibatches"],
            lr=hps["lr"],
            cliprange=0.1,
            nsteps_per_seg=hps["nsteps_per_seg"],
            nsegs_per_env=hps["nsegs_per_env"],
            ent_coef=hps["ent_coeff"],
            normrew=hps["norm_rew"],
            normadv=hps["norm_adv"],
            ext_coeff=hps["ext_coeff"],
            int_coeff=hps["int_coeff"],
            dynamics=self.dynamics,
            args=hps,
        )

        self.agent.to_report["aux"] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report["aux"]
        self.agent.to_report["dyn_loss"] = tf.reduce_mean(
            self.dynamics.loss[0])
        self.agent.total_loss += self.agent.to_report["dyn_loss"]
        self.agent.to_report["feat_var"] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
Example #10
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope="pol",
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            hidsize=512,
            feat_dim=512,
            ob_mean=self.ob_mean,
            ob_std=self.ob_std,
            layernormalize=False,
            nl=tf.nn.leaky_relu,
        )

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels,
        }[hps["feat_learning"]]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps["layernorm"],
        )

        self.dynamics = Dynamics if hps["feat_learning"] != "pix2pix" else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps["dyn_from_pixels"],
            feat_dim=512,
            ama=hps["ama"],
            uncertainty_penalty=hps["uncertainty_penalty"],
            clip_ama=hps["clip_ama"],
            clip_val=hps["clip_val"],
            reward_scaling=hps["reward_scaling"],
            abs_ama=hps["abs_ama"])
        self.agent = PpoOptimizer(
            scope="ppo",
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps["use_news"],
            gamma=hps["gamma"],
            lam=hps["lambda"],
            nepochs=hps["nepochs"],
            nminibatches=hps["nminibatches"],
            lr=hps["lr"],
            cliprange=0.1,
            nsteps_per_seg=hps["nsteps_per_seg"],
            nsegs_per_env=hps["nsegs_per_env"],
            ent_coef=hps["ent_coeff"],
            normrew=hps["norm_rew"],
            normadv=hps["norm_adv"],
            ext_coeff=hps["ext_coeff"],
            int_coeff=hps["int_coeff"],
            dynamics=self.dynamics,
            args=hps,
        )

        self.agent.to_report["aux"] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report["aux"]
        self.agent.to_report["dyn_loss"] = tf.reduce_mean(
            self.dynamics.loss[0])
        self.agent.total_loss += self.agent.to_report["dyn_loss"]
        self.agent.to_report["feat_var"] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [
            functools.partial(self.make_env, i)
            for i in range(self.envs_per_process)
        ]

    def train(self):
        import random

        self.agent.start_interaction(self.envs,
                                     nlump=self.hps["nlumps"],
                                     dynamics=self.dynamics)
        count = 0
        while True:
            count += 1
            info = self.agent.step()
            if info["update"]:
                logger.logkvs(info["update"])
                logger.dumpkvs()
            if self.hps["feat_learning"] == "pix2pix":
                making_video = random.choice(99 * [False] + [True])
            else:
                making_video = False
            self.agent.rollout.making_video = making_video
            for a_key in info.keys():
                wandb.log(info[a_key])
            wandb.log(
                {"average_sigma": np.mean(self.agent.rollout.buf_sigmas)})
            # going to have to log it here
            if self.agent.rollout.stats["tcount"] > self.num_timesteps:
                break

        self.agent.stop_interaction()
Example #11
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=hps['feat_dim'],
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=hps['feat_dim'],
            layernormalize=hps['layernorm'])

        self.intrinsic_model = IntrinsicModel if hps[
            'feat_learning'] != 'pix2pix' else UNet
        self.intrinsic_model = self.intrinsic_model(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feature_space=hps['feature_space'],
            nsteps_per_seg=hps['nsteps_per_seg'],
            feat_dim=hps['feat_dim'],
            naudio_samples=hps['naudio_samples'],
            train_discriminator=hps['train_discriminator'],
            discriminator_weighted=hps['discriminator_weighted'],
            noise_multiplier=hps['noise_multiplier'],
            concat=hps['concat'],
            log_dir=logger.get_dir(),
            make_video=hps['checkpoint_path'] != '')

        self.agent = PpoOptimizer(scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_seg'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  feature_space=hps['feature_space'],
                                  intrinsic_model=self.intrinsic_model,
                                  log_dir=logger.get_dir(),
                                  checkpoint_path=hps['checkpoint_path'])

        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        if hps['feature_space'] == 'joint':
            self.agent.to_report['dyn_visual_loss'] = tf.reduce_mean(
                self.intrinsic_model.visual_loss)
            self.agent.to_report['dyn_audio_loss'] = tf.reduce_mean(
                self.intrinsic_model.audio_loss)
            self.agent.to_report['discrim_train_loss'] = tf.reduce_mean(
                self.intrinsic_model.discrim_train_loss)
            self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean(
                self.intrinsic_model.loss)
        elif hps['train_discriminator']:
            self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean(
                self.intrinsic_model.discrim_train_loss)
        else:
            self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean(
                self.intrinsic_model.loss)
        self.agent.total_loss += self.agent.to_report['intrinsic_model_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
Example #12
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()  # initialize ob_space, ac_space, ob_mean, ob_std; initialize self.envs with the environment constructors

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        # Before building the environment model, initialize the feature extractor (defined in auxiliary_task.py); pix2pix amounts to not extracting features at all.
        self.feature_extractor = {
            "none": FeatureExtractor,  # 默认是none
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]  # select a feature extractor via the hps setting
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        # Instantiate the environment (dynamics) model; the feature_extractor defined above is passed in as an argument
        self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor,
                                     reward_type=hps['reward_type'],
                                     sample_seeds=hps['sample_seeds'])

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,  # the dynamics object
            nepochs_dvae=hps["nepochs_dvae"]  # number of extra epochs for training the dynamics model
        )

        # Agent loss: actor, critic and entropy terms; now add the loss from feature learning as well
        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']

        # Dynamics loss
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']

        # add bai. Record the DVAE loss separately, since the DVAE may be trained multiple times
        self.agent.dynamics_loss = self.agent.to_report['dyn_loss']

        # Variance of the features produced by the auxiliary task, shape=(512,); tf.reduce_mean below reduces it to a scalar
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
Example #13
class Scorer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        # self._set_env_vars()

        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            None, hps['env'], nsteps=1, load=True)
        # env = self.make_env(256, add_monitor=False, sleep_multiple=1./32)
        # self.ob_space, self.ac_space = env.observation_space, env.action_space
        # env.close()
        # del env

        self.envs = [
            functools.partial(self.make_env, i + 256 + 1)
            for i in range(envs_per_process)
        ]

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            load=hps['load'],
            exp_name=hps['exp_name'],
        )

        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def score(self):
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        from time import sleep
        sleep(2)
        episode_reward = 0
        episode_rewards = []
        total_episodes = 0
        max_level = 0
        max_levels = []
        while True:
            # info = self.agent.step()
            # self.agent.rollout.collect_rollout()
            obs, prevrews, news, infos = self.agent.rollout.env_get(0)
            if prevrews is not None:
                episode_reward += prevrews
                if prevrews == 1:
                    max_level += 1
                if news:
                    episode_rewards.append(episode_reward)
                    ave_reward = sum(episode_rewards) / len(episode_rewards)
                    total_episodes += 1
                    max_levels.append(max_level)
                    ave_level = sum(max_levels) / len(max_levels)
                    ave_level = np.around(ave_level, 2)
                    ave_reward = np.around(ave_reward, 2)
                    print('ep:', total_episodes, 'level:', max_level,
                          'ave_level:', ave_level, 'episode_reward:',
                          episode_reward, 'ave_reward', ave_reward)
                    episode_reward = 0
                    max_level = 0
                    if total_episodes >= 25:
                        break
            # acs, vpreds, nlps = self.agent.rollout.policy.get_ac_value_nlp(obs)
            # self.agent.rollout.env_step(0, acs)
            acs, vpreds, nlps = self.policy.get_ac_value_nlp(obs)
            self.agent.rollout.env_step(0, acs)
            self.agent.rollout.step_count += 1

        self.agent.stop_interaction()
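
The score loop above accumulates per-episode reward and a "level" counter (incremented on each reward of 1), then appends them to running lists when an episode ends and reports the running averages. A minimal sketch of that bookkeeping on a synthetic stream of (reward, done) pairs, with no environment attached:

import numpy as np

episode_reward, max_level = 0, 0
episode_rewards, max_levels = [], []

# hypothetical (prevrew, new) pairs as they would come back from env_get
for prevrew, new in [(0, False), (1, False), (0, False), (1, True), (0, True)]:
    episode_reward += prevrew
    if prevrew == 1:
        max_level += 1          # each unit reward counts as one level cleared
    if new:                     # episode finished: push stats, reset counters
        episode_rewards.append(episode_reward)
        max_levels.append(max_level)
        ave_reward = np.around(sum(episode_rewards) / len(episode_rewards), 2)
        ave_level = np.around(sum(max_levels) / len(max_levels), 2)
        print('ep:', len(episode_rewards), 'level:', max_level,
              'ave_level:', ave_level, 'episode_reward:', episode_reward,
              'ave_reward', ave_reward)
        episode_reward, max_level = 0, 0
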
Example #14
    def __init__(self, make_env, hps, num_timesteps, envs_per_process, exp_name=None, env_name=None, policy=None, feat_ext=None, dyn=None, agent_num=None, restore_name=None):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.depth_pred = hps['depth_pred']
        self.aux_input = hps['aux_input']
        self.num_timesteps = num_timesteps
        self._set_env_vars()
        if exp_name:
            self.exp_name = exp_name
        else:
            self.exp_name = hps['exp_name']
        if env_name:
            self.env_name = env_name
        else:
            self.env_name = hps['env']
        if policy is None:
            if hps['lstm']:
                self.policy = LSTMPolicy(
                    scope='pol',
                    ob_space=self.ob_space,
                    ac_space=self.ac_space,
                    hidsize=512,
                    batchsize=hps['envs_per_process'],
                    feat_dim=512,
                    ob_mean=self.ob_mean,
                    ob_std=self.ob_std,
                    lstm1_size=hps['lstm1_size'],
                    lstm2_size=hps['lstm2_size'],
                    layernormalize=False,
                    nl=tf.nn.leaky_relu,
                    depth_pred=hps['depth_pred'],
                    aux_input=hps['aux_input'],
                )

            else:
                self.policy = CnnPolicy(
                    scope='pol',
                    ob_space=self.ob_space,
                    ac_space=self.ac_space,
                    hidsize=512,
                    feat_dim=512,
                    ob_mean=self.ob_mean,
                    ob_std=self.ob_std,
                    layernormalize=False,
                    nl=tf.nn.leaky_relu
                )
        else:
            self.policy = policy
            self.policy.restore()

        if feat_ext:
            self.feature_extractor = feat_ext
            self.feature_extractor.restore()
        else:

            self.feature_extractor = {"none": FeatureExtractor,
                                      "idf": InverseDynamics,
                                      "vaesph": partial(VAE, spherical_obs=True),
                                      "vaenonsph": partial(VAE, spherical_obs=False),
                                      "pix2pix": JustPixels}[hps['feat_learning']]

            self.feature_extractor = self.feature_extractor(policy=self.policy,
                                                            features_shared_with_policy=hps['feat_share'],
                                                            feat_dim=hps['dyn_feat_dim'],
                                                            layernormalize=hps['layernorm'])
        if dyn:
            self.dynamics = dyn
            self.dynamics.restore()
        else:

            self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
            self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                          predict_from_pixels=hps['dyn_from_pixels'],
                                          feat_dim=hps['dyn_feat_dim'])
        self.agent = PpoOptimizer(
            hps=hps,
            scope='ppo',
            ob_space=self.ob_space,
            env_ob_space=self.env_ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            exp_name=self.exp_name,
            env_name=self.env_name,
            video_log_freq=hps['video_log_freq'],
            model_save_freq=hps['model_save_freq'],
            use_apples=hps['use_apples'],
            agent_num=agent_num,
            restore_name=restore_name,
            multi_envs=hps['multi_train_envs'],
            lstm=hps['lstm'],
            lstm1_size=hps['lstm1_size'],
            lstm2_size=hps['lstm2_size'],
            depth_pred=hps['depth_pred'],
            aux_input=hps['aux_input'],
            beta_d=hps['beta'],
            early_stop=hps['early_stop'],
            optim=hps['optim'],
            decay=hps['decay'],
            grad_clip=hps['grad_clip'],
            log_grads=hps['log_grads'],
            logdir=hps['logdir']
        )
        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0,1])[1])
        if hps['curiosity']:
            #self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
            self.agent.total_loss += hps['aux_loss_coeff']*self.agent.to_report['aux']
            #self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
            self.agent.total_loss += hps['dyn_loss_coeff']*self.agent.to_report['dyn_loss']
Example #15
class Trainer(object):
    from baselines import logger
    def __init__(self, make_env, hps, num_timesteps, envs_per_process, exp_name=None, env_name=None, policy=None, feat_ext=None, dyn=None, agent_num=None, restore_name=None):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.depth_pred = hps['depth_pred']
        self.aux_input = hps['aux_input']
        self.num_timesteps = num_timesteps
        self._set_env_vars()
        if exp_name:
            self.exp_name = exp_name
        else:
            self.exp_name = hps['exp_name']
        if env_name:
            self.env_name = env_name
        else:
            self.env_name = hps['env']
        if policy is None:
            if hps['lstm']:
                self.policy = LSTMPolicy(
                    scope='pol',
                    ob_space=self.ob_space,
                    ac_space=self.ac_space,
                    hidsize=512,
                    batchsize=hps['envs_per_process'],
                    feat_dim=512,
                    ob_mean=self.ob_mean,
                    ob_std=self.ob_std,
                    lstm1_size=hps['lstm1_size'],
                    lstm2_size=hps['lstm2_size'],
                    layernormalize=False,
                    nl=tf.nn.leaky_relu,
                    depth_pred=hps['depth_pred'],
                    aux_input=hps['aux_input'],
                )

            else:
                self.policy = CnnPolicy(
                    scope='pol',
                    ob_space=self.ob_space,
                    ac_space=self.ac_space,
                    hidsize=512,
                    feat_dim=512,
                    ob_mean=self.ob_mean,
                    ob_std=self.ob_std,
                    layernormalize=False,
                    nl=tf.nn.leaky_relu
                )
        else:
            self.policy = policy
            self.policy.restore()

        if feat_ext:
            self.feature_extractor = feat_ext
            self.feature_extractor.restore()
        else:

            self.feature_extractor = {"none": FeatureExtractor,
                                      "idf": InverseDynamics,
                                      "vaesph": partial(VAE, spherical_obs=True),
                                      "vaenonsph": partial(VAE, spherical_obs=False),
                                      "pix2pix": JustPixels}[hps['feat_learning']]

            self.feature_extractor = self.feature_extractor(policy=self.policy,
                                                            features_shared_with_policy=hps['feat_share'],
                                                            feat_dim=hps['dyn_feat_dim'],
                                                            layernormalize=hps['layernorm'])
        if dyn:
            self.dynamics = dyn
            self.dynamics.restore()
        else:

            self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
            self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                          predict_from_pixels=hps['dyn_from_pixels'],
                                          feat_dim=hps['dyn_feat_dim'])
        self.agent = PpoOptimizer(
            hps=hps,
            scope='ppo',
            ob_space=self.ob_space,
            env_ob_space=self.env_ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            exp_name=self.exp_name,
            env_name=self.env_name,
            video_log_freq=hps['video_log_freq'],
            model_save_freq=hps['model_save_freq'],
            use_apples=hps['use_apples'],
            agent_num=agent_num,
            restore_name=restore_name,
            multi_envs=hps['multi_train_envs'],
            lstm=hps['lstm'],
            lstm1_size=hps['lstm1_size'],
            lstm2_size=hps['lstm2_size'],
            depth_pred=hps['depth_pred'],
            aux_input=hps['aux_input'],
            beta_d=hps['beta'],
            early_stop=hps['early_stop'],
            optim=hps['optim'],
            decay=hps['decay'],
            grad_clip=hps['grad_clip'],
            log_grads=hps['log_grads'],
            logdir=hps['logdir']
        )
        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
        if hps['curiosity']:
            #self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
            self.agent.total_loss += hps['aux_loss_coeff']*self.agent.to_report['aux']
            #self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
            self.agent.total_loss += hps['dyn_loss_coeff']*self.agent.to_report['dyn_loss']
            #self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        import numpy as np
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.env_ob_space = env.observation_space
        if self.depth_pred:
            self.ob_space = gym.spaces.Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env, depth_pred=self.hps['depth_pred'])
        del env
        self.envs = [functools.partial(self.make_env, i) for i in range(self.envs_per_process)]

    def train(self, saver, sess, restore=False):
        from baselines import logger
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
        if restore:
            print("Restoring model for training")
            saver.restore(sess, "models/" + self.hps['restore_model'] + ".ckpt")
            print("Loaded model", self.hps['restore_model'])
        write_meta_graph = False
        while True:
            info = self.agent.step()
            if info['update']:
                if info['update']['recent_best_ext_ret'] is None:
                    info['update']['recent_best_ext_ret'] = 0
                wandb.log(info['update'])
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
        if self.hps['tune_env']:
            filename = "models/" + self.hps['restore_model'] + "_tune_on_" + self.hps['tune_env'] + "_final.ckpt"
        else:
            filename = "models/" + self.hps['exp_name'] + "_final.ckpt"
        saver.save(sess, filename, write_meta_graph=False)
        self.policy.save_model(self.hps['exp_name'], 'final')
        self.agent.stop_interaction()
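
What follows is a sketch of how this Trainer variant might be driven end to end. The hyperparameter keys are the ones read by __init__ and train() above; their values, the make_atari_env factory, and the session/saver setup are illustrative assumptions. PpoOptimizer may read further keys from the hps dict it receives, and the wandb run that train() logs to is assumed to be initialized elsewhere.

import tensorflow as tf  # TF 1.x, matching the examples above

# Illustrative hyperparameter values; only the key names come from the code above.
hps = {
    'exp_name': 'curiosity_run', 'env': 'BreakoutNoFrameskip-v4',
    'envs_per_process': 8, 'lstm': False, 'lstm1_size': 512, 'lstm2_size': 256,
    'depth_pred': False, 'aux_input': False,
    'feat_learning': 'none', 'feat_share': False, 'dyn_feat_dim': 512,
    'layernorm': False, 'dyn_from_pixels': False,
    'use_news': False, 'gamma': 0.99, 'lambda': 0.95,
    'nepochs': 3, 'nminibatches': 8, 'lr': 1e-4,
    'nsteps_per_seg': 128, 'nsegs_per_env': 1, 'ent_coeff': 0.001,
    'norm_rew': True, 'norm_adv': True, 'ext_coeff': 1.0, 'int_coeff': 1.0,
    'video_log_freq': 0, 'model_save_freq': 0, 'use_apples': False,
    'multi_train_envs': None, 'beta': 0.1, 'early_stop': False,
    'optim': 'adam', 'decay': 0.0, 'grad_clip': 0.0, 'log_grads': False,
    'logdir': 'logs', 'curiosity': True,
    'aux_loss_coeff': 1.0, 'dyn_loss_coeff': 1.0,
    'nlumps': 1, 'restore_model': '', 'tune_env': None,
}

# make_atari_env(rank, add_monitor=True) is an assumed environment factory with
# the same call signature _set_env_vars uses for make_env above.
trainer = Trainer(make_env=make_atari_env, hps=hps,
                  num_timesteps=int(1e7),
                  envs_per_process=hps['envs_per_process'])

with tf.Session() as sess:
    # Initializing here is an assumption; some variants initialize variables
    # inside start_interaction instead.
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    trainer.train(saver, sess, restore=False)
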
Example #16
File: run.py Project: ijcai-261/ijcai-261
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self.save_checkpoint = hps['save_checkpoint']
        self._set_env_vars()

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels,
            "flowS": OpticalFlowFeatureExtractor,
            "flowC": OpticalFlowFeatureExtractor
        }[hps['feat_learning']]

        if 'flow' in hps['feat_learning']:
            self.feature_extractor = self.feature_extractor(
                policy=self.policy,
                FICM_type=hps['feat_learning'],
                fix_features=hps['fix_features'])

            self.dynamics = FlowDynamics(auxiliary_task=self.feature_extractor,
                                         FICM_type=hps['feat_learning'])

        else:
            self.feature_extractor = self.feature_extractor(
                policy=self.policy,
                features_shared_with_policy=False,
                feat_dim=512,
                layernormalize=hps['layernorm'])

            self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet

            self.dynamics = self.dynamics(
                auxiliary_task=self.feature_extractor,
                predict_from_pixels=hps['dyn_from_pixels'],
                feat_dim=512)

        self.agent = PpoOptimizer(scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_seg'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  dynamics=self.dynamics,
                                  flow_lr=hps['flow_lr'],
                                  update_periods=hps['update_periods'])

        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']

        if 'flow' not in hps['feat_learning']:
            self.agent.to_report['feat_var'] = tf.reduce_mean(
                tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [
            functools.partial(self.make_env, i)
            for i in range(self.envs_per_process)
        ]

    def save(self, saver, save_dir, step):
        model_name = 'model.ckpt'
        checkpoint_path = os.path.join(save_dir, model_name)

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        saver.save(tf.get_default_session(), checkpoint_path, global_step=step)
        print('The checkpoint has been created, step: {}'.format(step))

    def train(self):
        if self.save_checkpoint:
            params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            saver = tf.train.Saver(var_list=params,
                                   max_to_keep=self.num_timesteps // 1000000 + 1)

            periods = list(range(0, self.num_timesteps + 1, 1000000))
            idx = 0

        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        while True:
            info = self.agent.step()

            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()

            if self.save_checkpoint:
                if self.agent.rollout.stats['tcount'] >= periods[idx]:
                    self.save(saver,
                              logger.get_dir() + '/checkpoint/', periods[idx])
                    idx += 1

            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break

        self.agent.stop_interaction()
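
Checkpointing in this variant is paced by a precomputed list of one-million-timestep boundaries: the saver fires the first time the rollout's tcount reaches each boundary. Below is a small, framework-free sketch of that bookkeeping with hypothetical counter values; unlike the loop above it also bounds-checks the index, which avoids overrunning the last boundary when tcount lands exactly on num_timesteps before the loop exits.

# Framework-free sketch of the periodic-checkpoint bookkeeping used above.
num_timesteps = 5_000_000
periods = list(range(0, num_timesteps + 1, 1_000_000))  # [0, 1M, 2M, ..., 5M]
idx = 0

def maybe_checkpoint(tcount):
    """Return True (and advance to the next boundary) the first time tcount
    reaches or passes periods[idx], mirroring the saver trigger in train()."""
    global idx
    if idx < len(periods) and tcount >= periods[idx]:
        idx += 1
        return True
    return False

# Illustrative trace: checkpoints fire at tcount 0, ~1M and ~2M.
for tcount in [0, 400_000, 1_050_000, 1_900_000, 2_200_000]:
    if maybe_checkpoint(tcount):
        print('checkpoint at tcount =', tcount)
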
Example #17
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)

        self.agent = PpoOptimizer(scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_seg'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  dynamics=self.dynamics)

        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        from time import sleep

        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        env.close()
        print("Waiting for 1 minute to make sure socket is closed on Linux")
        sleep(60)
        del env
        self.envs = [
            functools.partial(self.make_env, i)
            for i in range(self.envs_per_process)
        ]

    def train(self):
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        save_path = 'models'
        tf_sess = tf.get_default_session()
        # Create a saver.
        saver = tf.train.Saver(save_relative_paths=True)
        # Optionally restore the latest checkpoint if requested, e.g.:
        # if self.hps['restore_latest_checkpoint']:
        #     saver.restore(tf_sess, tf.train.latest_checkpoint(save_path))
        while True:
            info = self.agent.step()
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
            # Save a checkpoint every 1,000 policy updates.
            if info['n_updates'] % 1000 == 0:
                # Append the step number to the checkpoint name:
                saver.save(tf_sess,
                           save_path + '/obstacle_tower',
                           global_step=int(self.agent.rollout.stats['tcount']))

        # Append the step number to the last checkpoint name:
        saver.save(tf_sess,
                   save_path + '/obstacle_tower',
                   global_step=int(self.agent.rollout.stats['tcount']))
        self.agent.stop_interaction()
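
The commented-out lines in train() above hint at resuming from the most recent checkpoint under models/. A minimal TF 1.x sketch of that resume path follows; it assumes the Trainer's graph has already been built and a default session is active, and it leaves out the restore_latest_checkpoint flag mentioned in the original comment, which is not defined anywhere in this example.

import tensorflow as tf  # TF 1.x, matching the example above

save_path = 'models'
saver = tf.train.Saver(save_relative_paths=True)
sess = tf.get_default_session()

# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint files, so a fresh run simply initializes its variables.
latest = tf.train.latest_checkpoint(save_path)
if latest is not None:
    saver.restore(sess, latest)
    print('Restored checkpoint:', latest)
else:
    sess.run(tf.global_variables_initializer())
    print('No checkpoint found in', save_path, '- training from scratch')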