# Module-level imports assumed by these snippets (standard OpenAI Baselines setup);
# a `callback` function is also expected to be defined elsewhere in the original module.
import os.path as osp

import gym
from mpi4py import MPI

from baselines import bench, logger
from baselines.common import set_global_seeds


def train(sess, env_id, num_timesteps, timesteps_per_actor, autoencoders, seed):
    from baselines.ppo1 import (pposgd_novelty, mlp_policy_novelty, pposgd_novelty_projection,
                                mlp_policy_mirror_novelty, pposgd_mirror_novelty,
                                pposgd_mirror_novelty_projection)

    rank = MPI.COMM_WORLD.Get_rank()

    workerseed = seed * 50 + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)

    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        # Mirror-symmetric policy: the permutations describe how to map an observation/action
        # onto its mirrored counterpart (indices 0-10 are kept in place, the 4-element blocks
        # 11-14/15-18 and 19-22/23-26 are swapped; the two action halves are swapped).
        return mlp_policy_mirror_novelty.MlpPolicyMirrorNovelty(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
            mirror_loss=True,
            observation_permutation=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                     15, 16, 17, 18,
                                     11, 12, 13, 14,
                                     23, 24, 25, 26,
                                     19, 20, 21, 22],
            action_permutation=[4, 5, 6, 7,
                                0, 1, 2, 3])

    # Attach the novelty autoencoders to the underlying (unwrapped) environment.
    env.env.novel_autoencoders = autoencoders

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), str(rank)))

    env.seed(seed + rank)

    print("AUTOENCODER LENGTH: ", len(autoencoders))
    model = pposgd_novelty.learn(env, policy_fn,
                                 max_timesteps=num_timesteps,
                                 # max_iters=30,
                                 timesteps_per_actorbatch=timesteps_per_actor,
                                 clip_param=0.2, entcoeff=0,
                                 optim_epochs=3, optim_stepsize=1e-3, optim_batchsize=64,
                                 gamma=0.99, lam=0.95,
                                 schedule='linear',
                                 callback=callback,
                                 # Following params are kwargs
                                 session=sess,
                                 gap=5,
                                 sym_loss_weight=1,
                                 )

    env.close()
    return model
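
The two permutation lists above are plain index permutations: re-indexing an observation with observation_permutation (and an action with action_permutation) yields its left/right-mirrored counterpart, which is presumably what MlpPolicyMirrorNovelty uses for its mirror_loss term. A minimal, illustrative sketch of that re-indexing, independent of the policy class (the mirror helper and the dummy vectors below are not part of the original code):

import numpy as np

observation_permutation = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                    15, 16, 17, 18, 11, 12, 13, 14,
                                    23, 24, 25, 26, 19, 20, 21, 22])
action_permutation = np.array([4, 5, 6, 7, 0, 1, 2, 3])

def mirror(vector, permutation):
    # Mirroring here is a pure re-indexing; no entries are sign-flipped.
    return vector[permutation]

obs = np.arange(27, dtype=np.float32)   # dummy 27-dim observation, sized to the permutation
act = np.arange(8, dtype=np.float32)    # dummy 8-dim action
mirrored_obs = mirror(obs, observation_permutation)
mirrored_act = mirror(act, action_permutation)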
Example #2
# The same module-level imports as in Example 1 (osp, gym, MPI, bench, logger,
# set_global_seeds) and a `callback` function are assumed here as well.
def train(sess, env_id, num_timesteps, timesteps_per_actor, autoencoders, seed):
    from baselines.ppo1 import pposgd_novelty, mlp_policy_novelty, pposgd_novelty_projection

    rank = MPI.COMM_WORLD.Get_rank()

    workerseed = seed * 50 + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)

    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        # Plain (non-mirrored) novelty policy for this variant.
        return mlp_policy_novelty.MlpPolicyNovelty(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
        )

    env.env.novel_autoencoders = autoencoders

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), str(rank)))

    env.seed(seed + rank)
    if len(autoencoders) == 0:
        # No autoencoders yet: train with the plain novelty PPO learner.
        print("NO AUTOENCODER!!")
        model = pposgd_novelty.learn(
            env,
            policy_fn,
            max_timesteps=num_timesteps,
            # max_iters=30,
            timesteps_per_actorbatch=timesteps_per_actor,
            clip_param=0.2,
            entcoeff=0,
            optim_epochs=3,
            optim_stepsize=1e-3,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
            callback=callback,
            # Following params are kwargs
            session=sess,
            gap=5,
        )
    else:
        # Autoencoders are present: train with the projection variant of the novelty learner.
        print("AUTOENCODER LENGTH: ", len(autoencoders))

        model = pposgd_novelty_projection.learn(
            env,
            policy_fn,
            max_timesteps=num_timesteps,
            # max_iters=30,
            timesteps_per_actorbatch=timesteps_per_actor,
            clip_param=0.2,
            entcoeff=0,
            optim_epochs=3,
            optim_stepsize=1e-3,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
            callback=callback,
            # Following params are kwargs
            session=sess,
            gap=5,
        )
    env.close()
    return model
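
Neither snippet shows how train is driven. A minimal, hypothetical driver under the usual Baselines/MPI conventions might look like the sketch below; the env id, timestep counts, and the empty autoencoders list are placeholders, the environment is assumed to accept a novel_autoencoders attribute, and the module-level callback noted earlier must exist:

import tensorflow as tf
from baselines import logger

if __name__ == '__main__':
    logger.configure()                 # gives bench.Monitor a log directory
    with tf.Session() as sess:         # TF1-style session, as expected by `sess`
        model = train(sess,
                      env_id='Hopper-v2',       # placeholder env id
                      num_timesteps=int(1e6),
                      timesteps_per_actor=2048,
                      autoencoders=[],          # empty list -> plain PPO branch
                      seed=0)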