Example #1
 def policy_fn(name, ob_space, ac_space):
     return MlpPolicy(
         name=name,
         ob_space=ob_space,
         ac_space=ac_space,
         hid_size=32,
         num_hid_layers=2)
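A factory like this is normally passed to one of the baselines learners, which calls it internally to build the policy graph. Below is a minimal sketch, assuming the older baselines layout these examples target; the environment id 'Pendulum-v0' and the hyperparameter values (mirroring Example #6 further down) are illustrative only.

import gym
from baselines.common import tf_util as U
from baselines.ppo1.mlp_policy import MlpPolicy
from baselines.trpo_mpi import trpo_mpi


def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     hid_size=32,
                     num_hid_layers=2)


with U.make_session(num_cpu=1):
    env = gym.make('Pendulum-v0')   # illustrative environment id
    trpo_mpi.learn(env, policy_fn,
                   timesteps_per_batch=1024, max_kl=0.01,
                   cg_iters=10, cg_damping=0.1,
                   max_timesteps=100000,
                   gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()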
Example #2
 def policy_fn(name, ob_space, ac_space):
     return MlpPolicy(name=name,
                      ob_space=env.observation_space,
                      ac_space=env.action_space,
                      hid_sizes=config["hid_sizes"],
                      gaussian_fixed_var=config["gaussian_fixed_var"],
                      use_obfilter=config["use_obfilter"])
Example #3
 def policy_fn(name, ob_space, ac_space):
     return MlpPolicy(
         name=name,
         ob_space=ob_space,
         ac_space=ac_space,
         hid_size=64,  #params["hid_size"],
         num_hid_layers=1,  #params["num_hid_layers"]
     )
Example #4
 def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):
     return MlpPolicy(name=name,
                      ob_space=ob_space,
                      ac_space=ac_space,
                      hid_size=32,
                      num_hid_layers=2,
                      sess=sess,
                      placeholders=placeholders)
Example #5
File: ppo.py  Project: sjy-syz/BaRC
def create_policy(name, problem):
    ob_space = problem.observation_space
    ac_space = problem.action_space
    return MlpPolicy(name=name,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     hid_size=64,
                     num_hid_layers=2)
Example #6
def train(env_id, num_timesteps, hidden_size, num_hidden_layers, seed, rank):
    with U.make_session(num_cpu=3) as sess:
        worker_seed = seed + 10000 * rank
        set_global_seeds(worker_seed)

        # env = bench.Monitor(env, logger.get_dir() and
        # osp.join(logger.get_dir(), str(rank)))

        try:
            env = gym.make(env_id)
            env.seed(worker_seed)

            # Rendering and saving callback
            episode = 0

            def episode_callback(locals_, globals_):
                nonlocal episode
                episode += 1
                print("----- Episode {} -----".format(episode))
                env.render()
                if episode % 20 == 0:
                    save(sess)

            # Policy function
            policy_fn = lambda name, ob_space, ac_space: MlpPolicy(
                name=name,
                ob_space=env.observation_space,
                ac_space=env.action_space,
                hid_size=hidden_size,
                num_hid_layers=num_hidden_layers
            )

            # Learning
            trpo_mpi.learn(
                env,
                policy_fn,
                timesteps_per_batch=1024,
                max_kl=0.01,
                cg_iters=10,
                cg_damping=0.1,
                max_timesteps=num_timesteps,
                gamma=0.99,
                lam=0.98,
                vf_iters=5,
                vf_stepsize=1e-3,
                callback=episode_callback)
        finally:
            env.close()
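A hedged usage sketch for this `train` function; the environment id, hyperparameters and MPI rank below are placeholders, and the `save(sess)` helper referenced by the callback is assumed to exist in the surrounding module.

if __name__ == '__main__':
    # Placeholder values; in the original project these come from the CLI / MPI launcher.
    train('Pendulum-v0', num_timesteps=1000000,
          hidden_size=64, num_hidden_layers=2, seed=0, rank=0)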
Example #7
    def policy_fn(name, ob_space, ac_space):
        """Create policy for baselines.

        Args:
            name (str): Policy name.
            ob_space (gym.spaces.Box) : Observation space.
            ac_space (gym.spaces.Box) : Action space.

        Returns:
            baselines.ppo1.mlp_policy: MLP policy for baselines.

        """
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hyper_parameters['hidden_sizes'][0],
                         num_hid_layers=len(hyper_parameters['hidden_sizes']))
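`hyper_parameters` is resolved from the enclosing scope. A hypothetical value under which this factory builds a two-layer MLP with 64 units per layer (only the first entry and the length of the list are used):

hyper_parameters = {'hidden_sizes': [64, 64]}  # hypothetical configuration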
Example #8
def load(checkpoint, env_id, hidden_size, num_hidden_layers):
    with U.single_threaded_session() as sess:
        try:
            env = gym.make(env_id)

            policy = MlpPolicy(
                name="pi",
                ob_space=env.observation_space,
                ac_space=env.action_space,
                hid_size=hidden_size,
                num_hid_layers=num_hidden_layers)

            saver = tf.train.Saver()
            saver.restore(sess, checkpoint)

            seg_gen = traj_segment_generator(
                policy, env, 1024, True, human_render=True)

            # Generate trajectory segments until stopped
            for _ in seg_gen:
                pass
        finally:
            env.close()
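A hedged usage sketch; the checkpoint path and environment id are placeholders, and the checkpoint is assumed to have been written by `tf.train.Saver` for a policy scoped under the name "pi" with a matching architecture.

load('checkpoints/pi.ckpt', 'Pendulum-v0', hidden_size=64, num_hidden_layers=2)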
Example #9
 def policy_fn(name, ob_space, ac_space):
     return MlpPolicy(name=name,
                      ob_space=env.observation_space,
                      ac_space=env.action_space,
                      hid_size=32,
                      num_hid_layers=2)
Example #10
 def __init__(self, name, ob_space, ac_space, hid_size, num_hid_layers):
     MlpPolicy.__init__(self, name, ob_space, ac_space, hid_size,
                        num_hid_layers)
Example #11
 def policy_fn(name, ob_space, ac_space):
     return MlpPolicy(name=name,
                      ob_space=ob_space,
                      ac_space=ac_space,
                      hid_size=hidden_sizes[0],
                      num_hid_layers=len(hidden_sizes))
Example #12
 def policy_fn(name, ob_space, ac_space):
     return MlpPolicy(name=name,
                      ob_space=env.observation_space,
                      ac_space=env.action_space,
                      hid_size=rl_params['pi_hidden_size'],
                      num_hid_layers=rl_params['pi_hidden_layer'])
Example #13
 def __init__(self, name, *args, session=None, **kwargs):
     MlpPolicy.__init__(self, name, *args, **kwargs)
     NetworkSaverMLP.__init__(self, network_id=name)
Example #14
    # Tail of a Matplotlib animation update function: refresh the plotted arm
    # line and the on-screen clock, then return the artists to redraw.
    line.set_data(x, y)
    time = float(i) * 0.0005
    time_text.set_text('time = %.2f' % time)
    return line, time_text


U.make_session(num_cpu=1).__enter__()

env = TwoDofArmEnv(ActiveMuscles='agonist',
                   actionParameterization=True,
                   sim_length=0.2)
pol = MlpPolicy("pi",
                env.observation_space,
                env.action_space,
                hid_size=64,
                num_hid_layers=2)
U.initialize()
U.load_state('reacher')

o = env.reset()

time = 0.
data = np.empty((1, 8))
while time < 5.0:
    print(time)
    ac, vpred = pol.act(False, o)
    o, r, d, look = env.step(ac)

    data = np.append(data, look['data'], axis=0)
Example #15
 def policy_fn(name, ob_space, ac_space):
     return MlpPolicy(name=name,
                      ob_space=env.observation_space,
                      ac_space=env.action_space,
                      hid_sizes=[32, 32])