Example No. 1
 def policy_fn(name, ob_space, ac_space):
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 hid_size=64,
                                 num_hid_layers=3,
                                 gmm_comp=1)
Example No. 2
 def policy_fn(name, ob_space, ac_space, noisy_nets=False):
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 hid_size=hid_size,
                                 num_hid_layers=num_hid_layers,
                                 noisy_nets=noisy_nets)
Example No. 3
 def policy_fn(name, ob_space, ac_space):
     #return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 hid_size=64,
                                 num_hid_layers=2)
Example No. 4
 def policy_fn(name, ob_space, ac_space):
     # TODO Ensure that multiple-layers implementation is really solid
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 hid_size=128,
                                 num_hid_layers=2)
Example No. 5
def policy_fn(name, ob_space, ac_space):
    from baselines.ppo1 import mlp_policy
    return mlp_policy.MlpPolicy(name=name,
                                ob_space=ob_space,
                                ac_space=ac_space,
                                hid_size=64,
                                num_hid_layers=2)
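Factories like the one above are not called directly; the PPO1/TRPO trainers in OpenAI Baselines call them to build the "pi" and "oldpi" networks. Below is a minimal sketch of wiring such a policy_fn into baselines.ppo1.pposgd_simple.learn; the environment id and all hyperparameter values are illustrative assumptions, not settings taken from the original projects.

# Minimal usage sketch: training with a policy_fn like the one above via PPO1.
# The environment id and hyperparameter values here are illustrative only.
import gym
import baselines.common.tf_util as U
from baselines.ppo1 import mlp_policy, pposgd_simple

def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2)

def train(env_id='Hopper-v2', num_timesteps=1000000, seed=0):
    with U.make_session(num_cpu=1):
        env = gym.make(env_id)
        env.seed(seed)
        pposgd_simple.learn(env, policy_fn,
                            max_timesteps=num_timesteps,
                            timesteps_per_actorbatch=2048,
                            clip_param=0.2, entcoeff=0.0,
                            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                            gamma=0.99, lam=0.95, schedule='linear')
        env.close()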
Example No. 6
 def policy_fn(name, ob_space, ac_space):
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 hid_size=hid_size,
                                 activation=activation,
                                 interpolate=interpolate)
Example No. 7
def test(env_id, num_episodes, model_path, seed):
    from baselines.ppo1 import mlp_policy
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    pi = mlp_policy.MlpPolicy(name='pi',
                              ob_space=env.observation_space,
                              ac_space=env.action_space,
                              hid_size=64,
                              num_hid_layers=2)
    pi_vars = pi.get_variables()
    for v in pi_vars:
        print(v.name)

    saveFromFlat(pi.get_variables(), model_path)
    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    ep_rews = []
    ob = env.reset()
    for _ in tqdm(range(num_episodes)):
        ep_rew = 0
        new = False
        while not new:
            env.render()
            ac, vpred = pi.act(stochastic=False, ob=ob)
            ob, rew, new, _ = env.step(ac)
            ep_rew += rew
        ob = env.reset()
        ep_rews.append(ep_rew)
    print("----------- Summary ------------")
    print("episode mean %.3f" % np.mean(ep_rews))

    env.close()
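A hypothetical call of the test helper above; the environment id, episode count, model path, and seed are placeholders rather than values from the original project.

# Placeholder invocation of test() from Example No. 7.
test(env_id='Hopper-v2',
     num_episodes=10,
     model_path='path/to/policy_params',
     seed=0)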
Example No. 8
 def policy_fn(name, ob_space, ac_space):
     return mlp_policy.MlpPolicy(
         name=name,
         ob_space=ob_space,
         ac_space=ac_space,
         hid_size=pi_hid_size,
         num_hid_layers=pi_num_hid_layers)
Example No. 9
 def policy_fn(name, ob_space, ac_space):
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 policy_hid_size=policy_hid_size,
                                 vf_hid_size=vf_hid_size,
                                 activation_policy=activation_policy,
                                 activation_vf=activation_vf)
Example No. 10
 def policy_fn(name, ob_space, ac_space):
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 layers_val=layers_val,
                                 layers_pol=layers_pol,
                                 gaussian_fixed_var=False,
                                 dist=distribution)
Example No. 11
 def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
     return mlp_policy.MlpPolicy(
         name=name,
         ob_space=ob_space,
         ac_space=ac_space,
         hid_size=64,
         num_hid_layers=3,
     )
Example No. 12
 def policy_fn(name, ob_space, ac_space):
     # mlp: Multi-Layer Perceptron
     # state -> (num_hid_layers) fully-connected layers with (hid_size) units -> (action, predicted value)
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 hid_size=64,
                                 num_hid_layers=2)
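The comment in the example above sums up what hid_size and num_hid_layers control inside MlpPolicy. The sketch below shows the rough structure (two tanh towers over the observation, a scalar value head, and a diagonal-Gaussian action head); the layer names and the omitted details (observation normalization, initializers) are assumptions for illustration, not a copy of the Baselines source.

# Rough structural sketch of an MlpPolicy-style network (TF1-style layers).
# Layer names and details are assumptions for illustration only.
import tensorflow as tf

def mlp_policy_sketch(ob, ac_dim, hid_size=64, num_hid_layers=2):
    # value-function tower: num_hid_layers tanh layers -> scalar V(s)
    last = ob
    for i in range(num_hid_layers):
        last = tf.nn.tanh(tf.layers.dense(last, hid_size, name='vffc%i' % (i + 1)))
    vpred = tf.layers.dense(last, 1, name='vffinal')[:, 0]

    # policy tower: num_hid_layers tanh layers -> Gaussian mean, plus a learned log-std
    last = ob
    for i in range(num_hid_layers):
        last = tf.nn.tanh(tf.layers.dense(last, hid_size, name='polfc%i' % (i + 1)))
    mean = tf.layers.dense(last, ac_dim, name='polfinal')
    logstd = tf.get_variable('logstd', shape=[1, ac_dim],
                             initializer=tf.zeros_initializer())
    return mean, logstd, vpred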
Example No. 13
 def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 hid_size=64,
                                 num_hid_layers=2,
                                 sess=sess,
                                 placeholders=placeholders)
Example No. 14
 def policy_fn(name, ob_space, ac_space):
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 hid_size=64,
                                 num_hid_layers=2,
                                 num_options=num_options,
                                 dc=dc)
Example No. 15
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    ## MAIN CHANGES
                                    hid_size_V=vf_hid_size,
                                    hid_size_actor=64,
                                    num_hid_layers=2,
                                    V_keep_prob=V_keep_prob,
                                    mc_samples=mc_samples,
                                    layer_norm=False,
                                    activation_critic=activation_vf,
                                    activation_actor=tf.nn.relu,
                                    dropout_on_V=dropout_on_V,
                                    sample_dropout=sample_dropout)
Example No. 16
 def policy_fn(name, ob_space, ac_space):
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 hid_size=64,
                                 num_hid_layers=2,
                                 num_options=args.num_options,
                                 dc=args.dc,
                                 head=args.model_type)
Example No. 17
def test_conversion(config_dict):
    expert_policy_config = config_dict.model.expert_policy
    name = '{0}__{1}'.format(config_dict.env.name, expert_policy_config.name)
    model_file_tf = os.path.join(expert_policy_config.save_dir,
                                 '{0}.ckpt'.format(name))
    model_file_th = os.path.join(expert_policy_config.save_dir,
                                 '{0}.th.pt'.format(name))

    env = make_env(config_dict.env.name, config_dict.general.seed)

    pi_tf = mlp_policy.MlpPolicy(
        name='pi',
        ob_space=env.observation_space,
        ac_space=env.action_space,
        hid_size=expert_policy_config.hidden_size,
        num_hid_layers=expert_policy_config.num_layers)
    observations_tf = []
    with U.make_session(num_cpu=expert_policy_config.num_cpu) as sess:
        # Load TF model
        saver = tf.train.Saver(pi_tf.get_variables())
        saver.restore(tf.get_default_session(), model_file_tf)
        # Sample trajectory
        # env.seed(config_dict.general.seed)
        observation, done = env.reset(), False
        observations_tf.append(observation)
        while not done:
            action = pi_tf.act(stochastic=False, ob=observation)[0]
            observation, _, done, _ = env.step(action)
            observations_tf.append(observation)

    pi_th = NormalMLPPolicy(int(np.prod(env.observation_space.shape)),
                            int(np.prod(env.action_space.shape)),
                            expert_policy_config.hidden_size,
                            expert_policy_config.num_layers,
                            nonlinearity=nn.Tanh)
    observations_th = []
    # Load Pytorch model
    with open(model_file_th, 'rb') as f:
        state_dict = torch.load(f)
        pi_th.load_state_dict(state_dict)
    # Sample trajectory
    env.seed(config_dict.general.seed)
    observation, done = env.reset(), False
    observations_th.append(observation)
    while not done:
        observation_tensor = torch.from_numpy(observation).unsqueeze(0).float()
        action_tensor = pi_th(observation_tensor).mean[0]
        action = action_tensor.detach().cpu().numpy()
        observation, _, done, _ = env.step(action)
        observations_th.append(observation)

    # Compare the trajectories
    linf_norm = np.max(
        np.abs(np.asarray(observations_tf) - np.asarray(observations_th)))
    print('Maximum absolute difference between observations: {0}'.format(
        linf_norm))
Example No. 18
 def policy_fn(name, ob_space, ac_space):
     print("Policy with name: ", name)
     policy = mlp_policy.MlpPolicy(name=name,
                                   ob_space=ob_space,
                                   ac_space=ac_space,
                                   hidden_dimension_list=hidden_dimensions)
     saver = tf.train.Saver()
     if initial_params_path is not None:
         saver.restore(sess, initial_params_path)
     return policy
Example No. 19
    def __init__(self, param_path, obs_space, action_space, hid_size,
                 num_hid_layers):
        self.action_space = action_space

        self.actor = mlp_policy.MlpPolicy("pi", obs_space, action_space,
                                          hid_size=hid_size,
                                          num_hid_layers=num_hid_layers)
        U.initialize()
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), param_path)
Example No. 20
 def __init__(self, env, sess, restore, batch=TRAINING_BATCH_SIZE):
     self.pi = mlp_policy.MlpPolicy(name='pi',
                                    ob_space=env.observation_space,
                                    ac_space=env.action_space,
                                    hid_size=64,
                                    num_hid_layers=2,
                                    training_batch_size=batch)
     self.saver = tf.train.Saver(var_list=tf.get_collection(
         tf.GraphKeys.GLOBAL_VARIABLES, scope='pi'))
     if restore:
         self.saver.restore(sess, "{0}/teacher.ckpt".format(base_path))
Example No. 21
 def policy_fn(name, ob_space, ac_space):
     return mlp_policy.MlpPolicy(
         name=name,
         ob_space=ob_space,
         ac_space=ac_space,
         hid_sizes=config['hidden_layers'],
         num_hid_layers=len(config['hidden_layers']),
         gaussian_fixed_var=True,
         init_pol_weight_stddev=config['init_pol_weight_stddev'],
         init_val_weight_stddev=config['init_val_weight_stddev'],
         init_logstd=config['init_logstd'])
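Example No. 21 reads every size and initialization scale from a config dictionary. Below is a hypothetical config showing the keys the snippet expects; the values are made up for illustration.

# Hypothetical config for the policy_fn in Example No. 21 (values are illustrative).
config = {
    'hidden_layers': [64, 64],        # one entry per hidden layer (hid_sizes)
    'init_pol_weight_stddev': 0.01,   # policy weight init scale
    'init_val_weight_stddev': 1.0,    # value-function weight init scale
    'init_logstd': 0.0,               # initial log-std of the Gaussian policy
}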
Example No. 22
def policy_fn(name, ob_space, ac_space):
    if state_self_standardize:
        return mlp_norms_policy.MlpNormsPolicy(name=name,
                                               ob_space=ob_space,
                                               ac_space=ac_space,
                                               hid_size=hsize,
                                               num_hid_layers=layers,
                                               gmm_comp=1)
    else:
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=hsize,
                                    num_hid_layers=layers,
                                    gmm_comp=1)
Example No. 23
 def __init__(self, env, sess, restore, klts):
     self.pi = mlp_policy.MlpPolicy(
         name="s_pi_{0}".format("klts" if klts else "klst"),
         ob_space=env.observation_space,
         ac_space=env.action_space,
         hid_size=64,
         num_hid_layers=2,
         training_batch_size=TRAINING_BATCH_SIZE,
         gaussian_fixed_var=False)
     self.saver = tf.train.Saver(var_list=tf.get_collection(
         tf.GraphKeys.GLOBAL_VARIABLES,
         scope="s_pi_{0}".format("klts" if klts else "klst")))
     if restore:
         self.saver.restore(
             sess,
             "{0}/student_{1}.ckpt".format(base_path,
                                           "klts" if klts else "klst"))
Example No. 24
def load_episodes(env_id, seed, model_files):
    with tf.device('/cpu'):
        sess = U.make_session(num_cpu=1)
        sess.__enter__()

        env = gym.make(env_id)
        env.seed(seed)
        # TODO set max episode length
        env._max_episode_steps = EPISODE_MAX_LENGTH

        gym.logger.setLevel(logging.WARN)

        policy_fn = lambda name, ob_space, ac_space: mlp_policy.MlpPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=2)

        pi = policy_fn('pi', env.observation_space, env.action_space)

        render = RENDER
        from time import ctime
        start_time = ctime()

        for model_file in tqdm(model_files):

            # TODO adjust velocity
            env.unwrapped.metadata['target_v'] = 0.1

            time_step = int(model_file[-9:])

            observations, cum_reward, distance, cum_rew_p = run_environment_episode(
                env,
                pi,
                seed,
                model_file,
                env._max_episode_steps,
                render=render,
                stochastic=False)

            save_full_episodes(observations, time_step, distance, cum_reward,
                               cum_rew_p)

        print(start_time)
        print(ctime())
Example No. 25
 def policy_fn(name, ob_space, ac_space):
     if policy == "sigmoid":
         return sigmoid_policy.SigmoidPolicy(name=name,
                                             ob_space=ob_space,
                                             ac_space=ac_space,
                                             hid_size=32,
                                             num_hid_layers=3)
     elif policy == "mlp":
         return mlp_policy.MlpPolicy(name=name,
                                     ob_space=ob_space,
                                     ac_space=ac_space,
                                     hid_size=32,
                                     num_hid_layers=3)
     elif policy == "beta":
         return beta_policy.BetaPolicy(name=name,
                                       ob_space=ob_space,
                                       ac_space=ac_space,
                                       hid_size=32,
                                       num_hid_layers=3)
Example No. 26
    all_episodes_rew_pc = []

    for item in paths_with_var_scops:

        print(list(item.values()))

        var_scope, path = list(item.values())[0]

        with tf.variable_scope(str(var_scope)):

            sess = U.make_session(num_cpu=1)
            sess.__enter__()

            policy_fn = lambda name, ob_space, ac_space: mlp_policy.MlpPolicy(
                name=name,
                ob_space=ob_space,
                ac_space=ac_space,
                hid_size=64,
                num_hid_layers=2)

            pi = policy_fn('pi', env.observation_space, env.action_space)

            gym.logger.setLevel(logging.WARN)

            model_file = get_latest_model_file(path)

            distance_rew = 0

            rew_p = 0

            for s in range(configs["runs_per_model"]):
                single_episode_distance, single_episode_rew_p = run_environment_episode(
Example No. 27
 def policy_fn(name, ob_space, ac_space, params=params):
     return mlp_policy.MlpPolicy(name=name,
                                 ob_space=ob_space,
                                 ac_space=ac_space,
                                 hid_size=int(params[1]),
                                 num_hid_layers=int(params[2]))
Example No. 28
 def policy_fn(name, ob_space, ac_space):
     return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
         hid_size=64, num_hid_layers=2, gaussian_fixed_var=True)
Example No. 29
 def policy_fn(name, ob_space, ac_space, reuse=False):
     return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
         reuse=reuse, hid_size=64, num_hid_layers=2)
Example No. 30
def main(logdir, checkpoint, human_render, num_rollouts, max_episode_length,
         save_videos, save_rollouts, save_separate_rollouts):
    if not osp.exists(osp.join(logdir, 'run.json')):
        raise FileNotFoundError("Could not find run.json.")

    configuration = json.load(open(osp.join(logdir, 'run.json'), 'r'))
    if configuration["settings"]["method"] not in ["trpo", "ppo"]:
        raise NotImplementedError(
            "Playback for %s has not been implemented yet." %
            configuration["method"])

    env = utils.create_environment(configuration["settings"]["environment"])

    # build policy network
    # TODO this needs to be more general
    from baselines.ppo1 import mlp_policy
    tf.Session().__enter__()
    pi = mlp_policy.MlpPolicy(
        name="pi",
        ob_space=env.observation_space,
        ac_space=env.action_space,
        hid_size=configuration["settings"].get('pi_hid_size', 150),
        num_hid_layers=configuration["settings"].get('pi_num_hid_layers', 3))

    # find latest policy checkpoint
    saver = tf.train.Saver()
    if checkpoint is None:
        files = glob.glob(osp.join(logdir, 'checkpoints') + '/*.index')
        files = [(int(re.findall(r".*?_(\d+)\.", f)[0]), f) for f in files]
        files = sorted(files, key=operator.itemgetter(0))
        checkpoint = files[-1][1]
    elif not osp.isabs(checkpoint):
        if not osp.exists(osp.join(logdir, 'checkpoints')):
            raise FileNotFoundError("Could not find checkpoints folder")
        else:
            checkpoint = osp.join(logdir, 'checkpoints', checkpoint)
    if checkpoint.endswith(".index"):
        checkpoint = checkpoint[:-len(".index")]
    print("Loading checkpoint %s." % checkpoint)
    saver.restore(tf.get_default_session(), checkpoint)

    # generate rollouts
    rollouts = []
    for i_rollout in tqdm(range(num_rollouts), "Computing rollouts"):
        observation = env.reset()
        rollout = {"observation": [], "reward": [], "action": []}
        video = []
        for i_episode in range(max_episode_length):
            action, _ = pi.act(stochastic=False, ob=observation)
            observation, reward, done, _ = env.step(action)
            if human_render:
                env.render(mode='human')
            if save_videos is not None:
                video.append(env.render(mode='rgb_array'))
            if save_rollouts is not None:
                rollout["observation"].append(observation)
                rollout["reward"].append(reward)
                rollout["action"].append(action)
            if done:
                break

        if save_videos is not None:
            imageio.mimsave(osp.join(save_videos,
                                     'rollout_%i.mp4' % i_rollout),
                            video,
                            fps=env.metadata.get('video.frames_per_second',
                                                 50))
        if save_rollouts is not None and save_separate_rollouts:
            pkl.dump(
                rollout,
                open(osp.join(save_rollouts, 'rollout_%i.pkl' % i_rollout),
                     "wb"))
        else:
            rollouts.append(rollout)

    if save_rollouts is not None and not save_separate_rollouts:
        pkl.dump(rollouts, open(osp.join(save_rollouts, 'rollouts.pkl'), "wb"))