Example #1
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        tblog = "/cvgl2/u/surajn/workspace/tb_logs/reacher/"
        env = make_mujoco_env(env_id, workerseed)
        model = TRPO(MlpPolicy,
                     env,
                     timesteps_per_batch=1024,
                     max_kl=0.01,
                     cg_iters=10,
                     cg_damping=0.1,
                     entcoeff=0.0,
                     gamma=0.99,
                     lam=0.98,
                     vf_iters=5,
                     vf_stepsize=1e-3,
                     tensorboard_log=tblog)
        model.learn(total_timesteps=num_timesteps)
        env.close()
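For comparison, the same model can be trained without the MPI and logger scaffolding; a minimal sketch assuming stable-baselines 2.x (environment name, step count and save path are illustrative):

from stable_baselines import TRPO
from stable_baselines.common.policies import MlpPolicy

# Minimal TRPO run; "Pendulum-v0" and the hyperparameters below are illustrative choices.
model = TRPO(MlpPolicy, "Pendulum-v0", timesteps_per_batch=1024, max_kl=0.01, verbose=1)
model.learn(total_timesteps=10000)
model.save("trpo_pendulum")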
Example #2
    def load(self, path, env):
        if self.trpo():
            return TRPO.load(path, env=env)
        elif self.ppo():
            return PPO2.load(path, env=env)
        else:
            return SAC.load(path, env=env)
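This dispatcher assumes each algorithm was saved with the matching save() call; a hedged sketch of the round trip for the TRPO branch (paths are illustrative, stable-baselines 2.x assumed):

from stable_baselines import TRPO
from stable_baselines.common.policies import MlpPolicy

# Save a model, then restore it the way the load() helper above does for the TRPO case.
model = TRPO(MlpPolicy, "Pendulum-v0", verbose=0)
model.save("trpo_pendulum")
loaded = TRPO.load("trpo_pendulum", env=model.get_env())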
Example #3
    def __init__(self, policy, env, expert_dataset=None,
                 hidden_size_adversary=100, adversary_entcoeff=1e-3,
                 g_step=3, d_step=1, d_stepsize=3e-4, verbose=0,
                 _init_setup_model=True, **kwargs):
        super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False,
                         _init_setup_model=_init_setup_model)

        self.trpo = TRPO(policy, env, verbose=verbose, _init_setup_model=False, **kwargs)
        self.trpo.using_gail = True
        self.trpo.expert_dataset = expert_dataset
        self.trpo.g_step = g_step
        self.trpo.d_step = d_step
        self.trpo.d_stepsize = d_stepsize
        self.trpo.hidden_size_adversary = hidden_size_adversary
        self.trpo.adversary_entcoeff = adversary_entcoeff
        self.env = self.trpo.env

        if _init_setup_model:
            self.setup_model()
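A hedged usage sketch for this GAIL wrapper, following the usual stable-baselines pattern; "expert_pendulum.npz" is an assumed, pre-recorded expert dataset:

from stable_baselines import GAIL
from stable_baselines.gail import ExpertDataset

# Load pre-recorded expert trajectories and imitate them with GAIL (TRPO underneath).
dataset = ExpertDataset(expert_path="expert_pendulum.npz", traj_limitation=10, verbose=1)
model = GAIL("MlpPolicy", "Pendulum-v0", dataset, verbose=1)
model.learn(total_timesteps=1000)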
Example #4
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    # def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):  # pylint: disable=W0613
    #     return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, placeholders=placeholders)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = TRPO(CnnPolicy,
                 env,
                 timesteps_per_batch=512,
                 max_kl=0.001,
                 cg_iters=10,
                 cg_damping=1e-3,
                 entcoeff=0.0,
                 gamma=0.98,
                 lam=1,
                 vf_iters=3,
                 vf_stepsize=1e-4)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
Example #5
    def __init__(self,
                 policy,
                 env,
                 pretrained_weight=False,
                 hidden_size_adversary=100,
                 adversary_entcoeff=1e-3,
                 expert_dataset=None,
                 save_per_iter=1,
                 checkpoint_dir="/tmp/gail/ckpt/",
                 g_step=1,
                 d_step=1,
                 task_name="task_name",
                 d_stepsize=3e-4,
                 verbose=0,
                 _init_setup_model=True,
                 **kwargs):

        super().__init__(policy=policy,
                         env=env,
                         verbose=verbose,
                         requires_vec_env=False,
                         _init_setup_model=_init_setup_model)

        self.trpo = TRPO(policy,
                         env,
                         verbose=verbose,
                         _init_setup_model=False,
                         **kwargs)
        self.trpo.using_gail = True
        self.trpo.pretrained_weight = pretrained_weight
        self.trpo.expert_dataset = expert_dataset
        self.trpo.save_per_iter = save_per_iter
        self.trpo.checkpoint_dir = checkpoint_dir
        self.trpo.g_step = g_step
        self.trpo.d_step = d_step
        self.trpo.task_name = task_name
        self.trpo.d_stepsize = d_stepsize
        self.trpo.hidden_size_adversary = hidden_size_adversary
        self.trpo.adversary_entcoeff = adversary_entcoeff

        if _init_setup_model:
            self.setup_model()
Example #6
    def create_learner(self, env, parameters):
        if (self.trpo() or self.ppo()) and not issubclass(type(env), VecEnv):
            env = DummyVecEnv([lambda: env])

        if self.trpo():
            model = TRPO(MlpPolicy, env, **parameters["common"],
                         **parameters[str(self)])
            interface = TRPOInterface(model, env.observation_space.shape[0])
        elif self.ppo():
            model = PPO2(MlpPolicy, env, **parameters["common"],
                         **parameters[str(self)])
            interface = PPOInterface(model, env.observation_space.shape[0])
        else:
            model = SAC(SACMlpPolicy, env, **parameters["common"],
                        **parameters[str(self)])
            interface = SACInterface(model, env.observation_space.shape[0])

        if "pretrain_data_path" in parameters:
            data_path = parameters["pretrain_data_path"]
            model.pretrain(ExpertDataset(expert_path=data_path, verbose=0),
                           n_epochs=25)

        return model, interface
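The parameters dictionary consumed above is not shown; a hypothetical layout consistent with the lookups parameters["common"], parameters[str(self)] and the optional "pretrain_data_path" key (all keys and values below are illustrative assumptions):

# Hypothetical parameters layout; the "TRPO"/"PPO"/"SAC" keys assume str(self) yields the algorithm name.
parameters = {
    "common": {"gamma": 0.99, "verbose": 0},
    "TRPO": {"timesteps_per_batch": 1024, "max_kl": 0.01},
    "PPO": {"n_steps": 128, "learning_rate": 2.5e-4},
    "SAC": {"buffer_size": 100000, "learning_rate": 3e-4},
    # "pretrain_data_path": "expert_trajectories.npz",  # optional behavioural-cloning warm start
}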
Example #7
def train(params):
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    # setup config
    if params.get("policy") == 'mlp':
        policy = MlpPolicy
        env = gym.make(params.get("environment"))
        env.configure(envConfig)
        env.reset()
    else:
        policy = CnnPolicy
        env = gym.make(params.get("environment"))
        env.configure(CnnNet)
        env.reset()

    exp_name = ("{0}_{1}_{2}".format(params.get("model_name"),
                                     params.get("policy"),
                                     params.get("environment")))

    log_dir = './logs/' + exp_name

    if params.get("seed") > 0:
        workerseed = params.get("seed") + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)

    # create model
    model = TRPO(policy,
                 env,
                 verbose=1,
                 tensorboard_log=log_dir,
                 timesteps_per_batch=params.get("timesteps_per_batch"),
                 max_kl=params.get("max_kl"),
                 cg_iters=params.get("cg_iters"),
                 cg_damping=params.get("cg_damping"),
                 entcoeff=params.get("entcoeff"),
                 gamma=params.get("gamma"),
                 lam=params.get("lam"),
                 vf_iters=params.get("vf_iters"),
                 vf_stepsize=params.get("vf_stepsize")
                 # ,policy_kwargs=policy_kwargs
                 )

    model.learn(total_timesteps=params.get("train_steps"))
    model.save(exp_name)
    env.close()
    del env
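train() expects a params dictionary matching the params.get(...) lookups above; a hypothetical example (values are placeholders, and the environment is assumed to be a custom one that supports configure()):

# Hypothetical params dict; every value here is an illustrative placeholder.
params = {
    "policy": "mlp",
    "environment": "CartPole-v1",
    "model_name": "trpo_baseline",
    "seed": 1,
    "timesteps_per_batch": 1024,
    "max_kl": 0.01,
    "cg_iters": 10,
    "cg_damping": 0.1,
    "entcoeff": 0.0,
    "gamma": 0.99,
    "lam": 0.98,
    "vf_iters": 5,
    "vf_stepsize": 1e-3,
    "train_steps": 100000,
}
train(params)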
Example #8
def main():
    """
    Runs the test
    """
    """
    Create an argparse.ArgumentParser for run_mujoco.py.

    :return:  (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False}

    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--play', default=False, action='store_true')
    return parser
    """
    env_id = 'UR5Gripper-v0'
    model_path = '/tmp/gym/trpo_mpi/'
    # args = mujoco_arg_parser().parse_args()
    # train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
    # train(env_id=env_id, num_timesteps=int(1e7), seed=0, model_path=model_path)
    env = gym.make(env_id)
    env = Monitor(env, model_path, allow_early_resets=True)
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=model_path)
    model = model.load(model_path + "trpo.pkl")
    model.learn(total_timesteps=int(1e5), callback=callback)
    model.save(model_path + "trpo.pkl")
    # tf_util.save_state(model_path)

    # Enjoy trained agent
    obs = env.reset()
    for _ in range(100):
        obs = env.reset()
        env.render()
        for _ in range(200):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
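The callback passed to model.learn() above is defined elsewhere in the original file. In stable-baselines 2.x a functional callback receives the training loop's local and global variables and aborts training if it returns False; a minimal sketch:

# Minimal sketch of a stable-baselines 2.x functional callback (contents are illustrative).
def callback(locals_, globals_):
    # e.g. inspect locals_["self"].num_timesteps here to save periodic checkpoints
    return True  # returning False would stop training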
Example #9
def train(env_id, num_timesteps, seed, algorithm, model_save_file=None, log_dir=None):

    with tf_util.single_threaded_session():
        logger.configure(folder=log_dir, format_strs=['stdout', 'log', 'csv'])

        workerseed = seed + MPI.COMM_WORLD.Get_rank()
        env = make_mujoco_env(env_id, workerseed)

        if algorithm == "TRPO":
            model = TRPO(MlpPolicy, env, seed=workerseed, verbose=1)
        else:
            # Algorithm is PPO
            model = PPO1(MlpPolicy, env, seed=workerseed, verbose=1)

        model.learn(total_timesteps=num_timesteps)

        if model_save_file is not None:
            model.save(model_save_file)

        env.close()
Example #10
                     verbose=1,
                     seed=seed,
                     avec_coef=1.,
                     vf_coef=0.,
                     tensorboard_log=log_dir)
        model.learn(total_timesteps=10000, tb_log_name="tb/AVEC-PPO")
        # model.learn(total_timesteps=1000000, tb_log_name="tb/PPO")

        ######################## TRPO ###########################
        log_dir = "./logs/%s/AVEC-TRPO_%s" % (env_id, seed)
        # log_dir = "./logs/%s/TRPO_%s" % (env_id, seed)
        os.makedirs(log_dir, exist_ok=True)
        env = make_vec_env(env_id, 1, seed, monitor_dir=log_dir)
        model = TRPO('MlpPolicy',
                     env,
                     verbose=1,
                     avec_coef=1.,
                     vf_coef=0.,
                     tensorboard_log=log_dir)
        model.learn(total_timesteps=10000, tb_log_name="tb/AVEC-TRPO")
        # model.learn(total_timesteps=1000000, tb_log_name="tb/TRPO")

        ######################### SAC #############################
        log_dir = "./logs/%s/AVEC-SAC_%s" % (env_id, seed)
        # log_dir = "./logs/%s/SAC_%s" % (env_id, seed)
        os.makedirs(log_dir, exist_ok=True)
        env = make_vec_env(env_id, 1, seed, monitor_dir=log_dir)
        model = SAC('CustomSACPolicy',
                    env,
                    verbose=1,
                    avec_coef=1.,
                    value_coef=0.,
Example #11
class GAIL(ActorCriticRLModel):
    """
    Generative Adversarial Imitation Learning (GAIL)

    .. warning::

        Images are not yet handled properly by the current implementation


    :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param expert_dataset: (ExpertDataset) the dataset manager
    :param gamma: (float) the discount value
    :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon)
    :param max_kl: (float) the Kullback-Leibler loss threshold
    :param cg_iters: (int) the number of iterations for the conjugate gradient calculation
    :param lam: (float) GAE factor
    :param entcoeff: (float) the weight for the entropy loss
    :param cg_damping: (float) the conjugate gradient damping factor
    :param vf_stepsize: (float) the value function stepsize
    :param vf_iters: (int) the number of iterations for value function learning
    :param hidden_size: ([int]) the hidden dimension for the MLP
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param d_stepsize: (float) the reward giver stepsize
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        WARNING: this logging can take a lot of space quickly
    """

    def __init__(self, policy, env, expert_dataset=None,
                 hidden_size_adversary=100, adversary_entcoeff=1e-3,
                 g_step=3, d_step=1, d_stepsize=3e-4, verbose=0,
                 _init_setup_model=True, **kwargs):
        super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False,
                         _init_setup_model=_init_setup_model)

        self.trpo = TRPO(policy, env, verbose=verbose, _init_setup_model=False, **kwargs)
        self.trpo.using_gail = True
        self.trpo.expert_dataset = expert_dataset
        self.trpo.g_step = g_step
        self.trpo.d_step = d_step
        self.trpo.d_stepsize = d_stepsize
        self.trpo.hidden_size_adversary = hidden_size_adversary
        self.trpo.adversary_entcoeff = adversary_entcoeff
        self.env = self.trpo.env

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        pass

    def pretrain(self, dataset, n_epochs=10, learning_rate=1e-4,
                 adam_epsilon=1e-8, val_interval=None):
        self.trpo.pretrain(dataset, n_epochs=n_epochs, learning_rate=learning_rate,
                           adam_epsilon=adam_epsilon, val_interval=val_interval)
        return self

    def set_env(self, env):
        self.trpo.set_env(env)
        self.env = self.trpo.env

    def setup_model(self):
        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the GAIL model must be an " \
                                                           "instance of common.policies.ActorCriticPolicy."
        self.trpo.setup_model()

    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="GAIL",
              reset_num_timesteps=True):
        assert self.trpo.expert_dataset is not None, "You must pass an expert dataset to GAIL for training"
        self.trpo.learn(total_timesteps, callback, seed, log_interval, tb_log_name, reset_num_timesteps)
        return self

    def predict(self, observation, state=None, mask=None, deterministic=False):
        return self.trpo.predict(observation, state=state, mask=mask, deterministic=deterministic)

    def action_probability(self, observation, state=None, mask=None, actions=None):
        return self.trpo.action_probability(observation, state=state, mask=mask, actions=actions)

    def save(self, save_path):
        self.trpo.save(save_path)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        data, params = cls._load_from_file(load_path)

        model = cls(policy=data["policy"], env=None, _init_setup_model=False)
        model.trpo.__dict__.update(data)
        model.trpo.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()

        restores = []
        for param, loaded_p in zip(model.trpo.params, params):
            restores.append(param.assign(loaded_p))
        model.trpo.sess.run(restores)

        return model
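The learn() method above requires an expert_dataset; stable-baselines ships a helper for recording one from a trained policy. A hedged sketch (the TRPO expert and save name are illustrative):

from stable_baselines import TRPO
from stable_baselines.gail import generate_expert_traj

# Train a quick expert, then record trajectories to expert_pendulum.npz for use with ExpertDataset.
expert = TRPO("MlpPolicy", "Pendulum-v0", verbose=0)
expert.learn(total_timesteps=10000)
generate_expert_traj(expert, "expert_pendulum", n_episodes=10)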
Example #12
        total_timesteps=10000, seed=0),
    lambda e: ACKTR(policy=MlpPolicy, env=e, learning_rate=5e-4, n_steps=1
                    ).learn(total_timesteps=20000, seed=0),
    lambda e: DeepQ(policy=deepq_models.mlp([32]),
                    batch_size=16,
                    gamma=0.1,
                    exploration_fraction=0.001,
                    env=e).learn(total_timesteps=40000, seed=0),
    lambda e: PPO1(policy=MlpPolicy,
                   env=e,
                   lam=0.7,
                   optim_batchsize=16,
                   optim_stepsize=1e-3).learn(total_timesteps=10000, seed=0),
    lambda e: PPO2(policy=MlpPolicy, env=e, learning_rate=1.5e-3, lam=0.8
                   ).learn(total_timesteps=20000, seed=0),
    lambda e: TRPO(policy=MlpPolicy, env=e, max_kl=0.05, lam=0.7).learn(
        total_timesteps=10000, seed=0),
]


@pytest.mark.slow
@pytest.mark.parametrize("learn_func", learn_func_list)
def test_identity(learn_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)

    :param learn_func: (lambda (Gym Environment): A2CPolicy) the policy generator
    """
    env = DummyVecEnv([lambda: IdentityEnv(10)])

    model = learn_func(env)
Example #13
class GAIL(ActorCriticRLModel):
    """
    Generative Adversarial Imitation Learning (GAIL)

    :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount value
    :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon)
    :param max_kl: (float) the Kullback-Leibler loss threshold
    :param cg_iters: (int) the number of iterations for the conjugate gradient calculation
    :param lam: (float) GAE factor
    :param entcoeff: (float) the weight for the entropy loss
    :param cg_damping: (float) the conjugate gradient damping factor
    :param vf_stepsize: (float) the value function stepsize
    :param vf_iters: (int) the number of iterations for value function learning
    :param pretrained_weight: (str) the save location for the pretrained weights
    :param hidden_size: ([int]) the hidden dimension for the MLP
    :param expert_dataset: (Dset) the dataset manager
    :param save_per_iter: (int) the number of iterations before saving
    :param checkpoint_dir: (str) the location for saving checkpoints
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param task_name: (str) the name of the task (can be None)
    :param d_stepsize: (float) the reward giver stepsize
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    """
    def __init__(self,
                 policy,
                 env,
                 pretrained_weight=False,
                 hidden_size_adversary=100,
                 adversary_entcoeff=1e-3,
                 expert_dataset=None,
                 save_per_iter=1,
                 checkpoint_dir="/tmp/gail/ckpt/",
                 g_step=1,
                 d_step=1,
                 task_name="task_name",
                 d_stepsize=3e-4,
                 verbose=0,
                 _init_setup_model=True,
                 **kwargs):

        super().__init__(policy=policy,
                         env=env,
                         verbose=verbose,
                         requires_vec_env=False,
                         _init_setup_model=_init_setup_model)

        self.trpo = TRPO(policy,
                         env,
                         verbose=verbose,
                         _init_setup_model=False,
                         **kwargs)
        self.trpo.using_gail = True
        self.trpo.pretrained_weight = pretrained_weight
        self.trpo.expert_dataset = expert_dataset
        self.trpo.save_per_iter = save_per_iter
        self.trpo.checkpoint_dir = checkpoint_dir
        self.trpo.g_step = g_step
        self.trpo.d_step = d_step
        self.trpo.task_name = task_name
        self.trpo.d_stepsize = d_stepsize
        self.trpo.hidden_size_adversary = hidden_size_adversary
        self.trpo.adversary_entcoeff = adversary_entcoeff

        if _init_setup_model:
            self.setup_model()

    def set_env(self, env):
        super().set_env(env)
        self.trpo.set_env(env)

    def setup_model(self):
        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the GAIL model must be an " \
                                                           "instance of common.policies.ActorCriticPolicy."
        assert isinstance(
            self.action_space,
            gym.spaces.Box), "Error: GAIL requires a continuous action space."

        self.trpo.setup_model()

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="GAIL"):
        self.trpo.learn(total_timesteps, callback, seed, log_interval,
                        tb_log_name)
        return self

    def predict(self, observation, state=None, mask=None, deterministic=False):
        return self.trpo.predict(observation,
                                 state,
                                 mask,
                                 deterministic=deterministic)

    def action_probability(self, observation, state=None, mask=None):
        return self.trpo.action_probability(observation, state, mask)

    def save(self, save_path):
        self.trpo.save(save_path)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        data, params = cls._load_from_file(load_path)

        model = cls(policy=data["policy"], env=None, _init_setup_model=False)
        model.trpo.__dict__.update(data)
        model.trpo.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()

        restores = []
        for param, loaded_p in zip(model.trpo.params, params):
            restores.append(param.assign(loaded_p))
        model.trpo.sess.run(restores)

        return model
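A brief usage sketch for the load() classmethod above; the save path is illustrative and the class is assumed to be importable as stable_baselines.GAIL:

import gym
from stable_baselines import GAIL

# Restore a previously saved GAIL model; Pendulum-v0 has the continuous action space setup_model requires.
env = gym.make("Pendulum-v0")
model = GAIL.load("gail_pendulum", env=env)
obs = env.reset()
action, _states = model.predict(obs)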
Example #14
def main(env, load_path, fig_path):

    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/" + "/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + "/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(
        env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = TRPO.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0, 0, 0] for i in range(100)],
               [[0, 0, 0, 0] for i in range(100)]]
    current = [[[0, 0, 0] for i in range(100)],
               [[0, 0, 0, 0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        if (obs == obs_initial).all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                file = open(filename, 'w+')
                for xyz, quat in zip(xyzs, quats):
                    for coord in xyz:
                        file.write(str(coord) + " ")
                    for quat_coord in quat:
                        file.write(str(quat_coord) + " ")
                    file.write("\n")
                file.close()

            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))
            current = [[[0, 0, 0] for i in range(100)],
                       [[0, 0, 0, 0] for i in range(100)]]
            timestep = 0
            counter += 1
        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    file = open(filename, 'w+')
    for xyz, quat in zip(xyzs, quats):
        for coord in xyz:
            file.write(str(coord) + " ")
        for quat_coord in quat:
            file.write(str(quat_coord) + " ")
        file.write("\n")
    file.close()

    # print average distances
    print("average distance of box from end goal: %f" % dist)
Example #15
def main():
    """
    Runs the test
    """
    parser = mujoco_arg_parser()
    parser.add_argument(
        '--model-path',
        default="/cvgl2/u/surajn/workspace/saved_models/sawyerlift_ppo2/model")
    parser.add_argument('--images', default=False)
    args = parser.parse_args()

    logger.configure()
    if not args.play:
        model, env = train(args.env,
                           num_timesteps=args.num_timesteps,
                           seed=args.seed,
                           model_path=args.model_path,
                           images=args.images)

    if args.play:

        def make_env():
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_camera_obs=False,  # do not use pixel observations
                    has_offscreen_renderer=False,  # not needed since not using pixel obs
                    has_renderer=True,  # make sure we can render to the screen
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control should happen fast enough so that simulation looks smooth
                ))
            env_out.reward_range = None
            env_out.metadata = None
            env_out.spec = None
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            return env_out

        #env = make_env()
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        policy = MlpPolicy
        #model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
        #         optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=1)
        model = TRPO(MlpPolicy,
                     env,
                     timesteps_per_batch=1024,
                     max_kl=0.01,
                     cg_iters=10,
                     cg_damping=0.1,
                     entcoeff=0.0,
                     gamma=0.99,
                     lam=0.98,
                     vf_iters=5,
                     vf_stepsize=1e-3)
        model = model.load(args.model_path)
        logger.log("Running trained model")
        obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            env.render()
            actions = model.step(obs)[0]
            obs[:] = env.step(actions)[0]
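model.step() calls the policy directly; the documented inference API in stable-baselines is model.predict(). A hedged equivalent of the rollout loop above:

# Alternative rollout using predict(); behaviour should match the loop above for a single vectorized env.
obs = env.reset()
while True:
    env.render()
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, infos = env.step(action)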
Example #16
import pytest

from stable_baselines.a2c import A2C
from stable_baselines.ppo1 import PPO1
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.common.identity_env import IdentityEnvMultiBinary, IdentityEnvMultiDiscrete
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy

MODEL_FUNC_LIST = [
    lambda e: A2C(policy=MlpPolicy, env=e),
    lambda e: PPO1(policy=MlpPolicy, env=e),
    lambda e: PPO2(policy=MlpPolicy, env=e),
    lambda e: TRPO(policy=MlpPolicy, env=e),
]


@pytest.mark.slow
@pytest.mark.parametrize("model_func", MODEL_FUNC_LIST)
def test_identity_multidiscrete(model_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multidiscrete action space

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])

    model = model_func(env)