def __init__(self, ctrl_cost_coeff=1e-2, *args, **kwargs):
    MultiDirectionBaseEnv.__init__(
        self, ctrl_cost_coeff=ctrl_cost_coeff, *args, **kwargs)
    SwimmerEnv.__init__(
        self, ctrl_cost_coeff=ctrl_cost_coeff, *args, **kwargs)
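
# Note (an assumption for illustration, not part of the original file): this
# __init__ presumably belongs to a subclass that combines the two rllab
# environments via plain multiple inheritance, e.g. something like
#     class MultiDirectionSwimmerEnv(MultiDirectionBaseEnv, SwimmerEnv): ...
# Both parent initializers are called explicitly so that each receives
# ctrl_cost_coeff, rather than relying on cooperative super() calls.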
Example #2
def create_env(which_agent):

    # setup environment
    if (which_agent == 0):
        env = PointEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif (which_agent == 1):
        env = AntEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif (which_agent == 2):
        env = SwimmerEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)  # dt 0.001 and frameskip=150
    elif (which_agent == 3):
        env = ReacherEnv()
        dt_from_xml = env.model.opt.timestep
    elif (which_agent == 4):
        env = HalfCheetahEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)

    # elif (which_agent == 5):
    #     env = RoachEnv()  # this is a personal vrep env
    #     dt_from_xml = env.VREP_DT
    elif (which_agent == 6):
        env = HopperEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif (which_agent == 7):
        env = Walker2DEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)

    #get dt value from env - DOES NOT WORK !!!!
    #     if(which_agent==5):
    #         dt_from_xml = env.VREP_DT
    #     else:
    #         dt_from_xml = env.model.opt.timestep
    print("\n\n the dt is: ", dt_from_xml, "\n\n")

    #set vars
    tf.set_random_seed(2)
    gym.logger.setLevel(gym.logging.WARNING)
    dimO = env.observation_space.shape
    dimA = env.action_space.shape
    print('--------------------------------- \nState space dimension: ', dimO)
    print('Action space dimension: ', dimA,
          "\n -----------------------------------")

    return env, dt_from_xml
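
# Usage sketch (illustrative only; assumes the environment classes imported
# elsewhere in this file are available):
#
#     env, dt_from_xml = create_env(which_agent=2)  # normalized SwimmerEnv
#     obs = env.reset()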
Example #3
def run_task(*_):
    env = normalize(SwimmerEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=200,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # plot=True enables live plotting of training progress
        plot=True,
    )
    algo.train()
Example #4
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
    )
    df = DFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M])  # discriminator, input is the actions.
    vf = VFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=16,
        kernel_update_ratio=0.5,
        value_n_particles=16,
        td_target_update_interval=1000,
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False,
        df=df,
        vf=vf,
        df_lr=1e-3,
        dist=variant['dist'],
    )

    algorithm.train()
Example #5
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    elif variant['env_name'] == 'BlocksSimpleXYQ-v0':
        target = [-1.0, 0.0]
        env = bsmp.BlocksSimpleXYQ(multi_goal=variant['blocks_multigoal'],
                                   time_limit=variant['max_path_length'],
                                   env_config=variant['blocks_simple_xml'],
                                   goal=target)
        env = env_wrap.obsTupleWrap(env, add_action_to_obs=False)
        env = gym_env.GymEnv(
            env,
            video_schedule=glob.video_scheduler.video_schedule,
            log_dir=".")
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(env=env,
                              max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(max_path_length=variant['max_path_length'],
                            min_pool_size=variant['max_path_length'],
                            batch_size=variant['batch_size'])

    base_kwargs = dict(epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env=env, hidden_layer_sizes=(M, M))

    policy = StochasticNNPolicy(env=env, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
Example #6
def run_task(*_):
    """
    DPG on Swimmer environment
    """
    env = normalize(SwimmerEnv())
    """
    Initialise the policy as a neural network policy
    """
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    """
    Defining exploration strategy : OUStrategy - 
    """
    """
    This strategy implements the Ornstein-Uhlenbeck process, which adds
    time-correlated noise to the actions taken by the deterministic policy.
    The OU process satisfies the following stochastic differential equation:
    dxt = theta*(mu - xt)*dt + sigma*dWt
    where Wt denotes the Wiener process
    """
    es = OUStrategy(env_spec=env.spec)
    """
    Defining the Q network
    """
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    w = qf.get_param_values(regularizable=True)
    """
    Persistence Length Exploration
    """
    lp = Persistence_Length_Exploration(env=env, qf=qf, policy=policy)
    """
    Using the DDPG algorithm
    """
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=32,
        max_path_length=1000,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=15000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # plot=True enables live plotting of training progress
        plot=True,
    )
    """
    Training the networks based on the DDPG algorithm
    """
    algo.train()
Example #7
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    elif variant['env_name'] == 'sawyer-rllab':
        env = normalize(SawyerTestEnv())
    elif variant['env_name'] == 'arm3Ddisc-rllab':
        env = normalize(Arm3dDiscEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec, max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
Example #8
def run_experiment(variant):
    env = normalize(SwimmerEnv())

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=1e6)

    sampler = SimpleSampler(max_path_length=1000,
                            min_pool_size=1000,
                            batch_size=128)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=500,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       sampler=sampler)

    with tf.Session().as_default():
        data = joblib.load(variant['file'])
        if 'algo' in data.keys():
            saved_qf = data['algo'].qf
            saved_policy = data['algo'].policy
        else:
            saved_qf = data['qf']
            saved_policy = data['policy']

        algorithm = SQL(base_kwargs=base_kwargs,
                        env=env,
                        pool=pool,
                        qf=saved_qf,
                        policy=saved_policy,
                        kernel_fn=adaptive_isotropic_gaussian_kernel,
                        kernel_n_particles=16,
                        kernel_update_ratio=0.5,
                        value_n_particles=16,
                        td_target_update_interval=1000,
                        qf_lr=3E-4,
                        policy_lr=3E-4,
                        discount=0.99,
                        reward_scale=30,
                        use_saved_qf=True,
                        use_saved_policy=True,
                        save_full_state=False)

        algorithm.train()
Example #9
    def __init__(self, args):
        self.args = args
        self.device = torch.device(
            'cuda' if args.cuda and torch.cuda.is_available() else 'cpu')

        if self.args.env_name == 'ant':
            from rllab.envs.mujoco.ant_env import AntEnv
            env = AntEnv()
            # set the target velocity direction (for learning sub-policies)
            env.velocity_dir = self.args.velocity_dir
            env.penalty = self.args.penalty

            # use gym environment observation
            env.use_gym_obs = self.args.use_gym_obs
            # use gym environment reward
            env.use_gym_reward = self.args.use_gym_reward

        elif self.args.env_name == 'swimmer':
            from rllab.envs.mujoco.swimmer_env import SwimmerEnv
            env = SwimmerEnv()
            env.velocity_dir = self.args.velocity_dir
        else:
            raise NotImplementedError

        self.env = normalize(env)
        self.reset_env()

        self.obs_shape = self.env.observation_space.shape

        self.actor_critic = self.select_network().to(self.device)
        self.optimizer = self.select_optimizer()

        # list of RolloutStorage objects
        self.episodes_rollout = []
        # concatenation of all episodes' rollout
        self.rollouts = RolloutStorage(self.device)
        # this directory is used for tensorboardX only
        self.writer = SummaryWriter(args.log_dir + self.args.velocity_dir)

        self.episodes = 0
        self.episode_steps = []
        self.train_rewards = []
Example #10
def init(env_name, size, opts):
    global envs
    envs = []
    for i in range(int(size)):
        if env_name == 'SPSwimmer':
            from envs.sp_swimmer_env import SPSwimmerEnv
            envs.append(SPSwimmerEnv())
        elif env_name == 'SPSwimmerGather':
            from envs.sp_swimmer_gather_env import SPSwimmerGatherEnv
            envs.append(SPSwimmerGatherEnv(opts))
        elif env_name == 'SPMountainCar':
            from envs.sp_mountain_car import SPMountainCarEnv
            envs.append(SPMountainCarEnv(opts))
        elif env_name == 'Swimmer':
            from rllab.envs.mujoco.swimmer_env import SwimmerEnv
            envs.append(SwimmerEnv())
        else:
            raise RuntimeError("wrong env name")
        if opts['rllab_normalize_rllab']:
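            # normalize_obs=True makes rllab's NormalizedEnv rescale observations with
            # running mean/std estimates, in addition to mapping actions from a
            # normalized [-1, 1] box back to the environment's action bounds.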
            envs[-1] = NormalizedEnv(env=envs[-1], normalize_obs=True)
Example #11
    def _setup_world(self, filename):
        """
        Helper method for handling setup of the MuJoCo world.
        Args:
            filename: Path to XML file containing the world information.
        """
        self._world = []
        self._model = []

        # Initialize Mujoco worlds. If there's only one xml file, create a single world object,
        # otherwise create a different world for each condition.
        for i in range(self._hyperparams['conditions']):
            self._world.append(SwimmerEnv())

        # Initialize x0.
        self.x0 = []
        self._full_init_state = []
        # pdb.set_trace()
        for i in range(self._hyperparams['conditions']):
            self.x0.append(self._world[i].reset())
            self._full_init_state.append(self._world[i].get_full_state())
Example #12
def create_env(which_agent):

    # setup environment
    if (which_agent == 0):
        env = normalize(PointEnv())
    elif (which_agent == 1):
        env = normalize(AntEnv())
    elif (which_agent == 2):
        env = normalize(SwimmerEnv())  #dt 0.001 and frameskip=150
    elif (which_agent == 3):
        env = gym.make("modified_gym_env:ReacherPyBulletEnv-v1")
    elif (which_agent == 4):
        env = normalize(HalfCheetahEnv())
    elif (which_agent == 5):
        env = RoachEnv()  #this is a personal vrep env
    elif (which_agent == 6):
        env = normalize(HopperEnv())
    elif (which_agent == 7):
        env = normalize(Walker2DEnv())

    #get dt value from env
    if (which_agent == 5):
        dt_from_xml = env.VREP_DT
    elif (which_agent == 3):
        dt_from_xml = 0.02
    else:
        dt_from_xml = env.model.opt.timestep
    print("\n\n the dt is: ", dt_from_xml, "\n\n")

    #set vars
    tf.set_random_seed(2)
    gym.logger.setLevel(logging.WARN)
    dimO = env.observation_space.shape
    dimA = env.action_space.shape
    print('--------------------------------- \nState space dimension: ', dimO)
    print('Action space dimension: ', dimA,
          "\n -----------------------------------")

    return env, dt_from_xml
Example #13
from rllab.algos.ddpg import DDPG
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction

from rllab.envs.mujoco.swimmer_env import SwimmerEnv

env = normalize(SwimmerEnv())


def run_task(*_):
    """
    DPG on Swimmer environment
    """
    env = normalize(SwimmerEnv())
    """
    Initialise the policy as a neural network policy
    """
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    """
    Define the exploration strategy: OUStrategy.

    This strategy implements the Ornstein-Uhlenbeck process, which adds
    time-correlated noise to the actions taken by the deterministic policy.
    The OU process satisfies the following stochastic differential equation:
Example #14
def get(perm):
    name = perm["problem"]
    if name.lower() == "cartpole":
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        return normalize(CartpoleEnv())

    elif name.lower() == "mountain car height bonus":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv())

    elif name.lower() == "mountain car":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv(height_bonus=0))

    elif name.lower() == "gym mountain car":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("MountainCarContinuous-v0",
                                record_video=False))

    elif name.lower() == "pendulum":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("Pendulum-v0", record_video=False))

    elif name.lower() == "mujoco double pendulum":
        from rllab.envs.mujoco.inverted_double_pendulum_env import InvertedDoublePendulumEnv
        return normalize(InvertedDoublePendulumEnv())

    elif name.lower() == "double pendulum":
        from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv
        return normalize(DoublePendulumEnv())

    elif name.lower() == "hopper":
        from rllab.envs.mujoco.hopper_env import HopperEnv
        return normalize(HopperEnv())

    elif name.lower() == "swimmer":
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        return normalize(SwimmerEnv())

    elif name.lower() == "2d walker":
        from rllab.envs.mujoco.walker2d_env import Walker2DEnv
        return normalize(Walker2DEnv())

    elif name.lower() == "half cheetah":
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        return normalize(HalfCheetahEnv())

    elif name.lower() == "ant":
        from rllab.envs.mujoco.ant_env import AntEnv
        return normalize(AntEnv())

    elif name.lower() == "simple humanoid":
        from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv
        return normalize(SimpleHumanoidEnv())

    elif name.lower() == "full humanoid":
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        return normalize(HumanoidEnv())

    else:
        raise NotImplementedError(f"Environment {name} unknown")
Example #15
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
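    # The observation space is augmented with `num_skills` extra dimensions bounded
    # in [0, 1] (presumably a one-hot skill indicator, as in DIAYN), so the replay
    # buffer, Q/V functions, and policy can condition on the active skill; the
    # action space is unchanged.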
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )


    algorithm = DIAYN_BD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],

        # Additional params for behaviour tracking
        metric=variant['metric'],
        env_id=variant['prefix'],
        eval_freq=variant['eval_freq'],
        log_dir=get_logdir(args, variant),

    )

    algorithm.train()
Example #16
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))
    env = DelayedEnv(env, delay=0.01)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = RemoteSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size']
    )

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reparameterize=variant['reparameterize'],
        reg=0.001,
    )
    

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],

        reparameterize=variant['reparameterize'],
        save_full_state=False,
    )

    algorithm.train()
Example #17
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    elif variant["env_name"] == "Point2D-v0":
        import sac.envs.point2d_env
        env = GymEnv(variant["env_name"])
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(min_pool_size=variant['max_path_length'],
                       epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       max_path_length=variant['max_path_length'],
                       batch_size=variant['batch_size'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=SimpleSampler(
                           max_path_length=variant["max_path_length"],
                           min_pool_size=variant["max_path_length"],
                           batch_size=variant["batch_size"]))

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GaussianPolicy(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
        reg=0.001,
    )

    # policy = GMMPolicy(
    #     env_spec=aug_env_spec,
    #     K=variant['K'],
    #     hidden_layer_sizes=[M, M],
    #     qf=qf,
    #     reg=0.001,
    # )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(base_kwargs=base_kwargs,
                      env=env,
                      policy=policy,
                      discriminator=discriminator,
                      pool=pool,
                      qf=qf,
                      vf=vf,
                      lr=variant['lr'],
                      scale_entropy=variant['scale_entropy'],
                      discount=variant['discount'],
                      tau=variant['tau'],
                      num_skills=variant['num_skills'],
                      save_full_state=False,
                      include_actions=variant['include_actions'],
                      learn_p_z=variant['learn_p_z'],
                      add_p_z=variant['add_p_z'],
                      reparametrize=variant["reparametrize"])

    algorithm.train()
Example #18
def __init__(self):
    super(OccludedSwimmerEnv, self).__init__(SwimmerEnv(), [2, 3, 4])  # joint angles
Example #19
from rllab.envs.mujoco.swimmer_env import SwimmerEnv

swimmer = SwimmerEnv()
swimmer.reset()
swimmer.get_current_obs()
Example #20
#from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv

stub(globals())
oracle = False
random = True

if oracle:
    env = TfEnv(normalize(SwimmerRandGoalOracleEnv()))
    batch_size = 200
elif random:
    env = TfEnv(normalize(SwimmerRandGoalEnv()))
    batch_size = 200
else:
    env = TfEnv(normalize(SwimmerEnv()))
    batch_size = 20
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(100,100),
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
#baseline = ZeroBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=500*batch_size,
    max_path_length=500,
    n_itr=500,