Example No. 1
def example(variant):
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
def test_dead_grads(self):
    self.env = HalfCheetahEnv()
    algo = NAF(
        self.env,
        self.es,
        QuadraticNAF(name_or_scope='qf', env_spec=self.env.spec),
        n_epochs=0,
    )
    qf = algo.qf
    af = qf.advantage_function
    L_param_gen = af._L_computer
    L = af.L
    last_bs = L_param_gen.get_params_internal()[-1]
    grads_ops = tf.gradients(af.output, last_bs)
    a = np.random.rand(1, algo.action_dim)
    o = np.random.rand(1, algo.observation_dim)
    grads = self.sess.run(grads_ops, {
        qf.action_input: a,
        qf.observation_input: o,
    })[0]
    bs = self.sess.run(last_bs)
    num_elems = bs.size
    length = int(math.sqrt(float(num_elems)))
    expected_zero = length * (length - 1) / 2
    num_zero = np.sum((grads == 0.))
    self.assertAlmostEqual(expected_zero, num_zero)
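The expected_zero count above equals the number of strictly upper-triangular entries of a length x length matrix; the test expects these to be the parameters of the lower-triangular L that receive zero gradient. A quick NumPy check of that count (illustrative only, not part of the test suite; n = 6 is an arbitrary size):

import numpy as np

n = 6
# Number of entries strictly above the diagonal of an n x n matrix.
strictly_upper = np.triu(np.ones((n, n)), k=1)
assert strictly_upper.sum() == n * (n - 1) / 2  # 15 for n = 6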
Example No. 3
            def run_task(*_):
                env = normalize(HalfCheetahEnv())

                policy = DeterministicMLPPolicy(
                    env_spec=env.spec,
                    # The neural network policy should have two hidden layers, each with 32 hidden units.
                    hidden_sizes=(H_layer_first[h], H_layer_second[h]))

                es = OUStrategy(env_spec=env.spec)

                qf = ContinuousMLPQFunction(env_spec=env.spec)

                algo = DDPG(
                    env=env,
                    policy=policy,
                    es=es,
                    qf=qf,
                    batch_size=size_of_batch,
                    max_path_length=100,
                    epoch_length=1000,
                    min_pool_size=10000,
                    n_epochs=number_of_episodes,
                    discount=discount_factor,
                    scale_reward=reward_scaling[r],
                    qf_learning_rate=critic_learning_rate[c],
                    policy_learning_rate=actor_learning_rate[c],
                    # Uncomment both lines (this and the plot parameter below) to enable plotting
                    # plot=True,
                )
                algo.train()
def main():
    stub(globals())

    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = GaussianStrategy(env.spec)
        policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        qf = ContinuousMLPQFunction(name="qf",
                                    env_spec=env.spec,
                                    hidden_sizes=(100, 100))
        ddpg_params = dict(
            batch_size=4,
            n_epochs=100,
            epoch_length=50,
            eval_samples=50,
            max_path_length=10,
            min_pool_size=5,
        )
        algorithm = DDPG(env, policy, qf, es, **ddpg_params)

        for _ in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="check-rllab-ddpg-seed",
                seed=seed,
                variant={"seed": seed},
            )
Example No. 5
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        ddpg_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(env, vitchyr_es, vitchyr_policy, vitchyr_qf,
                            **ddpg_params)

        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(name="qf",
                                          env_spec=env.spec,
                                          hidden_sizes=(100, 100))
        shane_ddpg = ShaneDDPG(env, shane_policy, shane_qf, shane_es,
                               **ddpg_params)

        names_and_algos = [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]
        for name, algorithm in names_and_algos:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
Example No. 6
def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        with tf.Session():
            data = joblib.load(load_policy_file)
            print(data)
            policy = data['policy']
            qf = data['qf']
            replay_buffer = data['pool']
        env = HalfCheetahEnv()
        es = OUStrategy(action_space=env.action_space)
        use_new_version = variant['use_new_version']
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            n_epochs=2,
            batch_size=1024,
            replay_pool=replay_buffer,
            use_new_version=use_new_version,
        )
        algorithm.train()
    else:
        env = HalfCheetahEnv()
        es = OUStrategy(action_space=env.action_space)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        use_new_version = variant['use_new_version']
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            n_epochs=2,
            batch_size=1024,
            use_new_version=use_new_version,
        )
        algorithm.train()
Example No. 7
            def run_task(*_):

                env = normalize(HalfCheetahEnv())

                policy = DeterministicMLPPolicy(
                    env_spec=env.spec,
                    # The neural network policy should have two hidden layers, each with 32 hidden units.
                    hidden_sizes=(32, 32))

                es = OUStrategy(env_spec=env.spec)

                qf = ContinuousMLPQFunction(env_spec=env.spec,
                                            hidden_sizes=(32, 32))
                """
                Persistence Length Exploration
                """
                lp = Persistence_Length_Exploration(
                    env=env,
                    qf=qf,
                    policy=policy,
                    L_p=L_p_param[l_p_ind],
                    b_step_size=b_step_size[b_ind],
                    sigma=sigma_param[s_ind],
                    max_exploratory_steps=max_exploratory_steps_iters,
                    batch_size=batch_size_value,
                    n_epochs=num_episodes,
                    scale_reward=0.01,
                    epoch_length=steps_per_episode,
                    qf_learning_rate=0.001,
                    policy_learning_rate=0.0001,
                )
                """
                DDPG
                """
                algo = DDPG(
                    env=env,
                    policy=policy,
                    es=es,
                    qf=qf,
                    lp=lp,
                    batch_size=batch_size_value,
                    max_path_length=100,
                    epoch_length=steps_per_episode,
                    min_pool_size=10000,
                    n_epochs=num_episodes,
                    discount=0.99,
                    scale_reward=0.01,
                    qf_learning_rate=0.001,
                    policy_learning_rate=0.0001,
                    # Uncomment both lines (this and the plot parameter below) to enable plotting
                    # plot=True,
                )
                algo.train()
Example No. 8
def create_env(which_agent):

    # setup environment
    if (which_agent == 0):
        env = PointEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif (which_agent == 1):
        env = AntEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif (which_agent == 2):
        env = SwimmerEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)  #dt 0.001 and frameskip=150
    elif (which_agent == 3):
        env = ReacherEnv()
        dt_from_xml = env.model.opt.timestep
    elif (which_agent == 4):
        env = HalfCheetahEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)


    # elif (which_agent == 5):
    #     env = RoachEnv()  # this is a personal vrep env
    #     dt_from_xml = env.VREP_DT
    elif (which_agent == 6):
        env = HopperEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif (which_agent == 7):
        env = Walker2DEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)

    #get dt value from env - DOES NOT WORK !!!!
    #     if(which_agent==5):
    #         dt_from_xml = env.VREP_DT
    #     else:
    #         dt_from_xml = env.model.opt.timestep
    print("\n\n the dt is: ", dt_from_xml, "\n\n")

    #set vars
    tf.set_random_seed(2)
    gym.logger.setLevel(gym.logging.WARNING)
    dimO = env.observation_space.shape
    dimA = env.action_space.shape
    print('--------------------------------- \nState space dimension: ', dimO)
    print('Action space dimension: ', dimA,
          "\n -----------------------------------")

    return env, dt_from_xml
Example No. 9
def experiment(variant):
    # env = NormalizedBoxEnv(MultiGoalEnv(
    #     actuation_cost_coeff=10,
    #     distance_cost_coeff=1,
    #     goal_reward=10,
    # ))
    env = NormalizedBoxEnv(HalfCheetahEnv())

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # qf = ExpectableQF(
    # obs_dim=obs_dim,
    # action_dim=action_dim,
    # hidden_size=100,
    # )
    net_size = variant['net_size']
    qf = ConcatMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = ConcatMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    # TODO(vitchyr): just creating the plotter crashes EC2
    # plotter = QFPolicyPlotter(
    # qf=qf,
    # policy=policy,
    # obs_lst=np.array([[-2.5, 0.0],
    # [0.0, 0.0],
    # [2.5, 2.5]]),
    # default_action=[np.nan, np.nan],
    # n_samples=100
    # )
    algorithm = ExpectedSAC(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        # plotter=plotter,
        # render_eval_paths=True,
        **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example No. 10
def run_task(*_):
    """
    DPG on HalfCheetah environment
    """
    env = normalize(HalfCheetahEnv())
    """
    Initialise the policy as a neural network policy
    """
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    """
    Defining exploration strategy : OUStrategy - 
    """
    """
    This strategy implements the Ornstein-Uhlenbeck process, which adds
    time-correlated noise to the actions taken by the deterministic policy.
    The OU process satisfies the following stochastic differential equation:
    dxt = theta*(mu - xt)*dt + sigma*dWt
    where Wt denotes the Wiener process
    """
    es = OUStrategy(env_spec=env.spec)
    """
    Defining the Q network
    """
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    """
    Persistence Length Exploration
    """
    lp = Persistence_Length_Exploration(env=env, qf=qf, policy=policy)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=32,
        max_path_length=1000,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=15000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Plotting is enabled here; set plot=False to disable it.
        plot=True,
    )

    algo.train()
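The OU process described in the docstring above can be discretized with a simple Euler-Maruyama step to generate time-correlated action noise. A minimal, self-contained sketch (not the rllab OUStrategy implementation; theta, mu, sigma, dt, and the action dimension are illustrative values):

import numpy as np

def ou_noise(action_dim, n_steps, theta=0.15, mu=0.0, sigma=0.3, dt=1.0, seed=0):
    """Discretize dx_t = theta*(mu - x_t)*dt + sigma*dW_t with Euler-Maruyama steps."""
    rng = np.random.default_rng(seed)
    x = np.zeros(action_dim)
    samples = []
    for _ in range(n_steps):
        # dW_t is Gaussian with variance dt, hence standard deviation sqrt(dt).
        dw = rng.normal(scale=np.sqrt(dt), size=action_dim)
        x = x + theta * (mu - x) * dt + sigma * dw
        samples.append(x.copy())
    return np.array(samples)

# Time-correlated noise that would be added to the deterministic policy's actions;
# HalfCheetah has a 6-dimensional action space.
noise = ou_noise(action_dim=6, n_steps=1000)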
Example No. 11
def run_task(variant):
    import tensorflow as tf
    from railrl.railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv

    env = TfEnv(HalfCheetahEnv())
    algo_name = variant['Algorithm']
    if algo_name == 'Quadratic-DDPG':
        qf = QuadraticNAF(
            name_or_scope="quadratic_qf",
            env_spec=env.spec,
        )
    elif algo_name == 'DDPG':
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            embedded_hidden_sizes=(100, ),
            observation_hidden_sizes=(100, ),
            hidden_nonlinearity=tf.nn.relu,
        )
    else:
        raise Exception('Algo name not recognized: {0}'.format(algo_name))

    es = OUStrategy(env_spec=env.spec)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )

    ddpg_params = dict(
        batch_size=128,
        n_epochs=20,
        epoch_length=10000,
        eval_samples=10000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    algorithm.train()
def main():
    stub(globals())
    ddpg_params = dict(
        batch_size=64,
        n_epochs=2000,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        soft_target_tau=0.001,
        replay_pool_size=1000000,
        min_pool_size=1000,
        scale_reward=0.1,
    )
    env = TfEnv(HalfCheetahEnv())
    es = OUStrategy(env_spec=env.spec)

    policy = DeterministicMLPPolicy(
        name="init_policy",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )
    qf = ContinuousMLPQFunction(
        name="qf",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        bn=False,
    )

    algorithm = DDPG(
        env,
        policy,
        qf,
        es,
        **ddpg_params
    )

    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix="ddpg-shane-half-cheetah-script",
        seed=1,
        variant=ddpg_params,
    )
Example No. 13
    def _setup_world(self, filename):
        """
        Helper method for handling setup of the MuJoCo world.
        Args:
            filename: Path to XML file containing the world information.
        """
        self._world = []
        self._model = []

        # Initialize MuJoCo worlds, one per condition.
        for i in range(self._hyperparams['conditions']):
            self._world.append(HalfCheetahEnv())
        # Initialize x0.
        self.x0 = []
        self._full_init_state = []
        # pdb.set_trace()
        for i in range(self._hyperparams['conditions']):
            self.x0.append(self._world[i].reset())
            self._full_init_state.append(self._world[i].get_full_state())
def get_env_settings(env_id="", normalize_env=True, gym_name="",
                     env_params=None):
    if env_params is None:
        env_params = {}

    if env_id == 'cart':
        env = CartpoleEnv()
        name = "Cartpole"
    elif env_id == 'cheetah':
        env = HalfCheetahEnv()
        name = "HalfCheetah"
    elif env_id == 'ant':
        env = AntEnv()
        name = "Ant"
    elif env_id == 'point':
        env = gym_env("OneDPoint-v0")
        name = "OneDPoint"
    elif env_id == 'reacher':
        env = gym_env("Reacher-v1")
        name = "Reacher"
    elif env_id == 'idp':
        env = InvertedDoublePendulumEnv()
        name = "InvertedDoublePendulum"
    elif env_id == 'ocm':
        env = OneCharMemory(**env_params)
        name = "OneCharMemory"
    elif env_id == 'gym':
        if gym_name == "":
            raise Exception("Must provide a gym name")
        env = gym_env(gym_name)
        name = gym_name
    else:
        raise Exception("Unknown env: {0}".format(env_id))
    if normalize_env:
        env = normalize(env)
        name += "-normalized"
    return dict(
        env=env,
        name=name,
        was_env_normalized=normalize_env,
    )
def example(*_):
    env = HalfCheetahEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    )
    algorithm.train()
Example No. 16
def init(env_name, args):
    if env_name == 'SparseMountainCar':
        from rllab_env.sparse_mountain_car import SparseMountainCarEnv
        env = RLLabWrapper(SparseMountainCarEnv())
    elif env_name == 'Ant':
        from rllab_env.ant_env import AntEnv
        env = RLLabWrapper(AntEnv(args))
    elif env_name == 'AntGather':
        from rllab_env.ant_gather_env import AntGatherEnv
        env = RLLabWrapper(AntGatherEnv(args))
    elif env_name == 'HalfCheetah':
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        env = RLLabWrapper(HalfCheetahEnv())
    elif env_name == 'MountainCar':
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        env = RLLabWrapper(MountainCarEnv())
    elif env_name == 'Cartpole':
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        env = RLLabWrapper(CartpoleEnv())
    elif env_name == 'SingleGoal':
        from mazebase import single_goal
        from mazebase_env import single_goal as config
        env = MazeBaseWrapper('SingleGoal', single_goal, config)
    elif env_name == 'sp_goal':
        from mazebase_env import sp_goal
        env = MazeBaseWrapper('sp_goal', sp_goal, sp_goal)
    elif env_name == 'sp_switch':
        from mazebase_env import sp_switch
        config = sp_switch.get_opts_with_args(args)
        sp_switch.get_opts = lambda: config
        env = MazeBaseWrapper('sp_switch', sp_switch, sp_switch)
    elif env_name == 'sp_pick':
        from mazebase_env import sp_pick
        env = MazeBaseWrapper('sp_pick', sp_pick, sp_pick)
    elif "MiniGrid" in env_name:
        env = MinigridWrapper(env_name)    
    else:
        raise RuntimeError("wrong env name")

    return env
Example No. 17
def create_env(which_agent):

    # setup environment
    if (which_agent == 0):
        env = normalize(PointEnv())
    elif (which_agent == 1):
        env = normalize(AntEnv())
    elif (which_agent == 2):
        env = normalize(SwimmerEnv())  #dt 0.001 and frameskip=150
    elif (which_agent == 3):
        env = gym.make("modified_gym_env:ReacherPyBulletEnv-v1")
    elif (which_agent == 4):
        env = normalize(HalfCheetahEnv())
    elif (which_agent == 5):
        env = RoachEnv()  #this is a personal vrep env
    elif (which_agent == 6):
        env = normalize(HopperEnv())
    elif (which_agent == 7):
        env = normalize(Walker2DEnv())

    #get dt value from env
    if (which_agent == 5):
        dt_from_xml = env.VREP_DT
    elif (which_agent == 3):
        dt_from_xml = 0.02
    else:
        dt_from_xml = env.model.opt.timestep
    print("\n\n the dt is: ", dt_from_xml, "\n\n")

    #set vars
    tf.set_random_seed(2)
    gym.logger.setLevel(logging.WARN)
    dimO = env.observation_space.shape
    dimA = env.action_space.shape
    print('--------------------------------- \nState space dimension: ', dimO)
    print('Action space dimension: ', dimA,
          "\n -----------------------------------")

    return env, dt_from_xml
def run_task(_):
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)

        algorithm.train()
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=10000,
        eval_samples=10000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)

    env.reset()
    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix="test-qddpg-cheetah",
        seed=1,
    )
Example No. 20
from rllab.algos.ddpg import DDPG
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction

from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv

env = normalize(HalfCheetahEnv())


def run_task(*_):
    """
    DPG on HalfCheetah environment
    """
    env = normalize(HalfCheetahEnv())
    """
    Initialise the policy as a neural network policy
    """
    # policy = DeterministicMLPPolicy(
    #     env_spec=env.spec,
    #     # The neural network policy should have two hidden layers, each with 32 hidden units.
    #     hidden_sizes=(32, 32)
    # )

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(400, 300))
Example No. 21
def get(perm):
    name = perm["problem"]
    if name.lower() == "cartpole":
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        return normalize(CartpoleEnv())

    elif name.lower() == "mountain car height bonus":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv())

    elif name.lower() == "mountain car":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv(height_bonus=0))

    elif name.lower() == "gym mountain car":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("MountainCarContinuous-v0",
                                record_video=False))

    elif name.lower() == "pendulum":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("Pendulum-v0", record_video=False))

    elif name.lower() == "mujoco double pendulum":
        from rllab.envs.mujoco.inverted_double_pendulum_env import InvertedDoublePendulumEnv
        return normalize(InvertedDoublePendulumEnv())

    elif name.lower() == "double pendulum":
        from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv
        return normalize(DoublePendulumEnv())

    elif name.lower() == "hopper":
        from rllab.envs.mujoco.hopper_env import HopperEnv
        return normalize(HopperEnv())

    elif name.lower() == "swimmer":
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        return normalize(SwimmerEnv())

    elif name.lower() == "2d walker":
        from rllab.envs.mujoco.walker2d_env import Walker2DEnv
        return normalize(Walker2DEnv())

    elif name.lower() == "half cheetah":
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        return normalize(HalfCheetahEnv())

    elif name.lower() == "ant":
        from rllab.envs.mujoco.ant_env import AntEnv
        return normalize(AntEnv())

    elif name.lower() == "simple humanoid":
        from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv
        return normalize(SimpleHumanoidEnv())

    elif name.lower() == "full humanoid":
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        return normalize(HumanoidEnv())

    else:
        raise NotImplementedError(f"Environment {name} unknown")
Example No. 22
def run_task(*_):
    """
    DPG on HalfCheetah environment
    """
    env = normalize(HalfCheetahEnv())
    """
    Initialise the policy as a neural network policy
    """
    # policy = DeterministicMLPPolicy(
    #     env_spec=env.spec,
    #     # The neural network policy should have two hidden layers, each with 32 hidden units.
    #     hidden_sizes=(32, 32)
    # )

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(400, 300))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)
    """
    Using the DDPG algorithm
    """
    # algo = DDPG(
    #     env=env,
    #     policy=policy,
    #     es=es,
    #     qf=qf,
    #     batch_size=32,
    #     max_path_length=500,
    #     epoch_length=500,
    #     min_pool_size=10000,
    #     n_epochs=20000,
    #     discount=0.99,
    #     scale_reward=0.01,
    #     qf_learning_rate=1e-3,
    #     policy_learning_rate=1e-4,
    #     #Uncomment both lines (this and the plot parameter below) to enable plotting
    #     plot=True,
    # )

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=1000,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=20000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=10e-3,
        policy_learning_rate=10e-4,
        # Plotting is enabled here; set plot=False to disable it.
        plot=True,
    )

    algo.train()