def run_task(args, *_):

    #env = TfEnv(normalize(dnc_envs.create_stochastic('pick'))) # Cannot be solved easily by TRPO
    #env = TfEnv(normalize(CartpoleEnv()))
    env = TfEnv(CartpoleEnv())
    #metaworld_env = ML1.get_train_tasks("pick-place-v1")
    #tasks = metaworld_env.sample_tasks(1)
    #metaworld_env.set_task(tasks[0])
    #metaworld_env._observation_space = convert_gym_space(metaworld_env.observation_space)
    #metaworld_env._action_space = convert_gym_space(metaworld_env.action_space)
    #env = TfEnv(normalize(metaworld_env))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        # batch_size=100,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )

    algo.train()
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=100,
    )
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **default_ddpg_params,
    )

    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
Example #3
def example(variant):
    env = CartpoleEnv()
    env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        **variant['qf_params'],
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def run_task(variant):
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.algos.vpg import VPG
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from sandbox.rocky.tf.envs.base import TfEnv

    env_name = variant['Environment']
    if env_name == 'Cartpole':
        env = TfEnv(CartpoleEnv())
    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(100, 100))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algorithm = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=100,
        start_itr=0,
        batch_size=1000,
        max_path_length=1000,
        discount=0.99,
    )
    algorithm.train()
Example #5
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #6
def test_baseline(baseline_cls):
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(6,))
    baseline = baseline_cls(env_spec=env.spec)
    algo = VPG(
        env=env, policy=policy, baseline=baseline,
        n_itr=1, batch_size=1000, max_path_length=100
    )
    algo.train()
 def setUp(self):
     super().setUp()
     self.env = TfEnv(CartpoleEnv())
     self.es = OUStrategy(env_spec=self.env.spec)
     self.sum_policy = SumPolicy(name_or_scope='policies',
                                 observation_dim=4,
                                 action_dim=1)
     self.sum_critic = SumCritic(name_or_scope='qf',
                                 observation_dim=4,
                                 action_dim=1)
Example #8
 def test_issue_3():
     """
     As reported in https://github.com/rllab/rllab/issues/3, the adaptive_std parameter was not functioning properly
     """
     env = CartpoleEnv()
     policy = GaussianMLPPolicy(env_spec=env.spec, adaptive_std=True)
     baseline = ZeroBaseline(env_spec=env.spec)
     algo = TRPO(env=env,
                 policy=policy,
                 baseline=baseline,
                 batch_size=100,
                 n_itr=1)
     algo.train()
def run_task(variant):
    import tensorflow as tf
    from railrl.railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.box2d.cartpole_env import CartpoleEnv

    env = TfEnv(CartpoleEnv())
    algo_name = variant['Algorithm']
    if algo_name == 'Quadratic-DDPG':
        qf = QuadraticNAF(
            name_or_scope="quadratic_qf",
            env_spec=env.spec,
        )
    elif algo_name == 'DDPG':
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            embedded_hidden_sizes=(100, ),
            observation_hidden_sizes=(100, ),
            hidden_nonlinearity=tf.nn.relu,
        )
    else:
        raise Exception('Algo name not recognized: {0}'.format(algo_name))

    es = OUStrategy(env_spec=env.spec)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )

    ddpg_params = dict(
        batch_size=128,
        n_epochs=100,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    algorithm.train()
Example #10
def run_task(*_):
    """
    DPG on Swimmer environment
    """
    env = normalize(CartpoleEnv())
    """
    Initialise the policy as a neural network policy
    """
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    """
    Defining exploration strategy : OUStrategy - 
    """
    """
    This strategy implements the Ornstein-Uhlenbeck process, which adds
    time-correlated noise to the actions taken by the deterministic policy.
    The OU process satisfies the following stochastic differential equation:
    dxt = theta*(mu - xt)*dt + sigma*dWt
    where Wt denotes the Wiener process
    """
    es = OUStrategy(env_spec=env.spec)
    """
    Defining the Q network
    """
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    """
    Using the DDPG algorithm
    """
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=100,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # plot=True enables live plotting of training progress
        plot=True,
    )
    """
    Training the networks based on the DDPG algorithm
    """
    algo.train()
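
The Ornstein-Uhlenbeck update described in the docstring above can be illustrated on its own. The following is a minimal NumPy sketch of the discretized process, not rllab's OUStrategy implementation; the function name ou_noise and the default values of theta, mu, sigma, and dt are illustrative assumptions.

import numpy as np

def ou_noise(n_steps, action_dim, theta=0.15, mu=0.0, sigma=0.3, dt=1.0, seed=0):
    # Euler discretization of dx_t = theta*(mu - x_t)*dt + sigma*dW_t,
    # where the Wiener increment dW_t is drawn as sqrt(dt) * N(0, 1).
    rng = np.random.RandomState(seed)
    x = np.zeros(action_dim)
    samples = []
    for _ in range(n_steps):
        x = x + theta * (mu - x) * dt + sigma * np.sqrt(dt) * rng.randn(action_dim)
        samples.append(x.copy())
    return np.asarray(samples)

# Time-correlated noise of this kind is what the exploration strategy adds to the
# deterministic policy's actions during rollouts.
print(ou_noise(n_steps=5, action_dim=1))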
Example #11
def test_ddpg():
    env = CartpoleEnv()
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    algo = DDPG(
        env=env, policy=policy, qf=qf, es=es,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    algo.train()
Example #12
def run_task(v):

    print("_________________________________")
    print("#################################")
    print("_________________________________")
    print("_________________________________")
    print("#################################")
    print("###    agents_number : " + str(agents_number) + "    ####")
    print("###                          ####")
    print("### participation_rate : " + str(participation_rate) + " ####")
    print("###                          ####")
    print("###    average_period : " + str(average_period) + "   ####")
    print("###                          ####")
    print("### quantization_tuning : " + str(quantization_tuning) + " ####")
    print("###                          ####")
    print("###     discount : " + str(discount) + "      ####")
    print("#################################")
    print("_________________________________")
    print("_________________________________")
    print("#################################")
    print("_________________________________")

    env = normalize(CartpoleEnv())

    policy = GaussianMLPPolicy(env_spec=env.spec)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = Server(
        participation_rate=participation_rate,
        agents_number=agents_number,
        average_period=average_period,
        env=env,
        policy=policy,
        baseline=baseline,
        difference_params=True,
        quantize=True,
        quantization_tuning=quantization_tuning,
        batch_size=400,
        max_path_length=100,
        n_itr=50,
        discount=discount,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )

    algo.train()
Example #13
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = GaussianGRUPolicy(env_spec=env.spec)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
    algo.train()
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=32,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=1000,
    )
    sweeper = DeterministicHyperparameterSweeper(
        {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]}, )
    exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp())
    for ddpg_params in sweeper.iterate_hyperparameters():
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            scale_reward=ddpg_params['scale_reward'],
            **default_ddpg_params,
        )

        for seed in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix=exp_prefix,
                seed=seed,
                # mode="local",
                # use_cloudpickle=True,
            )
def get_env_settings(env_id="", normalize_env=True, gym_name="",
                     env_params=None):
    if env_params is None:
        env_params = {}

    if env_id == 'cart':
        env = CartpoleEnv()
        name = "Cartpole"
    elif env_id == 'cheetah':
        env = HalfCheetahEnv()
        name = "HalfCheetah"
    elif env_id == 'ant':
        env = AntEnv()
        name = "Ant"
    elif env_id == 'point':
        env = gym_env("OneDPoint-v0")
        name = "OneDPoint"
    elif env_id == 'reacher':
        env = gym_env("Reacher-v1")
        name = "Reacher"
    elif env_id == 'idp':
        env = InvertedDoublePendulumEnv()
        name = "InvertedDoublePendulum"
    elif env_id == 'ocm':
        env = OneCharMemory(**env_params)
        name = "OneCharMemory"
    elif env_id == 'gym':
        if gym_name == "":
            raise Exception("Must provide a gym name")
        env = gym_env(gym_name)
        name = gym_name
    else:
        raise Exception("Unknown env: {0}".format(env_id))
    if normalize_env:
        env = normalize(env)
        name += "-normalized"
    return dict(
        env=env,
        name=name,
        was_env_normalized=normalize_env,
    )
Example #16
def init(env_name, args):
    if env_name == 'SparseMountainCar':
        from rllab_env.sparse_mountain_car import SparseMountainCarEnv
        env = RLLabWrapper(SparseMountainCarEnv())
    elif env_name == 'Ant':
        from rllab_env.ant_env import AntEnv
        env = RLLabWrapper(AntEnv(args))
    elif env_name == 'AntGather':
        from rllab_env.ant_gather_env import AntGatherEnv
        env = RLLabWrapper(AntGatherEnv(args))
    elif env_name == 'HalfCheetah':
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        env = RLLabWrapper(HalfCheetahEnv())
    elif env_name == 'MountainCar':
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        env = RLLabWrapper(MountainCarEnv())
    elif env_name == 'Cartpole':
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        env = RLLabWrapper(CartpoleEnv())
    elif env_name == 'SingleGoal':
        from mazebase import single_goal
        from mazebase_env import single_goal as config
        env = MazeBaseWrapper('SingleGoal', single_goal, config)
    elif env_name == 'sp_goal':
        from mazebase_env import sp_goal
        env = MazeBaseWrapper('sp_goal', sp_goal, sp_goal)
    elif env_name == 'sp_switch':
        from mazebase_env import sp_switch
        config = sp_switch.get_opts_with_args(args)
        sp_switch.get_opts = lambda: config
        env = MazeBaseWrapper('sp_switch', sp_switch, sp_switch)
    elif env_name == 'sp_pick':
        from mazebase_env import sp_pick
        env = MazeBaseWrapper('sp_pick', sp_pick, sp_pick)
    elif "MiniGrid" in env_name:
        env = MinigridWrapper(env_name)    
    else:
        raise RuntimeError("wrong env name")

    return env
Example #17
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        # plot=True enables live plotting of training progress
        plot=True)
    algo.train()
Example #18
def main():
    stub(globals())
    env = TfEnv(CartpoleEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)

    for seed in range(3):
        env.reset()
        run_experiment_lite(
            algorithm.train(),
            n_parallel=1,
            snapshot_mode="last",
            exp_prefix="test-qddpg-cartpole",
            seed=seed,
        )
Example #19
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from rllab.sampler import parallel_sampler
from lasagne.updates import sgd
from rllab.misc import ext
from lasagne.updates import adam

import matplotlib.pyplot as plt

load_policy = True
# normalize() makes sure that the actions for the environment lie
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(CartpoleEnv())
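# (Illustrative note, an assumption about the wrapper's behavior rather than its exact code:)
# normalize() roughly rescales an agent action a in [-1, 1] to the wrapped env's raw action
# bounds lb, ub before stepping, i.e. scaled_action = lb + (a + 1.0) * 0.5 * (ub - lb),
# and clips the result to [lb, ub].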
# Initialize a neural network policy with a single hidden layer of 8 hidden units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ))
parallel_sampler.populate_task(env, policy)

# policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
# distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
# the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
# rllab.distributions.DiagonalGaussian.
# (A sketch of how this symbolic log-likelihood is typically used appears after this snippet.)
dist = policy.distribution
# We will collect N = 10 trajectories per iteration
N = 10
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 1000
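
The comment above introduces dist.log_likelihood_sym, but the snippet is cut off before showing how it is used. Below is a minimal sketch of the return-weighted log-likelihood (REINFORCE) surrogate that this kind of tutorial typically builds next. It reuses the symbols defined above (env, policy, dist, theano, TT, adam) and assumes rllab's Theano API (new_tensor_variable, dist_info_sym, get_params); the learning rate is an illustrative value, not the author's.

# Symbolic variables for a batch of observations, actions, and empirical returns.
observations_var = env.observation_space.new_tensor_variable('observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
returns_var = TT.vector('returns')

# Symbolic distribution parameters (e.g. mean and log_std) of the policy at each observation.
dist_info_vars = policy.dist_info_sym(observations_var)

# Surrogate loss: negative mean of the return-weighted log-likelihood of the taken actions.
surr = -TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * returns_var)

params = policy.get_params(trainable=True)
f_train = theano.function(
    inputs=[observations_var, actions_var, returns_var],
    outputs=None,
    updates=adam(theano.grad(surr, params), params, learning_rate=1e-3),
    allow_input_downcast=True,
)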
Example #20
from __future__ import print_function
from __future__ import absolute_import

from sandbox.rocky.tf.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
Example #21
# misc params
parser.add_argument("--debug", type=int, default=0)
parser.add_argument("--seed", type=int, default=456)
parser.add_argument("--expert_data_path",
                    type=str,
                    default="expert_trajs/racing/Racing-State-0")

args = parser.parse_args()

if __name__ == "__main__":
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    #env = TfEnv(normalize(CartpoleEnv())) ## normalize or not ?
    if args.environment == "CartPole":
        env = TfEnv(CartpoleEnv())
    elif args.environment == "Pendulum":
        env = gym.make("Pendulum-v0")
        env = TfEnv(env)
        #t_hidden_sizes = ()
    elif args.environment == "NoisyPendulum":
        gym.envs.register(
            id="NoisyPendulum-v0",
            entry_point='rllab.envs.target_env:NoisyPendulum',
            timestep_limit=999,
            reward_threshold=195.0,
        )
        env = TfEnv(GymEnv("NoisyPendulum-v0"))
    elif args.environment in ["Racing-State", "Racing-State-Action"]:
        #env = TfEnv(CarRacing(mode="pixels"))
        if args.environment == "Racing-State":
Example #22
def get(perm):
    name = perm["problem"]
    if name.lower() == "cartpole":
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        return normalize(CartpoleEnv())

    elif name.lower() == "mountain car height bonus":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv())

    elif name.lower() == "mountain car":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv(height_bonus=0))

    elif name.lower() == "gym mountain car":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("MountainCarContinuous-v0",
                                record_video=False))

    elif name.lower() == "pendulum":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("Pendulum-v0", record_video=False))

    elif name.lower() == "mujoco double pendulum":
        from rllab.envs.mujoco.inverted_double_pendulum_env import InvertedDoublePendulumEnv
        return normalize(InvertedDoublePendulumEnv())

    elif name.lower() == "double pendulum":
        from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv
        return normalize(DoublePendulumEnv())

    elif name.lower() == "hopper":
        from rllab.envs.mujoco.hopper_env import HopperEnv
        return normalize(HopperEnv())

    elif name.lower() == "swimmer":
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        return normalize(SwimmerEnv())

    elif name.lower() == "2d walker":
        from rllab.envs.mujoco.walker2d_env import Walker2DEnv
        return normalize(Walker2DEnv())

    elif name.lower() == "half cheetah":
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        return normalize(HalfCheetahEnv())

    elif name.lower() == "ant":
        from rllab.envs.mujoco.ant_env import AntEnv
        return normalize(AntEnv())

    elif name.lower() == "simple humanoid":
        from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv
        return normalize(SimpleHumanoidEnv())

    elif name.lower() == "full humanoid":
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        return normalize(HumanoidEnv())

    else:
        raise NotImplementedError(f"Environment {name} unknown")
Example #23
        SimpleHumanoidEnv,
        InvertedDoublePendulumEnv,
        HopperEnv,
        HalfCheetahEnv,
        PointGatherEnv,
        SwimmerGatherEnv,
        AntGatherEnv,
        PointMazeEnv,
        SwimmerMazeEnv,
        AntMazeEnv,
    ])

envs = [cls() for cls in simple_env_classes]
envs.append(ProxyEnv(envs[0]))
envs.append(IdentificationEnv(CartpoleEnv, {}))
envs.append(NoisyObservationEnv(CartpoleEnv()))
envs.append(DelayedActionEnv(CartpoleEnv()))
envs.append(NormalizedEnv(CartpoleEnv()))
envs.append(GymEnv('CartPole-v0'))


@tools.params(*envs)
def test_env(env):
    print("Testing", env.__class__)
    ob_space = env.observation_space
    act_space = env.action_space
    ob = env.reset()
    assert ob_space.contains(ob)
    a = act_space.sample()
    assert act_space.contains(a)
    res = env.step(a)
Example #25
 def __init__(self, num_steps=100, position_only=True):
     assert position_only, "I only added position_only due to some weird " \
                           "serialization bug"
     CartpoleEnv.__init__(self, position_only=position_only)
     self.num_steps = num_steps
Example #26
parser.add_argument("--batch_size",type=int,default=40 * 200)
parser.add_argument("--environment",type=str,default="Racing-State-Action")

parser.add_argument("--normalize",type=int,default=0)
parser.add_argument("--recurrent",type=int,default=0)

# Network Params
parser.add_argument("--hidden_sizes",type=int,nargs="+",default=[32,32,32,16])
parser.add_argument("--nonlinearity",type=str,default="tanh")

args = parser.parse_args()

#env = TfEnv(normalize(CartpoleEnv())) ## normalize or not ?
nonlin = {"relu":tf.nn.relu,"tanh":tf.nn.tanh,"elu":tf.nn.elu}[args.nonlinearity]
if args.environment == "CartPole":
    env = CartpoleEnv()
elif args.environment == "Pendulum":
    env = gym.make("Pendulum-v0")
elif args.environment == "Racing-State":
    env = CarRacing(mode='state',features=args.features)
elif args.environment == "Racing-State-Action":
    env = CarRacing(mode='state_action',features=args.features)
env = TfEnv(env)

if args.normalize:
    assert False

if args.recurrent:
    feat_net = MLP("feat_net", env.observation_space.shape, args.hidden_sizes[-1], args.hidden_sizes[:-1], nonlin, nonlin)
    policy = GaussianGRUPolicy("policy", env_spec=env.spec, hidden_dim=32,
                              feature_network=feat_net,
Example #27


##############################################################



if __name__ == '__main__':
    Transition = collections.namedtuple('Transition',
                                        ('state', 'action', 'reward'))

    experiments = 5
    ALL_REWARDS = []
    for i in range(experiments):
        REWARDS = []

        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        env = Rllab2GymWrapper(CartpoleEnv())

        # set_all_seeds(0)

        # N = batch size, B = mini batch size, m = sub iteration
        agent = Agent(4, 1, N=10, B=5, m=2)
        agent.train(episodes=int(2000 / 20), horizon=100, max_reward=900)

        ALL_REWARDS.append(REWARDS)

    ALL_REWARDS = np.mean(np.array(ALL_REWARDS), axis=0)
    np.savetxt("cartpole-spider-policy-2000t-mean5.csv",
               np.transpose(np.array(ALL_REWARDS)), delimiter=',')