def run_task(args, *_):
    # env = TfEnv(normalize(dnc_envs.create_stochastic('pick')))  # Cannot be solved easily by TRPO
    # env = TfEnv(normalize(CartpoleEnv()))
    env = TfEnv(CartpoleEnv())
    # metaworld_env = ML1.get_train_tasks("pick-place-v1")
    # tasks = metaworld_env.sample_tasks(1)
    # metaworld_env.set_task(tasks[0])
    # metaworld_env._observation_space = convert_gym_space(metaworld_env.observation_space)
    # metaworld_env._action_space = convert_gym_space(metaworld_env.action_space)
    # env = TfEnv(normalize(metaworld_env))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        # batch_size=100,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )
    algo.train()
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=100,
    )
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **default_ddpg_params,
    )
    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
def example(variant):
    env = CartpoleEnv()
    env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        **variant['qf_params'],
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def run_task(variant):
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.algos.vpg import VPG
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from sandbox.rocky.tf.envs.base import TfEnv

    env_name = variant['Environment']
    if env_name == 'Cartpole':
        env = TfEnv(CartpoleEnv())
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algorithm = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=100,
        start_itr=0,
        batch_size=1000,
        max_path_length=1000,
        discount=0.99,
    )
    algorithm.train()
def run_task(*_):
    env = normalize(CartpoleEnv())
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def test_baseline(baseline_cls):
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(6,))
    baseline = baseline_cls(env_spec=env.spec)
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=1,
        batch_size=1000,
        max_path_length=100,
    )
    algo.train()
def setUp(self):
    super().setUp()
    self.env = TfEnv(CartpoleEnv())
    self.es = OUStrategy(env_spec=self.env.spec)
    self.sum_policy = SumPolicy(name_or_scope='policies',
                                observation_dim=4,
                                action_dim=1)
    self.sum_critic = SumCritic(name_or_scope='qf',
                                observation_dim=4,
                                action_dim=1)
def test_issue_3():
    """
    As reported in https://github.com/rllab/rllab/issues/3, the
    adaptive_std parameter was not functioning properly.
    """
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, adaptive_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100,
        n_itr=1,
    )
    algo.train()
def run_task(variant):
    import tensorflow as tf
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.box2d.cartpole_env import CartpoleEnv

    env = TfEnv(CartpoleEnv())
    algo_name = variant['Algorithm']
    if algo_name == 'Quadratic-DDPG':
        qf = QuadraticNAF(
            name_or_scope="quadratic_qf",
            env_spec=env.spec,
        )
    elif algo_name == 'DDPG':
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            embedded_hidden_sizes=(100,),
            observation_hidden_sizes=(100,),
            hidden_nonlinearity=tf.nn.relu,
        )
    else:
        raise Exception('Algo name not recognized: {0}'.format(algo_name))
    es = OUStrategy(env_spec=env.spec)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    ddpg_params = dict(
        batch_size=128,
        n_epochs=100,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    algorithm.train()
def run_task(*_): """ DPG on Swimmer environment """ env = normalize(CartpoleEnv()) """ Initialise the policy as a neural network policy """ policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32)) """ Defining exploration strategy : OUStrategy - """ """ This strategy implements the Ornstein-Uhlenbeck process, which adds time-correlated noise to the actions taken by the deterministic policy. The OU process satisfies the following stochastic differential equation: dxt = theta*(mu - xt)*dt + sigma*dWt where Wt denotes the Wiener process """ es = OUStrategy(env_spec=env.spec) """ Defining the Q network """ qf = ContinuousMLPQFunction(env_spec=env.spec) """ Using the DDPG algorithm """ algo = DDPG( env=env, policy=policy, es=es, qf=qf, batch_size=32, max_path_length=100, epoch_length=1000, min_pool_size=10000, n_epochs=100, discount=0.99, scale_reward=0.01, qf_learning_rate=1e-3, policy_learning_rate=1e-4, #Uncomment both lines (this and the plot parameter below) to enable plotting plot=True, ) """ Training the networks based on the DDPG algorithm """ algo.train()
def test_ddpg():
    env = CartpoleEnv()
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        qf=qf,
        es=es,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    algo.train()
def run_task(v):
    # Print a banner summarising the experiment's hyperparameters.
    print("#################################")
    print("### agents_number       : " + str(agents_number))
    print("### participation_rate  : " + str(participation_rate))
    print("### average_period      : " + str(average_period))
    print("### quantization_tuning : " + str(quantization_tuning))
    print("### discount            : " + str(discount))
    print("#################################")

    env = normalize(CartpoleEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = Server(
        participation_rate=participation_rate,
        agents_number=agents_number,
        average_period=average_period,
        env=env,
        policy=policy,
        baseline=baseline,
        difference_params=True,
        quantize=True,
        quantization_tuning=quantization_tuning,
        batch_size=400,
        max_path_length=100,
        n_itr=50,
        discount=discount,
        step_size=0.01,
        # Uncomment the line below to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    env = normalize(CartpoleEnv())
    policy = GaussianGRUPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
    )
    algo.train()
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=32,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=1000,
    )
    sweeper = DeterministicHyperparameterSweeper(
        {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]},
    )
    exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp())
    for ddpg_params in sweeper.iterate_hyperparameters():
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            scale_reward=ddpg_params['scale_reward'],
            **default_ddpg_params,
        )
        for seed in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix=exp_prefix,
                seed=seed,
                # mode="local",
                # use_cloudpickle=True,
            )
def get_env_settings(env_id="", normalize_env=True, gym_name="", env_params=None):
    if env_params is None:
        env_params = {}

    if env_id == 'cart':
        env = CartpoleEnv()
        name = "Cartpole"
    elif env_id == 'cheetah':
        env = HalfCheetahEnv()
        name = "HalfCheetah"
    elif env_id == 'ant':
        env = AntEnv()
        name = "Ant"
    elif env_id == 'point':
        env = gym_env("OneDPoint-v0")
        name = "OneDPoint"
    elif env_id == 'reacher':
        env = gym_env("Reacher-v1")
        name = "Reacher"
    elif env_id == 'idp':
        env = InvertedDoublePendulumEnv()
        name = "InvertedDoublePendulum"
    elif env_id == 'ocm':
        env = OneCharMemory(**env_params)
        name = "OneCharMemory"
    elif env_id == 'gym':
        if gym_name == "":
            raise Exception("Must provide a gym name")
        env = gym_env(gym_name)
        name = gym_name
    else:
        raise Exception("Unknown env: {0}".format(env_id))

    if normalize_env:
        env = normalize(env)
        name += "-normalized"

    return dict(
        env=env,
        name=name,
        was_env_normalized=normalize_env,
    )
def init(env_name, args):
    if env_name == 'SparseMountainCar':
        from rllab_env.sparse_mountain_car import SparseMountainCarEnv
        env = RLLabWrapper(SparseMountainCarEnv())
    elif env_name == 'Ant':
        from rllab_env.ant_env import AntEnv
        env = RLLabWrapper(AntEnv(args))
    elif env_name == 'AntGather':
        from rllab_env.ant_gather_env import AntGatherEnv
        env = RLLabWrapper(AntGatherEnv(args))
    elif env_name == 'HalfCheetah':
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        env = RLLabWrapper(HalfCheetahEnv())
    elif env_name == 'MountainCar':
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        env = RLLabWrapper(MountainCarEnv())
    elif env_name == 'Cartpole':
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        env = RLLabWrapper(CartpoleEnv())
    elif env_name == 'SingleGoal':
        from mazebase import single_goal
        from mazebase_env import single_goal as config
        env = MazeBaseWrapper('SingleGoal', single_goal, config)
    elif env_name == 'sp_goal':
        from mazebase_env import sp_goal
        env = MazeBaseWrapper('sp_goal', sp_goal, sp_goal)
    elif env_name == 'sp_switch':
        from mazebase_env import sp_switch
        config = sp_switch.get_opts_with_args(args)
        sp_switch.get_opts = lambda: config
        env = MazeBaseWrapper('sp_switch', sp_switch, sp_switch)
    elif env_name == 'sp_pick':
        from mazebase_env import sp_pick
        env = MazeBaseWrapper('sp_pick', sp_pick, sp_pick)
    elif "MiniGrid" in env_name:
        env = MinigridWrapper(env_name)
    else:
        raise RuntimeError("wrong env name")
    return env
def run_task(*_):
    env = normalize(CartpoleEnv())
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,  # set to False to disable live plotting
    )
    algo.train()
def main():
    stub(globals())
    env = TfEnv(CartpoleEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    for seed in range(3):
        env.reset()
        run_experiment_lite(
            algorithm.train(),
            n_parallel=1,
            snapshot_mode="last",
            exp_prefix="test-qddpg-cartpole",
            seed=seed,
        )
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from rllab.sampler import parallel_sampler
from rllab.misc import ext
from lasagne.updates import sgd, adam
import matplotlib.pyplot as plt

load_policy = True

# normalize() makes sure that the actions for the environment lie
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(CartpoleEnv())
# Initialize a neural network policy with a single hidden layer of 8 hidden units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))
parallel_sampler.populate_task(env, policy)

# policy.distribution returns a distribution object under rllab.distributions.
# It contains many utilities for computing distribution-related quantities,
# given the computed dist_info_vars. Below we use dist.log_likelihood_sym to
# compute the symbolic log-likelihood. For this example, the corresponding
# distribution is an instance of the class rllab.distributions.DiagonalGaussian.
dist = policy.distribution

# We will collect N = 10 trajectories per iteration
N = 10
# Each trajectory will have at most T = 100 time steps
T = 100
# Number of iterations
n_itr = 1000
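# A hedged sketch (following the rllab vanilla-policy-gradient tutorial that
# this snippet appears to come from) of how the pieces above are typically
# wired together: a symbolic surrogate loss -E[log pi(a|s) * R] built from
# dist.log_likelihood_sym, compiled into an Adam update. The variable names
# below are assumptions, not part of the original file.
observations_var = env.observation_space.new_tensor_variable('observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
returns_var = TT.vector('returns')
dist_info_vars = policy.dist_info_sym(observations_var)
# Negated so that minimising the surrogate maximises expected return.
surr = -TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * returns_var)
params = policy.get_params(trainable=True)
f_train = ext.compile_function(
    inputs=[observations_var, actions_var, returns_var],
    outputs=None,
    updates=adam(surr, params, learning_rate=1e-2),
)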
from __future__ import print_function
from __future__ import absolute_import

from sandbox.rocky.tf.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
# misc params
parser.add_argument("--debug", type=int, default=0)
parser.add_argument("--seed", type=int, default=456)
parser.add_argument("--expert_data_path", type=str,
                    default="expert_trajs/racing/Racing-State-0")

args = parser.parse_args()

if __name__ == "__main__":
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)
    # env = TfEnv(normalize(CartpoleEnv()))  # normalize or not?
    if args.environment == "CartPole":
        env = TfEnv(CartpoleEnv())
    elif args.environment == "Pendulum":
        env = gym.make("Pendulum-v0")
        env = TfEnv(env)
        # t_hidden_sizes = ()
    elif args.environment == "NoisyPendulum":
        gym.envs.register(
            id="NoisyPendulum-v0",
            entry_point='rllab.envs.target_env:NoisyPendulum',
            timestep_limit=999,
            reward_threshold=195.0,
        )
        env = TfEnv(GymEnv("NoisyPendulum-v0"))
    elif args.environment in ["Racing-State", "Racing-State-Action"]:
        # env = TfEnv(CarRacing(mode="pixels"))
        if args.environment == "Racing-State":
def get(perm):
    name = perm["problem"]
    if name.lower() == "cartpole":
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        return normalize(CartpoleEnv())
    elif name.lower() == "mountain car height bonus":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv())
    elif name.lower() == "mountain car":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv(height_bonus=0))
    elif name.lower() == "gym mountain car":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("MountainCarContinuous-v0", record_video=False))
    elif name.lower() == "pendulum":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("Pendulum-v0", record_video=False))
    elif name.lower() == "mujoco double pendulum":
        from rllab.envs.mujoco.inverted_double_pendulum_env import InvertedDoublePendulumEnv
        return normalize(InvertedDoublePendulumEnv())
    elif name.lower() == "double pendulum":
        from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv
        return normalize(DoublePendulumEnv())
    elif name.lower() == "hopper":
        from rllab.envs.mujoco.hopper_env import HopperEnv
        return normalize(HopperEnv())
    elif name.lower() == "swimmer":
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        return normalize(SwimmerEnv())
    elif name.lower() == "2d walker":
        from rllab.envs.mujoco.walker2d_env import Walker2DEnv
        return normalize(Walker2DEnv())
    elif name.lower() == "half cheetah":
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        return normalize(HalfCheetahEnv())
    elif name.lower() == "ant":
        from rllab.envs.mujoco.ant_env import AntEnv
        return normalize(AntEnv())
    elif name.lower() == "simple humanoid":
        from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv
        return normalize(SimpleHumanoidEnv())
    elif name.lower() == "full humanoid":
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        return normalize(HumanoidEnv())
    else:
        raise NotImplementedError(f"Environment {name} unknown")
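# Hypothetical usage of the factory above: `perm` is a config dict whose
# "problem" key selects the environment. The sweep framework that normally
# builds `perm` is assumed, not shown in the snippet.
env = get({"problem": "cartpole"})
obs = env.reset()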
    SimpleHumanoidEnv,
    InvertedDoublePendulumEnv,
    HopperEnv,
    HalfCheetahEnv,
    PointGatherEnv,
    SwimmerGatherEnv,
    AntGatherEnv,
    PointMazeEnv,
    SwimmerMazeEnv,
    AntMazeEnv,
])

envs = [cls() for cls in simple_env_classes]
envs.append(ProxyEnv(envs[0]))
envs.append(IdentificationEnv(CartpoleEnv, {}))
envs.append(NoisyObservationEnv(CartpoleEnv()))
envs.append(DelayedActionEnv(CartpoleEnv()))
envs.append(NormalizedEnv(CartpoleEnv()))
envs.append(GymEnv('CartPole-v0'))


@tools.params(*envs)
def test_env(env):
    print("Testing", env.__class__)
    ob_space = env.observation_space
    act_space = env.action_space
    ob = env.reset()
    assert ob_space.contains(ob)
    a = act_space.sample()
    assert act_space.contains(a)
    res = env.step(a)
def setUp(self):
    super().setUp()
    self.env = TfEnv(CartpoleEnv())
    self.es = OUStrategy(env_spec=self.env.spec)
def __init__(self, num_steps=100, position_only=True):
    assert position_only, "I only added position_only due to some weird " \
                          "serialization bug"
    CartpoleEnv.__init__(self, position_only=position_only)
    self.num_steps = num_steps
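# A hedged sketch of how a step-limited CartpoleEnv subclass like the one
# above typically enforces its budget: reset a counter in reset() and force
# `done` after num_steps. The counter name and the use of rllab's Step
# namedtuple are assumptions, not taken from the original class.
from rllab.envs.base import Step

def reset(self):
    self._step_count = 0
    return CartpoleEnv.reset(self)

def step(self, action):
    self._step_count += 1
    obs, reward, done, info = CartpoleEnv.step(self, action)
    done = done or self._step_count >= self.num_steps
    return Step(obs, reward, done, **info)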
parser.add_argument("--batch_size",type=int,default=40 * 200) parser.add_argument("--environment",type=str,default="Racing-State-Action") parser.add_argument("--normalize",type=int,default=0) parser.add_argument("--recurrent",type=int,default=0) # Network Params parser.add_argument("--hidden_sizes",type=int,nargs="+",default=[32,32,32,16]) parser.add_argument("--nonlinearity",type=str,default="tanh") args = parser.parse_args() #env = TfEnv(normalize(CartpoleEnv())) ## normalize or not ? nonlin = {"relu":tf.nn.relu,"tanh":tf.nn.tanh,"elu":tf.nn.elu}[args.nonlinearity] if args.environment == "CartPole": env = CartpoleEnv() elif args.environment == "Pendulum": env = gym.make("Pendulum-v0") elif args.environment == "Racing-State": env = CarRacing(mode='state',features=args.features) elif args.environment == "Racing-State-Action": env = CarRacing(mode='state_action',features=args.features) env = TfEnv(env) if args.normalize: assert False if args.recurrent: feat_net = MLP("feat_net", env.observation_space.shape, args.hidden_sizes[-1], args.hidden_sizes[:-1], nonlin, nonlin) policy = GaussianGRUPolicy("policy", env_spec=env.spec, hidden_dim=32, feature_network=feat_net,
##############################################################

if __name__ == '__main__':
    Transition = collections.namedtuple('Transition', ('state', 'action', 'reward'))

    experiments = 5
    ALL_REWARDS = []
    for i in range(experiments):
        REWARDS = []
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        env = Rllab2GymWrapper(CartpoleEnv())
        # set_all_seeds(0)

        # N = batch size, B = mini batch size, m = sub iteration
        agent = Agent(4, 1, N=10, B=5, m=2)
        agent.train(episodes=int(2000 / 20), horizon=100, max_reward=900)
        ALL_REWARDS.append(REWARDS)

    ALL_REWARDS = np.mean(np.array(ALL_REWARDS), axis=0)
    np.savetxt("cartpole-spider-policy-2000t-mean5.csv",
               np.transpose(np.array(ALL_REWARDS)),
               delimiter=',')