Example #1
def create_policy_rllab(policy, env, weights):
    # Create policy
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')
    # Creating the policy
    mean_network = MLP(
        input_shape=(obs_dim, ),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=LI.Normal(),
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # hidden_sizes mirrors the mean network above: empty for 'linear', [16] for 'simple-nn'.
        hidden_sizes=hidden_sizes,
        mean_network=mean_network)
    # Set the weights
    if weights is not None:
        raise Exception('TODO load pickle file.')
    else:
        weights = WEIGHTS
    policy.set_param_values(weights)
    return policy
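The weights branch above is left as a TODO; a minimal sketch of how the pickle loading could look, assuming the file holds a flat parameter vector compatible with policy.set_param_values (the helper name and file layout are assumptions, not part of the original example):

import pickle

import numpy as np

def load_policy_weights(path):
    # Hypothetical helper: load a flat parameter vector saved with pickle.
    # Adjust the unpacking to match however the weights were actually serialized.
    with open(path, 'rb') as f:
        data = pickle.load(f)
    if isinstance(data, dict) and 'params' in data:
        data = data['params']
    return np.asarray(data, dtype=np.float32)

# Inside create_policy_rllab, the TODO branch could then become:
#     if weights is not None:
#         policy.set_param_values(load_policy_weights(weights))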
Example #2
def run_task(*_):
    env = normalize(
        GymEnv("DartHopper-v1", record_log=False, record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Two hidden layers with 128 and 64 units.
        hidden_sizes=(128, 64),
        net_mode=0,
    )

    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)

    algo = PPO_Clip_Sym(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        max_path_length=env.horizon,
        n_itr=200,
        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        whole_paths=False,
        observation_permutation=np.array(
            [0.0001, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        action_permutation=np.array([0.0001, 1, 2]),
        sym_loss_weight=0.0,
    )
    algo.train()
Example #3
def run_multi_vpg_stein_no_critic(*_):
    env = normalize(env_name())
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(25, 16,),
        adaptive_std=False,
    )
    policy_list = [GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(25, 16,),
        adaptive_std=False,
    ) for i in range(num_of_agents)]
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    baseline_list = [LinearFeatureBaseline(env_spec=env.spec) for i in range(num_of_agents)]
    print("Iteration Number: {:}".format(n_itr))
    print("Learning Rate : {:}".format(learning_rate))
    algo = VPG_multi_Stein(
        num_of_agents = num_of_agents,
        temp = temperature,
        env=env,
        policy=policy,
        policy_list = policy_list,
        baseline=baseline,
        baseline_list=baseline_list,
        batch_size=batch_size,
        max_path_length=500,
        n_itr=n_itr,
        discount=0.99,
        learning_rate = learning_rate,
        with_critic = True,
    )
    algo.train()
Example #4
def test_trpo_deterministic_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(1,))
    policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env, policy=policy, baseline=baseline, n_itr=10, batch_size=1000, max_path_length=100,
        step_size=0.01
    )
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
Example #5
def test_trpo_relu_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_nonlinearity=naive_relu,
        hidden_sizes=(1,))
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env, policy=policy, baseline=baseline, n_itr=1, batch_size=1000, max_path_length=100,
        step_size=0.001
    )
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
def rllab_trpo_launcher(variant):
	from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
	from rllab.algos.trpo import TRPO
	from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
	

	from railrl.launchers.launcher_util import get_env_settings
	import lasagne.nonlinearities as NL

	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	policy = GaussianMLPPolicy(
		env_spec=env.spec,
		hidden_sizes=(64, 32),
		output_nonlinearity=NL.tanh,
	)
	baseline = LinearFeatureBaseline(
		env.spec,
	)
	batch_size = 5000
	algorithm = TRPO(
		env=env,
		policy=policy,
		baseline=baseline,
		whole_paths=True,
		max_path_length=500,
		n_itr=1000,
		step_size=0.01,
		subsample_factor=1.0,
	)
	algorithm.train()
Example #7
def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example with a Discrete action space, a
    # CategoricalMLPPolicy works, but for a Box action space may need to use
    # a GaussianMLPPolicy (see the trpo_gym_pendulum.py example)
    env = normalize(
        GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # policy = CategoricalMLPPolicy(
    #     env_spec=env.spec,
    #     # The neural network policy should have two hidden layers, each with 32 hidden units.
    #     hidden_sizes=(32, 32)
    # )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Two hidden layers, each with 64 hidden units.
        hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    # max_path_length = env.horizon
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=300,
        n_itr=10000,
        discount=0.99,
        # step_size=0.02,
        truncate_local_is_ratio=0.2
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #8
def run_task(*_):
    env = normalize(Gomoku())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Three hidden layers with 100, 50, and 25 units.
        hidden_sizes=(100, 50, 25),
        output_nonlinearity=NL.tanh)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        max_path_length=110,
        n_itr=500,
        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
        #epopt_epsilon = 1.0,
        #epopt_after_iter = 0,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #9
def run_vpg_baseline_large_batch_size_no_critic(*_):
    env = normalize(env_name())
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(
            100,
            50,
            25,
        ),
        adaptive_std=False,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    print("Iteration Number: {:}".format(n_itr))
    print("Learning Rate : {:}".format(learning_rate))
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size * num_of_agents,
        max_path_length=500,
        n_itr=n_itr,
        discount=0.99,
        optimizer_args={'learning_rate': learning_rate},
        sampler_cls=BatchSampler_no_critic,
    )
    algo.train()
Example #10
def run_task(*_):
    env_name = "BottleneckEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=400,
        discount=0.995,
        # step_size=0.01,
    )
    algo.train()
def run_task(*_):
    env = normalize(TendonTwoSegmentSE3Env())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64)
        #         output_nonlinearity=NL.tanh
        #  std_hidden_nonlinearity=NL.rectify,
        #  hidden_nonlinearity=NL.rectify,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10,
        max_path_length=np.inf,
        n_itr=10,
        discount=0.99,
        step_size=0.01,
        gae_lambda=1,
    )
    algo.train()
def run_task(vv, log_dir=None, exp_name=None):

    # Load environment
    radius = vv['radius']
    target_velocity = vv['target_velocity']
    env = normalize(CircleEnv(radius, target_velocity))

    # Save variant information for comparison plots
    variant_file = logger.get_snapshot_dir() + '/variant.json'
    logger.log_variant(variant_file, vv)

    # Train policy using TRPO
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32)
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1000,
        max_path_length=env.horizon,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=False,
    )
    algo.train()
Example #13
def run_task(*_):
    # Please note that different environments with different action spaces may require different
    # policies. For example with a Box action space, a GaussianMLPPolicy works, but for a Discrete
    # action space may need to use a CategoricalMLPPolicy (see the trpo_gym_cartpole.py example)
    env = normalize(GymEnv("Pendulum-v0", record_video=False,
                           force_reset=True))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5, symmetric=False))
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
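In the rllab examples, run_task functions like the one above are typically launched via run_experiment_lite; an illustrative driver (the seed, parallelism, and snapshot settings are placeholder values):

from rllab.misc.instrument import run_experiment_lite

# Illustrative launcher; the settings below are assumptions, not part of the example.
run_experiment_lite(
    run_task,
    n_parallel=1,          # number of parallel sampling workers
    snapshot_mode="last",  # only keep the parameters of the last iteration
    seed=1,
    # plot=True,
)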
Example #14
def run_task(_):
    env_name = "PlatooningEnv"

    register(
        id=env_name+'-v0',
        entry_point='platooning_env:{}'.format(env_name),
        max_episode_steps=HORIZON,
        kwargs={"env_params": ENV_PARAMS}
    )

    env = GymEnv(env_name, record_video=False)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(16, 16, 16),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=15000,
        max_path_length=horizon,
        n_itr=1000,
        # whole_paths=True,
        discount=0.999,
    )
    algo.train()
def run_task(*_):
    # env = normalize(HalfCheetahEnv())

    env = GymEnv(env_name = "MountainCarContinuous-v0", force_reset=True)

    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    baseline = ZeroBaseline(env_spec=env.spec)
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100,
        max_path_length=100,
        n_itr=10000,
        discount=0.99,
        optimizer_args=dict(
            learning_rate=0.01,
        )
    )
    algo.train()
Example #16
def run_task(*_):
    # Please note that different environments with different action spaces may require different
    # policies. For example with a Box action space, a GaussianMLPPolicy works, but for a Discrete
    # action space may need to use a CategoricalMLPPolicy (see the trpo_gym_cartpole.py example)
    env = normalize(GymEnv("Reacher-v1"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=200,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #17
def run_task(*_):
    env = normalize(Cassie2dEnv())

    if load_policy:
        filename = "123"
        data = joblib.load(filename)
        policy = data['policy']
        print("Loading Pretrained Policy ...............................")
    else:
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            # The neural network policy should have two hidden layers, each with 32 hidden units.
            hidden_sizes=(32, 32),
            init_std=1.0,
            #adaptive_std=True,
        )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10000,
        max_path_length=1000,  # dt = (1/2000)*n, where n is Step(n)
        n_itr=400,
        discount=0.99,
        step_size=0.005,  # default was 0.01
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        plot=False,
    )
    algo.train()
Example #18
def run_task(*_):
        f = open('/home/qingkai/verina.csv', "w+")
        ff = open('/home/qingkai/cpo_dual.csv', "w+")
        trpo_stepsize = 0.01
        trpo_subsample_factor = 0.2
        
        env = AntGatherEnv(apple_reward=10,bomb_cost=1,n_apples=2, activity_range=6)

        policy = GaussianMLPPolicy(env.spec,
                    hidden_sizes=(64,32)
                 )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args={
                    'hidden_sizes': (64,32),
                    'hidden_nonlinearity': NL.tanh,
                    'learn_std':False,
                    'step_size':trpo_stepsize,
                    'optimizer':ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
                    }
        )

        safety_baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args={
                    'hidden_sizes': (64,32),
                    'hidden_nonlinearity': NL.tanh,
                    'learn_std':False,
                    'step_size':trpo_stepsize,
                    'optimizer':ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
                    },
            target_key='safety_returns',
            )

        safety_constraint = GatherSafetyConstraint(max_value=0.2, baseline=safety_baseline)



        algo = CPO(
            env=env,
            policy=policy,
            baseline=baseline,
            safety_constraint=safety_constraint,
            safety_gae_lambda=0.5,
            batch_size=100000,
            max_path_length=500,
            n_itr=2000,
            gae_lambda=0.95,
            discount=0.995,
            step_size=trpo_stepsize,
            optimizer_args={'subsample_factor':trpo_subsample_factor},
            #plot=True,
        )

        algo.train()

        f.close()
        ff.close()
def train(env, policy, policy_init, num_episodes, episode_cap, horizon,
          **alg_args):

    # Getting the environment
    env_class = rllab_env_from_name(env)
    env = normalize(env_class())

    # Policy initialization
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    elif policy_init == 'normal':
        initializer = LI.Normal()
    else:
        raise Exception('Unrecognized policy initialization.')

    # Setting the policy type
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Creating the policy
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim
    mean_network = MLP(
        input_shape=(obs_dim, ),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # hidden_sizes mirrors the mean network above: empty for 'linear', [16] for 'simple-nn'.
        hidden_sizes=hidden_sizes,
        mean_network=mean_network,
        log_weights=True,
    )

    # Creating baseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Adding max_episodes constraint. If -1, this is unbounded
    if episode_cap:
        alg_args['max_episodes'] = num_episodes

    # Run algorithm
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=horizon * num_episodes,
                whole_paths=True,
                max_path_length=horizon,
                **alg_args)
    algo.train()
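A hypothetical invocation of the train helper above; the environment name is a placeholder for whatever rllab_env_from_name accepts, and the extra keyword arguments assume the TRPO variant used here supports them (including the max_episodes cap set inside train):

# Hypothetical call; all values are illustrative.
train(
    env='cartpole',
    policy='linear',
    policy_init='zeros',
    num_episodes=10,
    episode_cap=True,
    horizon=500,
    n_itr=100,
    discount=0.99,
    step_size=0.01,
)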
Example #20
def run_task(*_):
    sumo_params = SumoParams(sim_step=0.1,
                             sumo_binary="sumo")

    vehicles = Vehicles()
    vehicles.add(veh_id="rl",
                 acceleration_controller=(RLController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 speed_mode="no_collide",
                 num_vehicles=1)
    vehicles.add(veh_id="idm",
                 acceleration_controller=(IDMController, {"noise": 0.2}),
                 routing_controller=(ContinuousRouter, {}),
                 speed_mode="no_collide",
                 num_vehicles=13)

    additional_env_params = {"target_velocity": 20,
                             "max_accel": 3, "max_decel": 3}
    env_params = EnvParams(horizon=HORIZON,
                           additional_params=additional_env_params)

    additional_net_params = {"radius_ring": 30, "lanes": 1, "speed_limit": 30,
                             "resolution": 40}
    net_params = NetParams(no_internal_links=False,
                           additional_params=additional_net_params)

    initial_config = InitialConfig(spacing="uniform")

    print("XXX name", exp_tag)
    scenario = Figure8Scenario(exp_tag, Figure8Generator, vehicles, net_params,
                               initial_config=initial_config)

    env_name = "AccelEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(16, 16)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=15000,
        max_path_length=horizon,
        n_itr=500,
        # whole_paths=True,
        discount=0.999,
        # step_size=v["step_size"],
    )
    algo.train()
Example #21
def test_baseline(baseline_cls):
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(6,))
    baseline = baseline_cls(env_spec=env.spec)
    algo = VPG(
        env=env, policy=policy, baseline=baseline,
        n_itr=1, batch_size=1000, max_path_length=100
    )
    algo.train()
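One possible way to drive the parametrized smoke test above directly, assuming the usual rllab baseline classes are importable as in the other examples:

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.baselines.zero_baseline import ZeroBaseline

# Run the smoke test once per baseline implementation.
for baseline_cls in (LinearFeatureBaseline, ZeroBaseline):
    test_baseline(baseline_cls)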
Example #22
def run_task(*_):
    env = normalize(
        GymEnv("DartWalker3d-v1", record_log=False, record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Two hidden layers with 128 and 64 units.
        hidden_sizes=(128, 64),
        net_mode=0,
    )
    #policy = joblib.load('data/local/experiment/walker3d_symmetry1_sd13_2alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_2d_hardvelenforce_contsupport/policy.pkl')

    # increase policy std a bit for exploration
    #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5)

    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)


    algo = TRPO_Symmetry(
        env=env,
        policy=policy,
        baseline=baseline,

        batch_size=60000,

        max_path_length=env.horizon,
        n_itr=500,

        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        observation_permutation=np.array([0.0001,-1, 2,-3,-4, -5,-6,7, 14,-15,-16, 17, 18,-19, 8,-9,-10, 11, 12,-13,\
                                          20,21,-22, 23,-24,-25, -26,-27,28, 35,-36,-37, 38, 39,-40, 29,-30,-31, 32, 33,-34, 42, 41]),
        #observation_permutation=np.array([0.0001, 1, 5,6,7, 2,3,4, 8,9,10, 14,15,16, 11,12,13]),
        #action_permutation=np.array([3,4,5, 0.00001,1,2]),
        action_permutation=np.array([-0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8]),

        sym_loss_weight=2.0,
        whole_paths=False,
    )
    algo.train()
Example #23
def run_task(*_):
    env = normalize(
        GymEnv("DartHumanWalker-v1", record_log=False, record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Two hidden layers with 128 and 64 units.
        hidden_sizes=(128, 64),
        net_mode=0,
    )
    #policy = joblib.load('data/local/experiment/humanwalker_symmetry1_sd11_1alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_spd20002000/policy.pkl')

    # increase policy std a bit for exploration
    #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5)

    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)


    algo = TRPO_Symmetry(
        env=env,
        policy=policy,
        baseline=baseline,

        batch_size=50000,

        max_path_length=env.horizon,
        n_itr=1000,

        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        observation_permutation=np.array([0.0001,-1,2,-3,-4, -11,12,-13,14,15,16, -5,6,-7,8,9,10, -17,18, -19, -24,25,-26,27, -20,21,-22,23,\
                                          28,29,-30,31,-32,-33, -40,41,-42,43,44,45, -34,35,-36,37,38,39, -46,47, -48, -53,54,-55,56, -49,50,-51,52, 58,57]),
        action_permutation=np.array([-6,7,-8, 9, 10,11,  -0.001,1,-2, 3, 4,5, -12,13, -14, -19,20,-21,22, -15,16,-17,18]),

        sym_loss_weight=1.0,
        action_reg_weight=0.0,
        whole_paths=False,
    )
    algo.train()
Example #24
def run_task(v):

    which_agent = v["which_agent"]
    env, _ = create_env(which_agent)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    optimizer_params = dict(base_eps=1e-5)

    #how many iters
    num_trpo_iters = 2500
    if (which_agent == 1):
        num_trpo_iters = 2500
    if (which_agent == 2):
        steps_per_rollout = 333
        num_trpo_iters = 200
    if (which_agent == 4):
        num_trpo_iters = 2000
    if (which_agent == 6):
        num_trpo_iters = 2000

    #recreate the policy
    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(v["depth_fc_layers"],
                                             v["depth_fc_layers"]),
                               init_std=v["std_on_mlp_policy"])
    all_params = np.concatenate(
        (v["policy_values"], policy._l_log_std.get_params()[0].get_value()))
    policy.set_param_values(all_params)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v["trpo_batchsize"],
                max_path_length=v["steps_per_rollout"],
                n_itr=num_trpo_iters,
                discount=0.995,
                optimizer=v["ConjugateGradientOptimizer"](
                    hvp_approach=v["FiniteDifferenceHvp"](**optimizer_params)),
                step_size=0.05,
                plot_true=True)

    #train the policy
    algo.train()
Example #25
def buildPolicy(polValTpl, env):
    #directory to save results for this run - built off policy architecture
    snapShotDir = 'snapShots' + '_'.join([str(x) for x in polValTpl])
    print('snapShotDir : ' + snapShotDir)

    if not os.path.exists(snapShotDir):
        os.makedirs(snapShotDir)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=polValTpl)

    return policy, snapShotDir
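A brief, illustrative usage of buildPolicy; the environment and the layer sizes are assumptions:

from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize

env = normalize(CartpoleEnv())
# (32, 32) is an arbitrary architecture; it also determines the snapshot directory name.
policy, snapshot_dir = buildPolicy((32, 32), env)
print('snapshots will be written to', snapshot_dir)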
Example #26
def run_task(*_):
    env = normalize(PointEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec,)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=500,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
Example #27
def test_issue_3():
    """
    As reported in https://github.com/rllab/rllab/issues/3, the adaptive_std parameter was not functioning properly
    """
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env, adaptive_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=100,
                n_itr=1)
    algo.train()
Example #28
def run_task(*_):
    """Implement the ``run_task`` method needed to run experiments with rllab.

    Note that the flow-specific parameters are imported at the start of this
    script and unzipped and processed here.
    """
    env_name = flow_params["env_name"]
    exp_tag = flow_params["exp_tag"]
    sumo_params = flow_params["sumo"]
    vehicles = flow_params["veh"]
    env_params = flow_params["env"]
    net_params = flow_params["net"]
    initial_config = flow_params.get("initial", InitialConfig())
    traffic_lights = flow_params.get("tls", TrafficLights())

    # import the scenario and generator classes
    module = __import__("flow.scenarios", fromlist=[flow_params["scenario"]])
    scenario_class = getattr(module, flow_params["scenario"])
    module = __import__("flow.scenarios", fromlist=[flow_params["generator"]])
    generator_class = getattr(module, flow_params["generator"])

    # create the scenario object
    scenario = scenario_class(name=exp_tag,
                              generator_class=generator_class,
                              vehicles=vehicles,
                              net_params=net_params,
                              initial_config=initial_config,
                              traffic_lights=traffic_lights)

    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25))

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    horizon = flow_params["env"].horizon

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=horizon * (N_ROLLOUTS - PARALLEL_ROLLOUTS + 1),
        max_path_length=horizon,
        n_itr=500,
        discount=0.999,
        step_size=0.01,
    )
    algo.train()
Example #29
def run_task(v):

    print("_________________________________")
    print("#################################")
    print("_________________________________")
    print("_________________________________")
    print("#################################")
    print("###    agents_number : " + str(agents_number) + "    ####")
    print("###                          ####")
    print("### participation_rate : " + str(participation_rate) + " ####")
    print("###                          ####")
    print("###    average_period : " + str(average_period) + "   ####")
    print("###                          ####")
    print("### quantization_tuning : " + str(quantization_tuning) + " ####")
    print("###                          ####")
    print("###     discount : " + str(discount) + "      ####")
    print("#################################")
    print("_________________________________")
    print("_________________________________")
    print("#################################")
    print("_________________________________")

    env = normalize(CartpoleEnv())

    policy = GaussianMLPPolicy(env_spec=env.spec)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = Server(
        participation_rate=participation_rate,
        agents_number=agents_number,
        average_period=average_period,
        env=env,
        policy=policy,
        baseline=baseline,
        difference_params=True,
        quantize=True,
        quantization_tuning=quantization_tuning,
        batch_size=400,
        max_path_length=100,
        n_itr=50,
        discount=discount,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )

    algo.train()
Example #30
def run_task(*_):
    tot_cars = 6
    auton_cars = 6

    sumo_params = SumoParams(time_step=0.1,  rl_speed_mode="no_collide", sumo_binary="sumo-gui")

    vehicles = Vehicles()
    vehicles.add_vehicles("rl", (RLController, {}), (StaticLaneChanger, {}), (ContinuousRouter, {}), 0, auton_cars)

    env_params = EnvParams(additional_params={"target_velocity": 25, "num_steps": 1000})

    additional_net_params = {"length": 220, "lanes": 1, "speed_limit": 30, "resolution": 40}
    net_params = NetParams(additional_params=additional_net_params)

    initial_config = InitialConfig()

    scenario = LoopScenario("rl-test", CircleGenerator, vehicles, net_params, initial_config)

    env_name = "SimpleAccelerationEnvironment"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    logging.info("Experiment Set Up complete")

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(16,)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=2,  # 1000
        # discount=0.99,
        # step_size=0.01,
    )
    algo.train()
Example #31
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from lasagne.updates import adam

# normalize() makes sure that the actions for the environment lie
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(CartpoleEnv())
# Initialize a neural network policy with a single hidden layer of 8 hidden units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))
# Initialize a linear baseline estimator using default hand-crafted features
baseline = LinearFeatureBaseline(env.spec)

# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.1

# Construct the computation graph
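The snippet above is cut off just before the computation graph is built; Example #32 below shows the full version. For reference, a minimal continuation consistent with it:

observations_var = env.observation_space.new_tensor_variable('observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
advantages_var = TT.vector('advantages')

# Surrogate loss: negated policy-gradient objective (optimizers minimize).
dist_info_vars = policy.dist_info_sym(observations_var)
dist = policy.distribution
surr = -TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

params = policy.get_params(trainable=True)
f_train = theano.function(
    inputs=[observations_var, actions_var, advantages_var],
    outputs=None,
    updates=adam(theano.grad(surr, params), params, learning_rate=learning_rate),
    allow_input_downcast=True
)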
Example #32
def doit(mode):
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.normalized_env import normalize
    import numpy as np
    import theano
    import theano.tensor as TT
    from lasagne.updates import adam

    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(CartpoleEnv())
    # Initialize a neural network policy with a single hidden layer of 8 hidden units
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))
    # Initialize a linear baseline estimator using default hand-crafted features
    if "linbaseline" in mode:
        print('linear baseline')
        baseline = LinearFeatureBaseline(env.spec)
    elif "vanilla" in mode:
        print("zero baseline")
        baseline = ZeroBaseline(env.spec)
    elif mode == "batchavg":
        print('batch average baseline')
        # use a zero baseline but subtract the mean of the discounted returns (see below)
        baseline = ZeroBaseline(env.spec)

    if "_ztrans" in mode:
        print('z transform advantages')
    else:
        print('no z transform')


    # We will collect 50 trajectories per iteration
    N = 50
    # Each trajectory will have at most 50 time steps
    T = 50
    # Number of iterations
    n_itr = 50
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.1

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
    # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
    # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
    # rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    results = []
    for _ in range(n_itr):

        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one is a dictionary whose values contain
                # sufficient statistics for the action distribution. It should at least contain entries that would be
                # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
                # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is
                # not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
                # case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            if "_ztrans" in mode:
                advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)


            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])


        if mode == 'batchavg':
            # in this case `advantages` up to here are just our good old returns, without baseline or z transformation.
            # now we subtract their mean across all episodes.
            advantages = advantages - np.mean(advantages)


        f_train(observations, actions, advantages)
        avgr =  np.mean([sum(p["rewards"]) for p in paths])
        print(('Average Return:',avgr))
        results.append(avgr)
    return results
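A hypothetical driver for doit; the mode strings follow the branches checked inside the function:

if __name__ == '__main__':
    # Each mode selects a baseline variant; '_ztrans' additionally z-scores the advantages.
    for mode in ('linbaseline', 'vanilla', 'batchavg', 'linbaseline_ztrans'):
        results = doit(mode)
        print(mode, 'average return in last iteration:', results[-1])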