Example #1
def run_task(*_):
    # Please note that different environments with different action spaces may require different
    # policies. For example, a GaussianMLPPolicy works with a Box action space, while a Discrete
    # action space may require a CategoricalMLPPolicy (see the trpo_gym_cartpole.py example).
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
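
As the comment above notes, a Discrete action space generally calls for a categorical policy rather than a Gaussian one. Below is a minimal sketch of the variant the comment points to; the CartPole-v0 environment and the hyperparameters mirror the referenced trpo_gym_cartpole.py example and are assumptions, not part of this snippet.

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy


def run_cartpole_task(*_):
    # Discrete action space -> categorical policy instead of a Gaussian one.
    env = normalize(GymEnv("CartPole-v0"))
    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()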
Example #2
def test_issue_3():
    """
    As reported in https://github.com/rllab/rllab/issues/3, the adaptive_std parameter was not functioning properly
    """
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env, adaptive_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env, policy=policy, baseline=baseline, batch_size=100, n_itr=1)
    algo.train()
Example #3
def test_trpo_relu_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_nonlinearity=naive_relu,
        hidden_sizes=(1,))
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env, policy=policy, baseline=baseline, n_itr=1, batch_size=1000, max_path_length=100,
        step_size=0.001
    )
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
Example #4
def test_trpo_deterministic_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(1,))
    policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env, policy=policy, baseline=baseline, n_itr=10, batch_size=1000, max_path_length=100,
        step_size=0.01
    )
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
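
Examples #3 and #4 above depend on DummyEnv and naive_relu helpers defined elsewhere in the test suite. The sketch below shows what such helpers could look like; the class and function bodies are assumptions, not the originals.

import numpy as np
import theano.tensor as TT

from rllab.envs.base import Env
from rllab.spaces import Box


def naive_relu(x):
    # Elementwise max(x, 0); "naive" because the gradient is exactly zero for x < 0.
    return TT.maximum(x, 0)


class DummyEnv(Env):
    """Trivial one-dimensional environment with constant reward, just enough for TRPO to run."""

    @property
    def observation_space(self):
        return Box(low=-np.inf, high=np.inf, shape=(1,))

    @property
    def action_space(self):
        return Box(low=-5.0, high=5.0, shape=(1,))

    def reset(self):
        return np.zeros(1)

    def step(self, action):
        # (observation, reward, done, info)
        return np.zeros(1), 0.0, False, {}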
Example #5
File: grid_world.py  Project: hans/praglang
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params
    pprint(params)

    grid_world = SlaveGridWorldEnv("walled_chain",
                                   max_traj_length=DEFAULTS["max_path_length"],
                                   goal_reward=params["goal_reward"])
    agent = GridWorldMasterAgent(grid_world, match_reward=params["match_reward"])
    env = normalize(SituatedConversationEnvironment(env=grid_world, b_agent=agent))
    baseline = LinearFeatureBaseline(env)

    policy = RecurrentCategoricalPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_dims=params["policy_hidden_dims"],
            feature_network=MLPNetworkWithEmbeddings(
                "feature_network", env.observation_space.flat_dim,
                params["feature_dim"], params["feature_hidden_dims"],
                tf.tanh, tf.tanh, agent.vocab_size, params["embedding_dim"]),
            state_include_action=False,
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=params["batch_size"],
            max_path_length=params["max_path_length"],
            n_itr=params["n_itr"],
            discount=0.99,
            step_size=params["step_size"],
            optimizer=optimizer,
    )

    run_experiment_lite(
            algo.train(),
            n_parallel=15,
            snapshot_mode="last",
            exp_prefix="grid_world_sweep3",
            variant=params,
    )
Example #6
def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
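
Setting plot=True on the algorithm is typically paired with plot=True on the launcher. The sketch below shows how the run_task above might be launched with run_experiment_lite; the worker count, seed, and snapshot settings are arbitrary choices, not taken from this snippet.

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    # Number of parallel workers used for sampling.
    n_parallel=4,
    # Only keep the snapshot parameters for the last iteration.
    snapshot_mode="last",
    seed=1,
    # Pair with plot=True passed to TRPO above so the plotter is started.
    plot=True,
)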
Example #7
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params

    grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"])
    env = normalize(grid_world)
    baseline = LinearFeatureBaseline(env)

    policy = CategoricalMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=params["policy_hidden_dims"],
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=params["batch_size"],
            max_path_length=5,
            n_itr=params["n_itr"],
            discount=0.99,
            step_size=params["step_size"],
            optimizer=optimizer,
    )

    run_experiment_lite(
            algo.train(),
            n_parallel=5,
            snapshot_mode="last",
            exp_prefix="grid_world_silent",
            variant=params,
    )
Example #8
def run_task(v):
    env = normalize(CartpoleEnv())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #9
def run_task(*_):
    auton_cars = 20

    sumo_params = SumoParams(time_step=0.1,
                             human_speed_mode="no_collide",
                             rl_speed_mode="no_collide",
                             sumo_binary="sumo-gui")

    vehicles = Vehicles()
    vehicles.add_vehicles("idm", (RLController, {}), None, None, 0, 20)

    intensity = .2
    v_enter = 10
    env_params = EnvParams(additional_params={
        "target_velocity": v_enter,
        "control-length": 150,
        "max_speed": v_enter
    })

    additional_net_params = {
        "horizontal_length_in": 400,
        "horizontal_length_out": 800,
        "horizontal_lanes": 1,
        "vertical_length_in": 400,
        "vertical_length_out": 800,
        "vertical_lanes": 1,
        "speed_limit": {
            "horizontal": v_enter,
            "vertical": v_enter
        }
    }
    net_params = NetParams(no_internal_links=False,
                           additional_params=additional_net_params)

    cfg_params = {"start_time": 0, "end_time": 3000, "cfg_path": "debug/cfg/"}

    initial_config = InitialConfig(spacing="custom",
                                   additional_params={
                                       "intensity": intensity,
                                       "enter_speed": v_enter
                                   })

    scenario = TwoWayIntersectionScenario("two-way-intersection",
                                          TwoWayIntersectionGenerator,
                                          vehicles,
                                          net_params,
                                          initial_config=initial_config)

    env = TwoIntersectionEnvironment(env_params, sumo_params, scenario)
    env_name = "TwoIntersectionEnvironment"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)
    logging.info("Experiment Set Up complete")

    print("experiment initialized")

    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=30000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=200,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
Example #10
                policy = CategoricalMLPPolicy(env_spec=env.spec, )

                baseline = LinearFeatureBaseline(env_spec=env.spec)

                # bonus_evaluators = [GridBonusEvaluator(mesh_density=mesh_density, visitation_bonus=1, snn_H_bonus=0)]
                # reward_coef_bonus = [reward_coef]

                algo = TRPO(
                    env=env,
                    policy=policy,
                    baseline=baseline,
                    self_normalize=True,
                    log_deterministic=True,
                    # reward_coef=reward_coef,
                    # bonus_evaluator=bonus_evaluators,
                    # reward_coef_bonus=reward_coef_bonus,
                    batch_size=1e6 / time_step_agg,
                    whole_paths=True,
                    max_path_length=1e4 / time_step_agg * maze_size_scaling / 2.,  # correct for larger envs
                    n_itr=200,
                    discount=0.99,
                    step_size=0.01,
                )

                for s in [10, 20, 30]:  # range(10, 110, 10):  # [10, 20, 30, 40, 50]:
                    exp_prefix = 'hier-snn-egoSnake-maze0'
                    now = datetime.datetime.now(dateutil.tz.tzlocal())
                    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
                    exp_name = exp_prefix + '{}scale_{}agg_{}pl_PRE{}_seed{}_{}'.format(
Example #11
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    V_ENTER = 30
    INNER_LENGTH = 300
    LONG_LENGTH = 100
    SHORT_LENGTH = 300
    N_ROWS = 3
    N_COLUMNS = 3
    NUM_CARS_LEFT = 1
    NUM_CARS_RIGHT = 1
    NUM_CARS_TOP = 1
    NUM_CARS_BOT = 1
    tot_cars = (NUM_CARS_LEFT + NUM_CARS_RIGHT) * N_COLUMNS \
        + (NUM_CARS_BOT + NUM_CARS_TOP) * N_ROWS

    grid_array = {
        "short_length": SHORT_LENGTH,
        "inner_length": INNER_LENGTH,
        "long_length": LONG_LENGTH,
        "row_num": N_ROWS,
        "col_num": N_COLUMNS,
        "cars_left": NUM_CARS_LEFT,
        "cars_right": NUM_CARS_RIGHT,
        "cars_top": NUM_CARS_TOP,
        "cars_bot": NUM_CARS_BOT
    }

    sim_params = SumoParams(sim_step=1, render=True)

    vehicles = VehicleParams()
    vehicles.add(veh_id="idm",
                 acceleration_controller=(SimCarFollowingController, {}),
                 car_following_params=SumoCarFollowingParams(
                     min_gap=2.5,
                     tau=1.1,
                     max_speed=V_ENTER,
                     speed_mode="all_checks"),
                 routing_controller=(GridRouter, {}),
                 num_vehicles=tot_cars)

    tl_logic = TrafficLightParams(baseline=False)

    additional_env_params = {
        "target_velocity": 50,
        "switch_time": 3.0,
        "num_observed": 2,
        "discrete": False,
        "tl_type": "controlled"
    }
    env_params = EnvParams(additional_params=additional_env_params)

    additional_net_params = {
        "speed_limit": 35,
        "grid_array": grid_array,
        "horizontal_lanes": 1,
        "vertical_lanes": 1
    }

    if USE_INFLOWS:
        initial_config, net_params = get_flow_params(
            v_enter=V_ENTER,
            vehs_per_hour=EDGE_INFLOW,
            col_num=N_COLUMNS,
            row_num=N_ROWS,
            add_net_params=additional_net_params)
    else:
        initial_config, net_params = get_non_flow_params(
            V_ENTER, additional_net_params)

    scenario = SimpleGridScenario(name="grid-intersection",
                                  vehicles=vehicles,
                                  net_params=net_params,
                                  initial_config=initial_config,
                                  traffic_lights=tl_logic)

    env_name = "PO_TrafficLightGridEnv"
    pass_params = (env_name, sim_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=800,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
Example #12
        hidden_sizes=layer_size,
        is_protagonist=False
    )
    adv_baseline = LinearFeatureBaseline(env_spec=env.spec)

    ## Initializing the parallel sampler ##
    parallel_sampler.initialize(n_process)

    ## Optimizer for the Protagonist ##
    pro_algo = TRPO(
        env=env,
        pro_policy=pro_policy,
        adv_policy=adv_policy,
        pro_baseline=pro_baseline,
        adv_baseline=adv_baseline,
        batch_size=batch_size,
        max_path_length=path_length,
        n_itr=n_pro_itr,
        discount=0.995,
        gae_lambda=gae_lambda,
        step_size=step_size,
        is_protagonist=True
    )

    ## Optimizer for the Adversary ##
    adv_algo = TRPO(
        env=env,
        pro_policy=pro_policy,
        adv_policy=adv_policy,
        pro_baseline=pro_baseline,
        adv_baseline=adv_baseline,
        batch_size=batch_size,
Example #13
    folder_name = 'cartpole_split_sanitycheck'
    segmentation_num = 2
    load_path_from_file = False
    load_metric_from_file = False
    split_percentage = 0.2


    # generate data
    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10000,
        max_path_length=env.horizon,
        n_itr=5,

        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
    )

    algo.init_opt()
    if not load_path_from_file:
        init_param = policy.get_param_values()
        init_param_obj = copy.deepcopy(policy.get_params())

        from rllab.sampler import parallel_sampler

        parallel_sampler.initialize(n_parallel=2)
Example #14
def average_error(env, policy, batch_size, gt_gradient):
    np.random.seed(0)

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)

    init_param = policy.get_param_values()

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        max_path_length=env.horizon,
        n_itr=5,
        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
    )

    gradients_vanilla = []
    gradients_randwalk = []

    gradient_error_vanilla = []
    gradient_error_randwalk = []

    env.wrapped_env.env.env.perturb_MP = True
    algo.start_worker()
    algo.init_opt()
    for i in range(20):
        policy.set_param_values(init_param)  # reset the policy parameters
        paths = algo.sampler.obtain_samples(0)
        samples_data = algo.sampler.process_samples(0, paths)
        samples_data = algo.sampler.process_samples(0, paths)
        grad = get_gradient(algo, samples_data)

        gradients_randwalk.append(grad)

        gradient_error_randwalk.append(np.linalg.norm(grad - gt_gradient))

    algo.shutdown_worker()

    env.wrapped_env.env.env.perturb_MP = False
    algo.start_worker()
    algo.init_opt()
    for i in range(20):
        policy.set_param_values(init_param)  # reset the policy parameters
        paths = algo.sampler.obtain_samples(0)
        samples_data = algo.sampler.process_samples(0, paths)
        samples_data = algo.sampler.process_samples(0, paths)
        grad = get_gradient(algo, samples_data)

        gradients_vanilla.append(grad)

        gradient_error_vanilla.append(np.linalg.norm(grad - gt_gradient))

    algo.shutdown_worker()

    print(np.std(gradients_vanilla, axis=0).shape)
    print(np.linalg.norm(np.mean(gradients_vanilla, axis=0)),
          np.mean(np.std(gradients_vanilla, axis=0)))
    print(np.mean(gradient_error_vanilla))

    print('randwalk')
    print(np.linalg.norm(np.mean(gradients_randwalk, axis=0)),
          np.mean(np.std(gradients_randwalk, axis=0)))
    print(np.mean(gradient_error_randwalk))

    return np.mean(gradient_error_vanilla), np.mean(gradient_error_randwalk)
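
A hypothetical call to average_error is sketched below; env, policy, and the ground-truth gradient gt_gradient (for example, estimated beforehand with a very large batch) are assumed to be built elsewhere and are not part of this snippet.

# Hypothetical usage; env, policy and gt_gradient are assumed to exist already.
err_vanilla, err_randwalk = average_error(env, policy, batch_size=4000,
                                          gt_gradient=gt_gradient)
print('mean gradient error - vanilla: %f, random walk: %f'
      % (err_vanilla, err_randwalk))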
Example #15
from madrl_environments import StandardizedEnv
from madrl_environments.pursuit import MAWaterWorld
from rllabwrapper import RLLabEnv

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy

env = StandardizedEnv(MAWaterWorld(3, 10, 2, 5))
env = RLLabEnv(env)

policy = GaussianGRUPolicy(env_spec=env.spec, hidden_sizes=(32,))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=8000,
            max_path_length=200,
            n_itr=500,
            discount=0.99,
            step_size=0.01,
            mode='decentralized',)

algo.train()
Example #16
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from examples.point_env import PointEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(PointEnv())
policy = GaussianMLPPolicy(
    env_spec=env.spec,
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
)
algo.train()
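
After training, the learned policy can be sanity-checked by rolling it out in the environment. The short sketch below uses rllab's rollout utility and assumes the env and policy objects from the script above; the path length and animation flag are arbitrary choices.

from rllab.sampler.utils import rollout

# Collect one trajectory with the trained policy and report its undiscounted return.
path = rollout(env, policy, max_path_length=100, animated=True)
print("return of one rollout:", path["rewards"].sum())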
Example #17
File: run.py  Project: T3p/baselines
def train(env, policy, policy_init, num_episodes, episode_cap, horizon,
          **alg_args):

    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)
        env = normalize(env_rllab_class())
    else:
        raise Exception('Only working for RLLAB envs')

    # Policy initialization
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    elif policy_init == 'normal':
        initializer = LI.Normal()
    else:
        raise Exception('Unrecognized policy initialization.')

    # Setting the policy type
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Creating the policy
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim
    mean_network = MLP(
        input_shape=(obs_dim, ),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=hidden_sizes,
        mean_network=mean_network,
        log_weights=True,
    )

    # Creating baseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Adding max_episodes constraint. If -1, this is unbounded
    if episode_cap:
        alg_args['max_episodes'] = num_episodes

    # Run algorithm
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=horizon * num_episodes,
                whole_paths=True,
                max_path_length=horizon,
                **alg_args)
    algo.train()

    print('----- ENDING ------')
    print(policy.get_param_values())
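
A hypothetical invocation of the train helper above is sketched below; the 'rllab.cartpole' environment name is an assumption about how rllab_env_from_name resolves names, and every value is illustrative only.

# Hypothetical call; the extra keyword arguments are forwarded to TRPO via **alg_args.
train(env='rllab.cartpole',
      policy='linear',
      policy_init='zeros',
      num_episodes=100,
      episode_cap=True,
      horizon=500,
      n_itr=100,
      discount=0.99)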
Example #18
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    sim_params = SumoParams(sim_step=0.2, render=True)

    # note that the vehicles are added sequentially by the scenario,
    # so place the merging vehicles after the vehicles in the ring
    vehicles = VehicleParams()
    # Inner ring vehicles
    vehicles.add(veh_id="human",
                 acceleration_controller=(IDMController, {
                     "noise": 0.2
                 }),
                 lane_change_controller=(SimLaneChangeController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 num_vehicles=6,
                 car_following_params=SumoCarFollowingParams(minGap=0.0,
                                                             tau=0.5),
                 lane_change_params=SumoLaneChangeParams())

    # A single learning agent in the inner ring
    vehicles.add(veh_id="rl",
                 acceleration_controller=(RLController, {}),
                 lane_change_controller=(SimLaneChangeController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 num_vehicles=1,
                 car_following_params=SumoCarFollowingParams(
                     minGap=0.01, tau=0.5, speed_mode="no_collide"),
                 lane_change_params=SumoLaneChangeParams())

    # Outer ring vehicles
    vehicles.add(veh_id="merge-human",
                 acceleration_controller=(IDMController, {
                     "noise": 0.2
                 }),
                 lane_change_controller=(SimLaneChangeController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 num_vehicles=10,
                 car_following_params=SumoCarFollowingParams(minGap=0.0,
                                                             tau=0.5),
                 lane_change_params=SumoLaneChangeParams())

    env_params = EnvParams(horizon=HORIZON,
                           additional_params={
                               "max_accel": 3,
                               "max_decel": 3,
                               "target_velocity": 10,
                               "n_preceding": 2,
                               "n_following": 2,
                               "n_merging_in": 2,
                           })

    additional_net_params = ADDITIONAL_NET_PARAMS.copy()
    additional_net_params["ring_radius"] = 50
    additional_net_params["inner_lanes"] = 1
    additional_net_params["outer_lanes"] = 1
    additional_net_params["lane_length"] = 75
    net_params = NetParams(no_internal_links=False,
                           additional_params=additional_net_params)

    initial_config = InitialConfig(x0=50,
                                   spacing="uniform",
                                   additional_params={"merge_bunching": 0})

    scenario = TwoLoopsOneMergingScenario(name=exp_tag,
                                          vehicles=vehicles,
                                          net_params=net_params,
                                          initial_config=initial_config)

    env_name = "TwoLoopsMergePOEnv"
    pass_params = (env_name, sim_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=64 * 3 * horizon,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=1000,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
Example #19
        """
        Returns a Space object
        """
        low = np.array(
            [0, -np.pi / 2, -np.pi / 2, 0, -np.pi, -np.pi, 0, -np.pi, -np.pi])
        high = np.array([
            100, np.pi / 2, np.pi / 2, 1000, np.pi, np.pi, 1000, np.pi, np.pi
        ])
        return Box(low=low, high=high)

    def log_diagnostics(self, paths):
        pass


if __name__ == "__main__":
    from rllab.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

    env = normalize(FlightEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                max_path_length=400,
                batch_size=4000,
                gae_lambda=0.7)
    algo.train()
Example #20
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    tf_session = tf.Session()

    inner_env = normalize(AntEnv())

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'],
                                                   bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy,
                         env,
                         max_reward=v['max_reward'],
                         sampling_res=sampling_res,
                         n_traj=v['n_traj'],
                         itr=outer_iter,
                         report=report,
                         limit=v['goal_range'],
                         center=v['goal_center'],
                         bounds=v['goal_range'])

    # GAN
    logger.log("Instantiating the GAN...")
    gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key}
    for key, value in gan_configs.items():
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key +
                                                                  '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(
                stddev=gan_configs[key + '_stddev'])

    gan = StateGAN(
        state_size=v['goal_size'],
        evaluater_size=v['num_labels'],
        state_range=v['goal_range'],
        state_center=v['goal_center'],
        state_noise_level=v['goal_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )

    # log first samples from the GAN
    initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

    logger.log("Labeling the goals")
    labels = label_states(initial_goals,
                          env,
                          policy,
                          v['horizon'],
                          n_traj=v['n_traj'],
                          key='goal_reached')

    plot_labeled_states(initial_goals,
                        labels,
                        report=report,
                        itr=outer_iter,
                        limit=v['goal_range'],
                        center=v['goal_center'])
    report.new_row()

    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        feasible_goals = generate_initial_goals(env,
                                                policy,
                                                v['goal_range'],
                                                goal_center=v['goal_center'],
                                                horizon=v['horizon'])
        labels = np.ones((feasible_goals.shape[0], 2)).astype(np.float32)  # make them all good goals
        plot_labeled_states(feasible_goals,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            summary_string_base='On-policy Goals:\n')
        if v['only_on_policy']:
            goals = feasible_goals[np.random.choice(
                feasible_goals.shape[0], v['num_new_goals'], replace=False), :]
        else:
            logger.log("Training the GAN")
            gan.pretrain(feasible_goals, v['gan_outer_iters'])
            # Sample GAN
            logger.log("Sampling goals from the GAN")
            raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

            if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
                old_goals = all_goals.sample(v['num_old_goals'])
                goals = np.vstack([raw_goals, old_goals])
            else:
                goals = raw_goals

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [goals, labels] = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=True,
                env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(goals,
                                         env,
                                         policy,
                                         v['horizon'],
                                         as_goals=True,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy,
                             env,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'],
                             bounds=v['goal_range'])

        plot_labeled_states(goals,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'])

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = [
            goal for goal, label in zip(goals, labels) if label[0] == 1
        ]  # this is not used if no replay buffer
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(
                env,
                policy,
                v['goal_range'],
                goal_center=v['goal_center'],
                horizon=v['horizon'])
            # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:]
            all_goals.append(feasible_goals)
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('env_fname',
                        type=str,
                        help='config file with environment arguments')
    parser.add_argument('transformers_fname', type=str)
    parser.add_argument('mean_network_type',
                        type=str,
                        choices=['conv', 'siamese'])
    parser.add_argument('--conv_filters',
                        nargs='*',
                        type=int,
                        default=[16, 32])
    parser.add_argument('--hidden_sizes', nargs='*', type=int, default=[16])
    parser.add_argument('--init_std', type=float, default=1.0)
    parser.add_argument('--n_itr', type=int, default=100)
    parser.add_argument('--step_size', type=float, default=0.01)
    parser.add_argument('--batch_size', type=int, default=10000)
    parser.add_argument('--use_static_car', action='store_true')
    parser.add_argument('--use_init_heuristic', action='store_true')
    args = parser.parse_args()

    with open(args.env_fname) as yaml_string:
        env_config = yaml.load(yaml_string)
        if issubclass(env_config['class'], envs.RosEnv):
            import rospy
            rospy.init_node("generate_data")
        env = from_config(env_config)

    if args.use_static_car:
        env.car_env.speed_offset_space.low = \
        env.car_env.speed_offset_space.high = np.array([0.0, 4.0])

    # transformers
    with open(args.transformers_fname) as transformers_file:
        transformers_config = yaml.load(transformers_file)
    transformers = dict()
    for data_name, transformer_config in transformers_config.items():
        if data_name == 'action':
            replace_config = {'space': env.action_space}
        elif data_name in env.observation_space.spaces:
            replace_config = {'space': env.observation_space.spaces[data_name]}
        else:
            replace_config = {}
        transformers[data_name] = from_config(transformers_config[data_name],
                                              replace_config=replace_config)

    env = ServoingEnv(env)
    env = RllabEnv(env, transformers=transformers)
    env = normalize(env)

    network_kwargs = dict(
        input_shape=env.observation_space.shape,
        output_dim=env.action_space.flat_dim,
        conv_filters=args.conv_filters,
        conv_filter_sizes=[3] * len(args.conv_filters),
        conv_strides=[2] * len(args.conv_filters),
        conv_pads=[0] * len(args.conv_filters),
        hidden_sizes=args.hidden_sizes,
        hidden_nonlinearity=LN.rectify,
        output_nonlinearity=None,
        name="mean_network",
    )
    if args.mean_network_type == 'conv':
        mean_network = ConvNetwork(**network_kwargs)
    elif args.mean_network_type == 'siamese':
        mean_network = SiameseQuadraticErrorNetwork(**network_kwargs)
    else:
        raise NotImplementedError

    policy = GaussianConvPolicy(
        env_spec=env.spec,
        init_std=args.init_std,
        mean_network=mean_network,
    )
    if args.use_init_heuristic:
        W_var = policy.get_params()[0]
        W = W_var.get_value()
        W[:, 3:, :, :] = -W[:, :3, :, :]
        W_var.set_value(W)
    baseline = GaussianConvBaseline(
        env_spec=env.spec,
        regressor_args=dict(
            use_trust_region=True,
            step_size=args.step_size,
            normalize_inputs=True,
            normalize_outputs=True,
            hidden_sizes=args.hidden_sizes,
            conv_filters=args.conv_filters,
            conv_filter_sizes=[3] * len(args.conv_filters),
            conv_strides=[2] * len(args.conv_filters),
            conv_pads=[0] * len(args.conv_filters),
            batchsize=args.batch_size * 10,
        ))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=100,
        n_itr=args.n_itr,
        discount=0.9,
        step_size=args.step_size,
    )
    algo.train()
    import IPython as ipy
    ipy.embed()
Example #22
def run_task(vv, log_dir=None, exp_name=None):
    global policy
    global baseline

    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    # Check if variant is available
    if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']:
        raise ValueError('Unrecognized model type for simulating robot')
    if vv['robot_type'] not in ['MRZR', 'RCCar']:
        raise ValueError('Unrecognized robot type')

    # Load environment
    if not vv['use_ros']:
        env = StraightEnv(target_velocity=vv['target_velocity'],
                          dt=vv['dt'],
                          model_type=vv['model_type'],
                          robot_type=vv['robot_type'],
                          mu_s=vv['mu_s'],
                          mu_k=vv['mu_k'])
    else:
        from aa_simulation.envs.straight.straight_env_ros import StraightEnvROS
        env = StraightEnvROS(target_velocity=vv['target_velocity'],
                             dt=vv['dt'],
                             model_type=vv['model_type'],
                             robot_type=vv['robot_type'])

    # Save variant information for comparison plots
    variant_file = logger.get_snapshot_dir() + '/variant.json'
    logger.log_variant(variant_file, vv)

    # Set variance for each action component separately for exploration
    # Note: We set the variance manually because we are not scaling our
    #       action space during training.
    init_std_speed = vv['target_velocity'] / 4
    init_std_steer = np.pi / 6
    init_std = [init_std_speed, init_std_steer]

    # Build policy and baseline networks
    # Note: Mean of policy network set to analytically computed values for
    #       faster training (rough estimates for RL to fine-tune).
    if policy is None or baseline is None:
        target_velocity = vv['target_velocity']
        target_steering = 0
        output_mean = np.array([target_velocity, target_steering])
        hidden_sizes = (32, 32)

        # In mean network, allow output b values to dominate final output
        # value by constraining the magnitude of the output W matrix. This is
        # to allow faster learning. These numbers are arbitrarily chosen.
        W_gain = min(vv['target_velocity'] / 5, np.pi / 15)

        mean_network = MLP(input_shape=(env.spec.observation_space.flat_dim, ),
                           output_dim=env.spec.action_space.flat_dim,
                           hidden_sizes=hidden_sizes,
                           hidden_nonlinearity=LN.tanh,
                           output_nonlinearity=None,
                           output_W_init=LI.GlorotUniform(gain=W_gain),
                           output_b_init=output_mean)
        policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=(32, 32),
                                   init_std=init_std,
                                   mean_network=mean_network)
        baseline = LinearFeatureBaseline(env_spec=env.spec,
                                         target_key='returns')

    # Reset variance to re-enable exploration when using pre-trained networks
    else:
        policy._l_log_std = ParamLayer(
            policy._mean_network.input_layer,
            num_units=env.spec.action_space.flat_dim,
            param=LI.Constant(np.log(init_std)),
            name='output_log_std',
            trainable=True)
        obs_var = policy._mean_network.input_layer.input_var
        mean_var, log_std_var = L.get_output(
            [policy._l_mean, policy._l_log_std])
        policy._log_std_var = log_std_var
        LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std])
        policy._f_dist = ext.compile_function(inputs=[obs_var],
                                              outputs=[mean_var, log_std_var])

    safety_baseline = LinearFeatureBaseline(env_spec=env.spec,
                                            target_key='safety_returns')

    safety_constraint = StraightSafetyConstraint(max_value=1.0,
                                                 baseline=safety_baseline)

    if vv['algo'] == 'TRPO':
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=600,
            max_path_length=env.horizon,
            n_itr=600,
            discount=0.99,
            step_size=trpo_stepsize,
            plot=False,
        )
    else:
        algo = CPO(env=env,
                   policy=policy,
                   baseline=baseline,
                   safety_constraint=safety_constraint,
                   batch_size=600,
                   max_path_length=env.horizon,
                   n_itr=600,
                   discount=0.99,
                   step_size=trpo_stepsize,
                   gae_lambda=0.95,
                   safety_gae_lambda=1,
                   optimizer_args={'subsample_factor': trpo_subsample_factor},
                   plot=False)
    algo.train()
Example #23
mdp = NormalizedEnv(CartpoleSwingupEnvX())
for seed in seeds:

    policy = GaussianMLPPolicy(
        env_spec=mdp.spec,
        hidden_sizes=(64, 32),
    )

    baseline = LinearFeatureBaseline(mdp.spec, )

    batch_size = 50000
    algo = TRPO(
        env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=500,
        n_itr=10000,
        step_size=0.01,
        subsample_factor=1.0,
    )

    run_experiment_lite(algo.train(),
                        exp_prefix="trpo",
                        n_parallel=4,
                        snapshot_mode="last",
                        seed=seed,
                        mode="local")
Example #24
def run_task(_):
    """Implement the run_task method needed to run experiments with rllab."""
    sumo_params = SumoParams(sumo_binary="sumo-gui",
                             sim_step=0.2,
                             restart_instance=True)

    # RL vehicles constitute 5% of the total number of vehicles
    vehicles = Vehicles()
    vehicles.add(veh_id="human",
                 acceleration_controller=(IDMController, {
                     "noise": 0.2
                 }),
                 speed_mode="no_collide",
                 num_vehicles=5)
    vehicles.add(veh_id="rl",
                 acceleration_controller=(RLController, {}),
                 speed_mode="no_collide",
                 num_vehicles=0)

    # Vehicles are introduced from both sides of merge, with RL vehicles
    # entering from the highway portion as well
    inflow = InFlows()
    inflow.add(veh_type="human",
               edge="inflow_highway",
               vehs_per_hour=(1 - RL_PENETRATION) * FLOW_RATE,
               departLane="free",
               departSpeed=10)
    inflow.add(veh_type="rl",
               edge="inflow_highway",
               vehs_per_hour=RL_PENETRATION * FLOW_RATE,
               departLane="free",
               departSpeed=10)
    inflow.add(veh_type="human",
               edge="inflow_merge",
               vehs_per_hour=100,
               departLane="free",
               departSpeed=7.5)

    additional_env_params = {
        "target_velocity": 25,
        "num_rl": NUM_RL,
        "max_accel": 1.5,
        "max_decel": 1.5
    }
    env_params = EnvParams(horizon=HORIZON,
                           sims_per_step=5,
                           warmup_steps=0,
                           additional_params=additional_env_params)

    additional_net_params = ADDITIONAL_NET_PARAMS.copy()
    additional_net_params["merge_lanes"] = 1
    additional_net_params["highway_lanes"] = 1
    additional_net_params["pre_merge_length"] = 500
    net_params = NetParams(in_flows=inflow,
                           no_internal_links=False,
                           additional_params=additional_net_params)

    initial_config = InitialConfig(spacing="uniform",
                                   lanes_distribution=float("inf"))

    scenario = MergeScenario(name="merge-rl",
                             generator_class=MergeGenerator,
                             vehicles=vehicles,
                             net_params=net_params,
                             initial_config=initial_config)

    env_name = "WaveAttenuationMergePOEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    env = normalize(env)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=HORIZON * N_ROLLOUTS,
        max_path_length=HORIZON,
        n_itr=1000,
        # whole_paths=True,
        discount=0.999,
    )
    algo.train()
Example #25
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(Arm3dKeyEnv(ctrl_cost_coeff=v['ctrl_cost_coeff']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['start_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-1 * v['goal_size']:],  # the goals are the last 9 coords
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=v['policy_hidden_sizes'],
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v['baseline'] == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v['pg_batch_size'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters'],
        step_size=0.01,
        discount=v['discount'],
        plot=False,
    )

    # load the state collection from data_upload
    load_dir = 'data_upload/state_collections/'
    all_feasible_starts = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir, 'all_feasible_states.pkl'),
            'rb'))
    # all_feasible_starts = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_04_230000.pkl'), 'rb'))
    # all_feasible_starts = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_med_rad4.pkl'), 'rb'))
    all_feasible_starts2 = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir,
                     'key_all_feasible_states_min_rad4.pkl'), 'rb'))
    all_feasible_starts3 = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir,
                     'key_all_feasible_states_max_rad2.pkl'), 'rb'))
    print("we have %d feasible starts" % all_feasible_starts.size)

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    brownian_starts = StateCollection(
        distance_threshold=v['regularize_starts'])

    logger.log(
        'Generating seed starts from the goal (horizon 10, subsample 600 of them)'
    )
    with algo.env.set_kill_outside(radius=v['kill_radius']):
        seed_starts = generate_starts(
            env,
            starts=[v['start_goal']],
            horizon=10,  # this is smaller as they are seeds!
            variance=v['brownian_variance'],
            subsample=v['num_new_starts'])  # , animated=True, speedup=10)

        # seed_starts = all_feasible_starts.states
        # with env.set_kill_outside(radius=0.4):
        # find_all_feasible_states(env, seed_starts, distance_threshold=0.1, brownian_variance=1, animate=False)

    # # show where these states are:
    # shuffled_starts = np.array(all_feasible_starts.state_list)
    # np.random.shuffle(shuffled_starts)
    # generate_starts(env, starts=shuffled_starts, horizon=100, variance=v['brownian_variance'],
    #                 zero_action=True, animated=True, speedup=10)

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        with algo.env.set_kill_outside(radius=v['kill_radius']):
            starts = generate_starts(algo.env,
                                     starts=seed_starts,
                                     horizon=v['brownian_horizon'],
                                     variance=v['brownian_variance'])
        # regularization of the brownian starts
        brownian_starts.empty()
        brownian_starts.append(starts)
        starts = brownian_starts.sample(size=v['num_new_starts'])

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir,
                              50 * (outer_iter // 50 + 1),
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            algo.env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))
            # algo.start_worker()

            logger.log("Training the algorithm")

            algo.current_itr = 0
            trpo_paths = algo.train(already_init=outer_iter > 1)

        # import pdb; pdb.set_trace()
        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=False,
                env=algo.env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts,
                                         algo.env,
                                         policy,
                                         v['horizon'],
                                         as_goals=False,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            algo.env.log_diagnostics(paths)

        logger.record_tabular('brownian_starts', brownian_starts.size)

        start_classes, text_labels = convert_label(labels)
        total_starts = labels.shape[0]
        logger.record_tabular('GenStarts_evaluated', total_starts)
        start_class_frac = OrderedDict()  # this needs to be an ordered dict!! (for the log tabular)
        for k in text_labels.keys():
            frac = np.sum(start_classes == k) / total_starts
            logger.record_tabular('GenStart_frac_' + text_labels[k], frac)
            start_class_frac[text_labels[k]] = frac

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_4med_"):
            unif_starts = all_feasible_starts.sample(500)
            unif_starts = np.pad(unif_starts,
                                 ((0, v['start_size'] - unif_starts.shape[1])),
                                 'constant')
            mean_reward, paths = evaluate_states(unif_starts,
                                                 algo.env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=1,
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            algo.env.log_diagnostics(paths)
        # with logger.tabular_prefix("Uniform_4med_bis_"):
        #     unif_starts = all_feasible_starts.sample(200)
        #     unif_starts1bis = np.pad(unif_starts, ((0, v['start_size'] - unif_starts.shape[1])), 'constant')
        #     mean_reward1bis, paths1bis = evaluate_states(unif_starts1bis, algo.env, policy, v['horizon'], n_traj=1,
        #                                                  key='goal_reached', as_goals=False, full_path=True)
        #     algo.env.log_diagnostics(paths1bis)
        # with logger.tabular_prefix("Uniform_4min_"):
        #     unif_starts2 = all_feasible_starts2.sample(200)
        #     unif_starts2 = np.pad(unif_starts2, ((0, v['start_size'] - unif_starts2.shape[1])), 'constant')
        #     mean_reward2, paths2 = evaluate_states(unif_starts2, algo.env, policy, v['horizon'], n_traj=1,
        #                                            key='goal_reached', as_goals=False, full_path=True)
        #     algo.env.log_diagnostics(paths2)
        # with logger.tabular_prefix("Uniform_2max_"):
        #     unif_starts3 = all_feasible_starts3.sample(200)
        #     unif_starts3 = np.pad(unif_starts3, ((0, v['start_size'] - unif_starts3.shape[1])), 'constant')
        #     mean_reward3, paths3 = evaluate_states(unif_starts3, algo.env, policy, v['horizon'], n_traj=1,
        #                                            key='goal_reached', as_goals=False, full_path=True)
        #     algo.env.log_diagnostics(paths3)

        logger.dump_tabular(with_prefix=True)

        # append new states to list of all starts (replay buffer):
        logger.log("Appending good goals to replay and generating seeds")
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        all_starts.append(filtered_raw_starts)

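        # (Comment added: the branches below pick where the next round of
        # Brownian start generation is seeded from -- only the starts labeled
        # good (falling back to the replay buffer or to extra-noisy regeneration),
        # every start collected so far, or fresh on-policy rollouts.)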
        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(
                    start_classes == 1):  # if more low reward than high reward
                seed_starts = all_starts.sample(
                    300)  # sample them from the replay
            else:  # add a ton of noise if all the states we had ended up being high_reward!
                with algo.env.set_kill_outside(radius=v['kill_radius']):
                    seed_starts = generate_starts(
                        algo.env,
                        starts=starts,
                        horizon=int(v['horizon'] * 10),
                        subsample=v['num_new_starts'],
                        variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            all_starts.append(starts)
            seed_starts = starts
        elif v['seed_with'] == 'on_policy':
            with algo.env.set_kill_outside(radius=v['kill_radius']):
                seed_starts = generate_starts(algo.env,
                                              policy,
                                              horizon=v['horizon'],
                                              subsample=v['num_new_starts'])
예제 #26
0
    hidden_sizes=(42, 42))

baseline = LinearFeatureBaseline(env_spec=env.spec)

vg = instrument.VariantGenerator()
vg.add("seed", [1, 2, 3, 4, 5])

variants = vg.variants()

for variant in variants:
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=90000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.1,
        optimizer_args={'cg_iters': 100},
        plot=True,
    )

    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=8,
        plot=True,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used,
예제 #27
0
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug"
        debug = True
    else:
        debug = False

    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'],
                                                   bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        #obs2goal_transform=lambda x: x[:int(len(x) / 2)],
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    outer_iter = 0
    if not debug and not v['fast_mode']:
        logger.log('Generating the Initial Heatmap...')
        test_and_plot_policy(policy,
                             env,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'])

    report.new_row()

    sagg_riac = SaggRIAC(state_size=v['goal_size'],
                         state_range=v['goal_range'],
                         state_center=v['goal_center'],
                         max_goals=v['max_goals'],
                         max_history=v['max_history'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)

        raw_goals = sagg_riac.sample_states(num_samples=v['num_new_goals'])

        goals = raw_goals

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals,
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            all_paths = algo.train()

        if v['use_competence_ratio']:
            [goals, rewards
             ] = compute_rewards_from_paths(all_paths,
                                            key='competence',
                                            as_goal=True,
                                            env=env,
                                            terminal_eps=v['terminal_eps'])
        else:
            [goals, rewards] = compute_rewards_from_paths(all_paths,
                                                          key='rewards',
                                                          as_goal=True,
                                                          env=env)

        [goals_with_labels,
         labels] = label_states_from_paths(all_paths,
                                           n_traj=v['n_traj'],
                                           key='goal_reached')
        plot_labeled_states(goals_with_labels,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy,
                             env,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'])

        sagg_riac.plot_regions_interest(maze_id=v['maze_id'], report=report)
        sagg_riac.plot_regions_states(maze_id=v['maze_id'], report=report)

        logger.log("Updating SAGG-RIAC")
        sagg_riac.add_states(goals, rewards)

        # Find final states "accidentally" reached by the agent.
        final_goals = compute_final_states_from_paths(all_paths,
                                                      as_goal=True,
                                                      env=env)
        sagg_riac.add_accidental_states(final_goals, v['extend_dist_rew'])

        logger.dump_tabular(with_prefix=False)
        report.new_row()
예제 #28
0
# Created by Xingyu Lin, 10/06/2018
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from envs.square2d.square2d_nongoal import Square2dEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.misc.instrument import run_experiment_lite

env = normalize(Square2dEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=1000,
    discount=0.99,
    step_size=0.01,
)
algo.train()
예제 #29
0
        env = normalize(CartpoleEnv())

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            # The neural network policy should have two hidden layers, each with 32 hidden units.
            hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=40,
            discount=0.99,
            step_size=step_size,
            # Uncomment both lines (this and the plot parameter below) to enable plotting
            # plot=True,
        )

        run_experiment_lite(
            algo.train(),
            exp_prefix="first_exp",
            # Number of parallel workers for sampling
            n_parallel=1,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
예제 #30
0
stub(globals())

# Param ranges
seeds = range(2)
# SwimmerGather hierarchical task
mdp_classes = [SwimmerGatherEnv]
mdps = [NormalizedEnv(env=mdp_class()) for mdp_class in mdp_classes]
param_cart_product = itertools.product(mdps, seeds)

for mdp, seed in param_cart_product:

    policy = GaussianMLPPolicy(env_spec=mdp.spec, hidden_sizes=(64, 32))

    baseline = LinearFeatureBaseline(mdp.spec)

    batch_size = 50000
    algo = TRPO(
        env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=500,
        n_itr=10000,
        step_size=0.01,
        subsample_factor=1.0,
    )

    run_experiment_lite(algo.train(), exp_prefix="trpo", n_parallel=4, snapshot_mode="last", seed=seed, mode="local")
예제 #31
0
File: simple.py Project: birdyLinch/rllab
                                       "mean_network": None,
                                       "hidden_sizes": (100, 50, 25),
                                       "hidden_nonlinearity": NL.tanh,
                                       "optimizer": base_line_optimizer,
                                       "use_trust_region": True,
                                       "step_size": 0.01,
                                       "learn_std": True,
                                       "init_std": 1.0,
                                       "adaptive_std": False,
                                       "std_share_network": False,
                                       "std_hidden_sizes": (32, 32),
                                       "std_nonlinearity": None,
                                       "normalize_inputs": True,
                                       "normalize_outputs": True,
                                   })

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=total_iter,
        max_path_length=max_path_length,
        experiment_spec=experiment_spec,
        save_policy_every=save_policy_every,
        batch_size=batch_size,
        discount=0.995,
        gae_lambda=0.98,
    )

    algo.train()
예제 #32
0
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy
from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = normalize(CartpoleEnv())

policy = GaussianGRUPolicy(
    env_spec=env.spec,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=10,
    discount=0.99,
    step_size=0.01,
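    # (Comment added: FiniteDifferenceHvp approximates the Hessian-vector
    # products needed by the conjugate-gradient optimizer via finite differences.)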
    optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    seed=1,
)
예제 #33
0
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--exp_name', type=str, default=default_exp_name, help='Name of the experiment.')

    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)

    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true', default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')

    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)

    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baselin_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')

    parser.add_argument('--conv', action='store_true', default=False)

    parser.add_argument('--max_kl', type=float, default=0.01)

    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int,
                        help='Random seed for numpy')
    parser.add_argument('--args_data', type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False,
                        help='Whether to only print the tabular log information (in a horizontal format)')


    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.sample_maps:
        map_pool = np.load(args.map_file)
    else:
        if args.map_type == 'rectangle':
            env_map = TwoDMaps.rectangle_map(*map(int, args.rectangle.split(',')))
        elif args.map_type == 'complex':
            env_map = TwoDMaps.complex_map(*map(int, args.rectangle.split(',')))
        else:
            raise NotImplementedError()
        map_pool = [env_map]

    env = PursuitEvade(map_pool, n_evaders=args.n_evaders, n_pursuers=args.n_pursuers,
                       obs_range=args.obs_range, n_catch=args.n_catch,
                       train_pursuit=args.train_pursuit, urgency_reward=args.urgency,
                       surround=args.surround, sample_maps=args.sample_maps,
                       constraint_window=args.constraint_window,
                       flatten=args.flatten,
                       reward_mech=args.reward_mech,
                       catchr=args.catchr,
                       term_pursuit=args.term_pursuit)

    env = RLLabEnv(
            StandardizedEnv(env, scale_reward=args.reward_scale, enable_obsnorm=False),
            mode=args.control)

    if args.recurrent:
        if args.conv:
            feature_network = ConvNetwork(
                input_shape=env.spec.observation_space.shape,
                output_dim=5, 
                conv_filters=(8,16,16),
                conv_filter_sizes=(3,3,3),
                conv_strides=(1,1,1),
                conv_pads=('VALID','VALID','VALID'),
                hidden_sizes=(64,), 
                hidden_nonlinearity=NL.rectify,
                output_nonlinearity=NL.softmax)
        else:
            feature_network = MLP(
                input_shape=(env.spec.observation_space.flat_dim + env.spec.action_space.flat_dim,),
                output_dim=5, hidden_sizes=(128,128,128), hidden_nonlinearity=NL.tanh,
                output_nonlinearity=None)
        if args.recurrent == 'gru':
            policy = CategoricalGRUPolicy(env_spec=env.spec, feature_network=feature_network,
                                       hidden_dim=int(args.policy_hidden_sizes))
    elif args.conv:
        feature_network = ConvNetwork(
            input_shape=env.spec.observation_space.shape,
            output_dim=5, 
            conv_filters=(8,16,16),
            conv_filter_sizes=(3,3,3),
            conv_strides=(1,1,1),
            conv_pads=('valid','valid','valid'),
            hidden_sizes=(64,), 
            hidden_nonlinearity=NL.rectify,
            output_nonlinearity=NL.softmax)
        policy = CategoricalMLPPolicy(env_spec=env.spec, prob_network=feature_network)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(obsfeat_space)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        mode=args.control,)

    algo.train()
예제 #34
0
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    v_enter = 10
    inner_length = 300
    long_length = 100
    short_length = 300
    n = 3
    m = 3
    num_cars_left = 1
    num_cars_right = 1
    num_cars_top = 1
    num_cars_bot = 1
    tot_cars = (num_cars_left + num_cars_right) * m \
        + (num_cars_bot + num_cars_top) * n

    grid_array = {
        "short_length": short_length,
        "inner_length": inner_length,
        "long_length": long_length,
        "row_num": n,
        "col_num": m,
        "cars_left": num_cars_left,
        "cars_right": num_cars_right,
        "cars_top": num_cars_top,
        "cars_bot": num_cars_bot
    }

    sumo_params = SumoParams(sim_step=1, render=True)

    vehicles = Vehicles()
    vehicles.add(veh_id="idm",
                 acceleration_controller=(SumoCarFollowingController, {}),
                 sumo_car_following_params=SumoCarFollowingParams(
                     min_gap=2.5, tau=1.1, max_speed=v_enter),
                 routing_controller=(GridRouter, {}),
                 num_vehicles=tot_cars,
                 speed_mode="all_checks")

    tl_logic = TrafficLights(baseline=False)

    additional_env_params = {
        "target_velocity": 50,
        "switch_time": 3.0,
        "num_observed": 2,
        "discrete": False,
        "tl_type": "controlled"
    }
    env_params = EnvParams(additional_params=additional_env_params)

    additional_net_params = {
        "speed_limit": 35,
        "grid_array": grid_array,
        "horizontal_lanes": 1,
        "vertical_lanes": 1
    }

    initial_config, net_params = get_flow_params(10, 300, n, m,
                                                 additional_net_params)

    scenario = SimpleGridScenario(name="grid-intersection",
                                  vehicles=vehicles,
                                  net_params=net_params,
                                  initial_config=initial_config,
                                  traffic_lights=tl_logic)

    env_name = "PO_TrafficLightGridEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=800,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
예제 #35
0
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    if DEBUG:
        n_itr = 5
    else:
        n_itr = config.num_iter

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=config.batch_size,
        max_path_length=env.horizon,
        n_itr=n_itr,
        discount=config.discount,
        step_size=config.step_size,
        gae_lambda=config.gae_lambda,
        num_workers=config.num_workers,
        plot_learning_curve=config.plot_learning_curve,
        trial=agent_num,
    )
    avg_rewards, std_rewards = algo.train()

    print("training completed!")
    saveModel(algo.policy,
              'policy_{}_config_{}_agent_{}'.format(dynamic_environments[args.env_ind], args.config_num, agent_num))

    # save rewards per model over the iterations
    # also plot the rewards
    if config.plot_learning_curve:
예제 #36
0
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    tf_session = tf.Session()

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'],
                                                    bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    plot_policy_means(policy,
                      env,
                      sampling_res=2,
                      report=report,
                      limit=v['start_range'],
                      center=v['start_center'])
    # test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
    #                      itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'])

    # GAN
    logger.log("Instantiating the GAN...")
    gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key}
    for key, value in gan_configs.items():
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key +
                                                                  '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(
                stddev=gan_configs[key + '_stddev'])
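    # (Comment added: the dict comprehension and loop above strip the "GAN_"
    # prefix from the variant keys and replace optimizer / initializer classes
    # with instances built from their matching *_stepSize and *_stddev entries.)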

    gan = StateGAN(
        state_size=v['start_size'],
        evaluater_size=v['num_labels'],
        state_range=v['start_range'],
        state_center=v['start_center'],
        state_noise_level=v['start_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )
    logger.log("pretraining the GAN...")
    if v['smart_init']:
        feasible_starts = generate_starts(
            env, starts=[v['ultimate_goal']],
            horizon=50)  # without giving it a policy this does Brownian motion
        labels = np.ones((feasible_starts.shape[0],
                          2)).astype(np.float32)  # make them all good goals
        plot_labeled_states(feasible_starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        dis_loss, gen_loss = gan.pretrain(states=feasible_starts,
                                          outer_iters=v['gan_outer_iters'])
        print("Loss of Gen and Dis: ", gen_loss, dis_loss)
    else:
        gan.pretrain_uniform(outer_iters=500,
                             report=report)  # v['gan_outer_iters'])

    # log first samples from the GAN
    initial_starts, _ = gan.sample_states_with_noise(v['num_new_starts'])

    logger.log("Labeling the starts")
    labels = label_states(initial_starts,
                          env,
                          policy,
                          v['horizon'],
                          as_goals=False,
                          n_traj=v['n_traj'],
                          key='goal_reached')

    plot_labeled_states(initial_starts,
                        labels,
                        report=report,
                        itr=outer_iter,
                        limit=v['goal_range'],
                        center=v['goal_center'],
                        maze_id=v['maze_id'])
    report.new_row()

    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        # Sample GAN
        logger.log("Sampling starts from the GAN")
        raw_starts, _ = gan.sample_states_with_noise(v['num_new_starts'])

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([raw_starts, old_starts])
        else:
            starts = raw_starts
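        # (Comment added: when the replay buffer is active, freshly sampled GAN
        # starts are mixed with a sample of previously good-labeled starts so the
        # policy keeps training on older states as well.)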

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=False,
                env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts,
                                         env,
                                         policy,
                                         v['horizon'],
                                         as_goals=False,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)
        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        logger.log('Generating the Heatmap...')
        plot_policy_means(policy,
                          env,
                          sampling_res=2,
                          report=report,
                          limit=v['start_range'],
                          center=v['start_center'])
        test_and_plot_policy(policy,
                             env,
                             as_goals=False,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Training the GAN")
        if np.any(labels):
            gan.train(
                starts,
                labels,
                v['gan_outer_iters'],
            )

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new starts to the list of all starts (replay buffer): not the low-reward ones!!
        filtered_raw_start = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        all_starts.append(filtered_raw_start)
예제 #37
0
def train(num_experiments, thread_id, queue):

    ############ DEFAULT PARAMETERS ############

    env_name = None  #Name of adversarial environment
    path_length = 1000  #Maximum episode length
    layer_size = tuple([100, 100, 100])  #Layer definition
    ifRender = False  #Should we render?
    afterRender = 100  #After how many iterations to render a test rollout
    n_exps = 1  #Number of training instances to run
    n_itr = 25  #Number of iterations of the alternating optimization
    n_pro_itr = 1  #Number of iterations for the protagonist
    n_adv_itr = 1  #Number of iterations for the adversary
    batch_size = 4000  #Number of training samples for each iteration
    ifSave = True  #Should we save?
    save_every = 100  #Save checkpoint every save_every iterations
    n_process = 1  #Number of parallel threads for sampling environment
    adv_fraction = 0.25  #Fraction of maximum adversarial force to be applied
    step_size = 0.01  #kl step size for TRPO
    gae_lambda = 0.97  #gae_lambda for learner
    save_dir = './results'  #folder to save result in

    ############ ENV SPECIFIC PARAMETERS ############

    env_name = 'Walker2dAdv-v1'

    layer_size = tuple([64, 64])
    step_size = 0.1
    gae_lambda = 0.97
    batch_size = 25000

    n_exps = num_experiments
    n_itr = 500
    ifSave = False
    n_process = 4

    adv_fraction = 5.0

    save_dir = './../results/StaticWalker'

    args = [
        env_name, path_length, layer_size, ifRender, afterRender, n_exps,
        n_itr, n_pro_itr, n_adv_itr, batch_size, save_every, n_process,
        adv_fraction, step_size, gae_lambda, save_dir
    ]

    ############ ADVERSARIAL POLICY LOAD ############

    filepath = './../initial_results/Walker/env-Walker2dAdv-v1_Exp1_Itr1500_BS25000_Adv0.25_stp0.01_lam0.97_507500.p'
    res_D = pickle.load(open(filepath, 'rb'))
    pretrained_adv_policy = res_D['adv_policy']

    ############ MAIN LOOP ############

    ## Initializing summaries for the tests ##
    const_test_rew_summary = []
    rand_test_rew_summary = []
    step_test_rew_summary = []
    rand_step_test_rew_summary = []
    adv_test_rew_summary = []

    ## Preparing file to save results in ##
    save_prefix = 'static_env-{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}_{}'.format(
        env_name, n_exps, n_itr, batch_size, adv_fraction, step_size,
        gae_lambda, random.randint(0, 1000000))
    save_name = save_dir + '/' + save_prefix

    ## Looping over experiments to carry out ##
    for ne in range(n_exps):
        ## Environment definition ##
        ## The second argument in GymEnv defines the relative magnitude of the adversary. For testing we set this to 1.0.
        env = normalize(GymEnv(env_name, adv_fraction))
        env_orig = normalize(GymEnv(env_name, 1.0))

        ## Protagonist policy definition ##
        pro_policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=layer_size,
                                       is_protagonist=True)
        pro_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Zero Adversary for the protagonist training ##
        zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                                is_protagonist=False,
                                                constant_val=0.0)

        ## Adversary policy definition ##
        adv_policy = pretrained_adv_policy
        adv_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Initializing the parallel sampler ##
        parallel_sampler.initialize(n_process)

        ## Optimizer for the Protagonist ##
        pro_algo = TRPO(env=env,
                        pro_policy=pro_policy,
                        adv_policy=adv_policy,
                        pro_baseline=pro_baseline,
                        adv_baseline=adv_baseline,
                        batch_size=batch_size,
                        max_path_length=path_length,
                        n_itr=n_pro_itr,
                        discount=0.995,
                        gae_lambda=gae_lambda,
                        step_size=step_size,
                        is_protagonist=True)
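        # (Comment added: this TRPO variant from the adversarial-training setup
        # takes separate protagonist and adversary policies/baselines; in this
        # loop only the protagonist is optimized against the fixed, pretrained
        # adversary.)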

        ## Setting up summaries for testing for a specific training instance ##
        pro_rews = []
        adv_rews = []
        all_rews = []
        const_testing_rews = []
        const_testing_rews.append(
            test_const_adv(env_orig, pro_policy, path_length=path_length))
        rand_testing_rews = []
        rand_testing_rews.append(
            test_rand_adv(env_orig, pro_policy, path_length=path_length))
        step_testing_rews = []
        step_testing_rews.append(
            test_step_adv(env_orig, pro_policy, path_length=path_length))
        rand_step_testing_rews = []
        rand_step_testing_rews.append(
            test_rand_step_adv(env_orig, pro_policy, path_length=path_length))
        adv_testing_rews = []
        adv_testing_rews.append(
            test_learnt_adv(env,
                            pro_policy,
                            adv_policy,
                            path_length=path_length))

        ## Beginning alternating optimization ##
        for ni in range(n_itr):
            logger.log('\n\nThread: {} Experiment: {} Iteration: {}\n'.format(
                thread_id,
                ne,
                ni,
            ))

            ## Train Protagonist
            pro_algo.train()
            pro_rews += pro_algo.rews
            all_rews += pro_algo.rews
            logger.log('Protag Reward: {}'.format(
                np.array(pro_algo.rews).mean()))

            ## Test the learnt policies
            const_testing_rews.append(
                test_const_adv(env, pro_policy, path_length=path_length))
            rand_testing_rews.append(
                test_rand_adv(env, pro_policy, path_length=path_length))
            step_testing_rews.append(
                test_step_adv(env, pro_policy, path_length=path_length))
            rand_step_testing_rews.append(
                test_rand_step_adv(env, pro_policy, path_length=path_length))
            adv_testing_rews.append(
                test_learnt_adv(env,
                                pro_policy,
                                adv_policy,
                                path_length=path_length))

            if ni % afterRender == 0 and ifRender == True:
                test_const_adv(env,
                               pro_policy,
                               path_length=path_length,
                               n_traj=1,
                               render=True)

            if ni != 0 and ni % save_every == 0 and ifSave == True:
                ## SAVING CHECKPOINT INFO ##
                pickle.dump(
                    {
                        'args': args,
                        'pro_policy': pro_policy,
                        'adv_policy': adv_policy,
                        'zero_test': [const_testing_rews],
                        'rand_test': [rand_testing_rews],
                        'step_test': [step_testing_rews],
                        'rand_step_test': [rand_step_testing_rews],
                        'iter_save': ni,
                        'exp_save': ne,
                        'adv_test': [adv_testing_rews]
                    }, open(save_name + '_' + str(ni) + '.p', 'wb'))

        ## Shutting down the optimizer ##
        pro_algo.shutdown_worker()

        ## Updating the test summaries over all training instances
        const_test_rew_summary.append(const_testing_rews)
        rand_test_rew_summary.append(rand_testing_rews)
        step_test_rew_summary.append(step_testing_rews)
        rand_step_test_rew_summary.append(rand_step_testing_rews)
        adv_test_rew_summary.append(adv_testing_rews)

    queue.put([
        const_test_rew_summary, rand_test_rew_summary, step_test_rew_summary,
        rand_step_test_rew_summary, adv_test_rew_summary
    ])

    ############ SAVING MODEL ############
    '''
예제 #38
0
def run_task(*_):
    v_enter = 30
    inner_length = 800
    long_length = 100
    short_length = 800
    n = 1
    m = 5
    num_cars_left = 3
    num_cars_right = 3
    num_cars_top = 15
    num_cars_bot = 15
    tot_cars = (num_cars_left + num_cars_right) * m \
        + (num_cars_bot + num_cars_top) * n

    grid_array = {
        "short_length": short_length,
        "inner_length": inner_length,
        "long_length": long_length,
        "row_num": n,
        "col_num": m,
        "cars_left": num_cars_left,
        "cars_right": num_cars_right,
        "cars_top": num_cars_top,
        "cars_bot": num_cars_bot
    }

    sumo_params = SumoParams(sim_step=1, sumo_binary="sumo-gui")

    vehicles = Vehicles()
    vehicles.add(veh_id="idm",
                 acceleration_controller=(SumoCarFollowingController, {}),
                 sumo_car_following_params=SumoCarFollowingParams(
                     minGap=2.5,
                     max_speed=v_enter,
                 ),
                 routing_controller=(GridRouter, {}),
                 num_vehicles=tot_cars,
                 speed_mode="all_checks")

    additional_env_params = {
        "target_velocity": 50,
        "num_steps": 500,
        "control-length": 150,
        "switch_time": 3.0
    }
    env_params = EnvParams(additional_params=additional_env_params)

    additional_net_params = {
        "speed_limit": 35,
        "grid_array": grid_array,
        "horizontal_lanes": 1,
        "vertical_lanes": 1,
        "traffic_lights": True
    }

    initial_config, net_params = get_non_flow_params(10, additional_net_params)

    scenario = SimpleGridScenario(name="grid-intersection",
                                  generator_class=SimpleGridGenerator,
                                  vehicles=vehicles,
                                  net_params=net_params,
                                  initial_config=initial_config)

    env_name = "GreenWaveEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=800,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
예제 #39
0
from humanoidopt.env import HumanoidOptEnv

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline

from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(HumanoidOptEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
)
algo.train()
예제 #40
0
                copyparams.update(modeparams)
                copyparams['layer'] = layer
                mdp = normalize(GymEnv(params['env'], **copyparams))
                for seed in seeds:
                    policy = GaussianMLPPolicy(env_spec=mdp.spec,
                                               hidden_sizes=(32, 32),
                                               init_std=10)

                    baseline = LinearFeatureBaseline(mdp.spec, )

                    batch_size = 50 * 250
                    algo = TRPO(env=mdp,
                                policy=policy,
                                baseline=baseline,
                                batch_size=batch_size,
                                whole_paths=True,
                                max_path_length=50,
                                n_itr=200,
                                step_size=0.01,
                                subsample_factor=1.0,
                                **copyparams)

                    run_experiment_lite(
                        algo.train(),
                        exp_prefix="r-inception-same-strike-std2",
                        n_parallel=4,
                        # dry=True,
                        snapshot_mode="all",
                        seed=seed,
                        mode="ec2_mujoco",
                        #terminate_machine=False
                    )
예제 #41
0
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--exp_name', type=str, default=default_exp_name, help='Name of the experiment.')

    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--gae_lambda', type=float, default=0.99)

    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true', default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')

    parser.add_argument('--buffer_size', type=int, default=1)
    parser.add_argument('--n_good', type=int, default=3)
    parser.add_argument('--n_hostage', type=int, default=5)
    parser.add_argument('--n_bad', type=int, default=5)
    parser.add_argument('--n_coop_save', type=int, default=2)
    parser.add_argument('--n_coop_avoid', type=int, default=2)
    parser.add_argument('--n_sensors', type=int, default=20)
    parser.add_argument('--sensor_range', type=float, default=0.2)
    parser.add_argument('--save_reward', type=float, default=3)
    parser.add_argument('--hit_reward', type=float, default=-1)
    parser.add_argument('--encounter_reward', type=float, default=0.01)
    parser.add_argument('--bomb_reward', type=float, default=-10.)

    parser.add_argument('--recurrent', action='store_true', default=False)
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baselin_hidden_sizes', type=str, default='128,128')

    parser.add_argument('--max_kl', type=float, default=0.01)

    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int,
                        help='Random seed for numpy')
    parser.add_argument('--args_data', type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False,
                        help='Whether to only print the tabular log information (in a horizontal format)')


    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    centralized = True if args.control == 'centralized' else False


    env = ContinuousHostageWorld(args.n_good, args.n_hostage, args.n_bad, args.n_coop_save,
                                 args.n_coop_avoid, n_sensors=args.n_sensors,
                                 sensor_range=args.sensor_range, save_reward=args.save_reward,
                                 hit_reward=args.hit_reward, encounter_reward=args.encounter_reward,
                                 bomb_reward=args.bomb_reward)

    env = RLLabEnv(StandardizedEnv(env), mode=args.control)

    if args.buffer_size > 1:
        env = ObservationBuffer(env, args.buffer_size)

    if args.recurrent:
        policy = GaussianGRUPolicy(env_spec=env.spec, hidden_sizes=args.hidden_sizes)
    else:
        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(obsfeat_space)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=args.n_timesteps,
            max_path_length=args.max_traj_len,
            n_itr=args.n_iter,
            discount=args.discount,
            step_size=args.max_kl,
            mode=args.control,)

    algo.train()
예제 #42
0
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'],
                                                   bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[:int(len(x) / 2)],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy,
                         env,
                         max_reward=v['max_reward'],
                         sampling_res=sampling_res,
                         n_traj=v['n_traj'],
                         itr=outer_iter,
                         report=report,
                         center=v['goal_center'],
                         limit=v['goal_range'])
    report.new_row()

    all_goals = StateCollection(distance_threshold=v['coll_eps'])
    total_rollouts = 0

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling goals")

        goals = np.array([]).reshape((-1, v['goal_size']))
        k = 0
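        # Rejection sampling: draw states uniformly over the maze's free space, label them with
        # rollouts of the current policy, and keep only the "good" ones (init_classes == 2)
        # until num_new_goals have been collected.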
        while goals.shape[0] < v['num_new_goals']:
            print('good goals collected: ', goals.shape[0])
            logger.log("Sampling and labeling the goals: %d" % k)
            k += 1
            unif_goals = sample_unif_feas(env,
                                          samples_per_cell=samples_per_cell)
            labels = label_states(unif_goals,
                                  env,
                                  policy,
                                  v['horizon'],
                                  n_traj=v['n_traj'],
                                  key='goal_reached')
            logger.log("Converting the labels")
            init_classes, text_labels = convert_label(labels)
            goals = np.concatenate([goals,
                                    unif_goals[init_classes == 2]]).reshape(
                                        (-1, v['goal_size']))

        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            old_goals = all_goals.sample(v['num_old_goals'])  # TODO: replay noise?
            goals = np.vstack([goals, old_goals])

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            algo.train()

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy,
                             env,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             center=v['goal_center'],
                             limit=v['goal_range'])

        logger.log("Labeling the goals")
        labels = label_states(goals,
                              env,
                              policy,
                              v['horizon'],
                              n_traj=v['n_traj'],
                              key='goal_reached')

        plot_labeled_states(goals,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        # rollouts used for labeling (before TRPO itrs):
        num_empty_spaces = len(unwrap_maze(env).find_empty_space())
        logger.record_tabular(
            'LabelingRollouts',
            k * v['n_traj'] * samples_per_cell * num_empty_spaces)
        total_rollouts += k * v['n_traj'] * samples_per_cell * num_empty_spaces
        logger.record_tabular('TotalLabelingRollouts', total_rollouts)

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = [
            goal for goal, label in zip(goals, labels) if label[0] == 1
        ]
        all_goals.append(filtered_raw_goals)
Example #43
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    sim_params = AimsunParams(sim_step=0.5, render=False, seed=0)

    vehicles = VehicleParams()
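    # One RL-controlled vehicle plus 21 IDM-controlled followers on the ring road.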
    vehicles.add(veh_id="rl",
                 acceleration_controller=(RLController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 num_vehicles=1)
    vehicles.add(veh_id="idm",
                 acceleration_controller=(IDMController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 num_vehicles=21)

    additional_env_params = {
        "target_velocity": 8,
        "ring_length": None,
        "max_accel": 1,
        "max_decel": 1
    }
    env_params = EnvParams(horizon=HORIZON,
                           additional_params=additional_env_params,
                           warmup_steps=1500)

    additional_net_params = {
        "length": 230,
        "lanes": 1,
        "speed_limit": 30,
        "resolution": 40
    }
    net_params = NetParams(additional_params=additional_net_params)

    initial_config = InitialConfig(spacing="uniform", bunching=50)

    print("XXX name", exp_tag)
    scenario = LoopScenario(exp_tag,
                            vehicles,
                            net_params,
                            initial_config=initial_config)

    env_name = "WaveAttenuationPOEnv"
    simulator = 'aimsun'
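    # Bundle everything needed to construct the environment; GymEnv receives it via
    # register_params so the named WaveAttenuationPOEnv can be built with this configuration.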
    pass_params = (env_name, sim_params, vehicles, env_params, net_params,
                   initial_config, scenario, simulator)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(3, 3),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=15000,
        max_path_length=horizon,
        n_itr=500,
        # whole_paths=True,
        discount=0.999,
        # step_size=v["step_size"],
    )
    algo.train()
Example #44
File: trpo_cartpole.py  Project: cknd/rllab
env = normalize(CartpoleEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy has three hidden layers with 100, 50, and 25 units, respectively.
    hidden_sizes=(100, 50, 25)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=50000,
    max_path_length=500,
    n_itr=500,
    discount=0.99,
    step_size=0.1,
)

rets_per_episode_batchwise = algo.train()
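# Flatten the per-batch lists of episode returns into a single sequence for plotting.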
rets_per_episode = [x for lst in rets_per_episode_batchwise for x in lst]

print('mean return over all episodes', np.mean(rets_per_episode))

plt.plot(rets_per_episode, alpha=0.3)
plt.savefig('/tmp/upsi/test_rllab/trpo_cartpole.png')