def run_task(*_):
    # Please note that different environments with different action spaces may require different
    # policies. For example with a Box action space, a GaussianMLPPolicy works, but for a Discrete
    # action space may need to use a CategoricalMLPPolicy (see the trpo_gym_cartpole.py example)
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def test_issue_3():
    """
    As reported in https://github.com/rllab/rllab/issues/3, the adaptive_std
    parameter was not functioning properly
    """
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, adaptive_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env, policy=policy, baseline=baseline, batch_size=100, n_itr=1)
    algo.train()
def test_trpo_relu_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_nonlinearity=naive_relu,
        hidden_sizes=(1,))
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env, policy=policy, baseline=baseline, n_itr=1, batch_size=1000,
        max_path_length=100, step_size=0.001
    )
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
def test_trpo_deterministic_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(1,))
    policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env, policy=policy, baseline=baseline, n_itr=10, batch_size=1000,
        max_path_length=100, step_size=0.01
    )
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
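# Note: both NaN regression tests above depend on a DummyEnv and a naive_relu
# nonlinearity that are defined elsewhere in the test module. A minimal sketch
# of what such fixtures could look like, assuming rllab's Theano backend and
# its Env/Box interfaces (the exact definitions in the original tests may differ):
import numpy as np
import theano.tensor as TT

from rllab.envs.base import Env, Step
from rllab.spaces.box import Box


def naive_relu(x):
    # Plain elementwise ReLU with a hard kink at zero.
    return TT.maximum(x, 0.)


class DummyEnv(Env):
    @property
    def observation_space(self):
        return Box(low=-np.inf, high=np.inf, shape=(1,))

    @property
    def action_space(self):
        return Box(low=-5.0, high=5.0, shape=(1,))

    def reset(self):
        return np.zeros(1)

    def step(self, action):
        # Single-step episodes with random rewards are enough to exercise TRPO's update.
        return Step(observation=np.zeros(1), reward=np.random.normal(), done=True)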
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params
    pprint(params)

    grid_world = SlaveGridWorldEnv("walled_chain",
                                   max_traj_length=DEFAULTS["max_path_length"],
                                   goal_reward=params["goal_reward"])
    agent = GridWorldMasterAgent(grid_world, match_reward=params["match_reward"])
    env = normalize(SituatedConversationEnvironment(env=grid_world, b_agent=agent))
    baseline = LinearFeatureBaseline(env)

    policy = RecurrentCategoricalPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_dims=params["policy_hidden_dims"],
        feature_network=MLPNetworkWithEmbeddings(
            "feature_network", env.observation_space.flat_dim,
            params["feature_dim"], params["feature_hidden_dims"],
            tf.tanh, tf.tanh, agent.vocab_size, params["embedding_dim"]),
        state_include_action=False,
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=params["max_path_length"],
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=15,
        snapshot_mode="last",
        exp_prefix="grid_world_sweep3",
        variant=params,
    )
def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params

    grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"])
    env = normalize(grid_world)
    baseline = LinearFeatureBaseline(env)

    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=params["policy_hidden_dims"],
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=5,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="grid_world_silent",
        variant=params,
    )
def run_task(v):
    env = normalize(CartpoleEnv())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    auton_cars = 20

    sumo_params = SumoParams(time_step=0.1,
                             human_speed_mode="no_collide",
                             rl_speed_mode="no_collide",
                             sumo_binary="sumo-gui")

    vehicles = Vehicles()
    vehicles.add_vehicles("idm", (RLController, {}), None, None, 0, 20)

    intensity = .2
    v_enter = 10

    env_params = EnvParams(additional_params={"target_velocity": v_enter,
                                              "control-length": 150,
                                              "max_speed": v_enter})

    additional_net_params = {"horizontal_length_in": 400,
                             "horizontal_length_out": 800,
                             "horizontal_lanes": 1,
                             "vertical_length_in": 400,
                             "vertical_length_out": 800,
                             "vertical_lanes": 1,
                             "speed_limit": {"horizontal": v_enter,
                                             "vertical": v_enter}}
    net_params = NetParams(no_internal_links=False,
                           additional_params=additional_net_params)

    cfg_params = {"start_time": 0,
                  "end_time": 3000,
                  "cfg_path": "debug/cfg/"}

    initial_config = InitialConfig(spacing="custom",
                                   additional_params={"intensity": intensity,
                                                      "enter_speed": v_enter})

    scenario = TwoWayIntersectionScenario("two-way-intersection",
                                          TwoWayIntersectionGenerator,
                                          vehicles, net_params,
                                          initial_config=initial_config)

    env = TwoIntersectionEnvironment(env_params, sumo_params, scenario)
    env_name = "TwoIntersectionEnvironment"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    logging.info("Experiment Set Up complete")
    print("experiment initialized")

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=30000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=200,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
policy = CategoricalMLPPolicy(
    env_spec=env.spec,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

# bonus_evaluators = [GridBonusEvaluator(mesh_density=mesh_density, visitation_bonus=1, snn_H_bonus=0)]
# reward_coef_bonus = [reward_coef]

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    self_normalize=True,
    log_deterministic=True,
    # reward_coef=reward_coef,
    # bonus_evaluator=bonus_evaluators,
    # reward_coef_bonus=reward_coef_bonus,
    batch_size=1e6 / time_step_agg,
    whole_paths=True,
    max_path_length=1e4 / time_step_agg * maze_size_scaling / 2.,  # correct for larger envs
    n_itr=200,
    discount=0.99,
    step_size=0.01,
)

for s in [10, 20, 30]:  # range(10, 110, 10):  # [10, 20, 30, 40, 50]:
    exp_prefix = 'hier-snn-egoSnake-maze0'
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
    exp_name = exp_prefix + '{}scale_{}agg_{}pl_PRE{}_seed{}_{}'.format(
def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" V_ENTER = 30 INNER_LENGTH = 300 LONG_LENGTH = 100 SHORT_LENGTH = 300 N_ROWS = 3 N_COLUMNS = 3 NUM_CARS_LEFT = 1 NUM_CARS_RIGHT = 1 NUM_CARS_TOP = 1 NUM_CARS_BOT = 1 tot_cars = (NUM_CARS_LEFT + NUM_CARS_RIGHT) * N_COLUMNS \ + (NUM_CARS_BOT + NUM_CARS_TOP) * N_ROWS grid_array = { "short_length": SHORT_LENGTH, "inner_length": INNER_LENGTH, "long_length": LONG_LENGTH, "row_num": N_ROWS, "col_num": N_COLUMNS, "cars_left": NUM_CARS_LEFT, "cars_right": NUM_CARS_RIGHT, "cars_top": NUM_CARS_TOP, "cars_bot": NUM_CARS_BOT } sim_params = SumoParams(sim_step=1, render=True) vehicles = VehicleParams() vehicles.add(veh_id="idm", acceleration_controller=(SimCarFollowingController, {}), car_following_params=SumoCarFollowingParams( min_gap=2.5, tau=1.1, max_speed=V_ENTER, speed_mode="all_checks"), routing_controller=(GridRouter, {}), num_vehicles=tot_cars) tl_logic = TrafficLightParams(baseline=False) additional_env_params = { "target_velocity": 50, "switch_time": 3.0, "num_observed": 2, "discrete": False, "tl_type": "controlled" } env_params = EnvParams(additional_params=additional_env_params) additional_net_params = { "speed_limit": 35, "grid_array": grid_array, "horizontal_lanes": 1, "vertical_lanes": 1 } if USE_INFLOWS: initial_config, net_params = get_flow_params( v_enter=V_ENTER, vehs_per_hour=EDGE_INFLOW, col_num=N_COLUMNS, row_num=N_ROWS, add_net_params=additional_net_params) else: initial_config, net_params = get_non_flow_params( V_ENTER, additional_net_params) scenario = SimpleGridScenario(name="grid-intersection", vehicles=vehicles, net_params=net_params, initial_config=initial_config, traffic_lights=tl_logic) env_name = "PO_TrafficLightGridEnv" pass_params = (env_name, sim_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=40000, max_path_length=horizon, # whole_paths=True, n_itr=800, discount=0.999, # step_size=0.01, ) algo.train()
    hidden_sizes=layer_size,
    is_protagonist=False
)
adv_baseline = LinearFeatureBaseline(env_spec=env.spec)

## Initializing the parallel sampler ##
parallel_sampler.initialize(n_process)

## Optimizer for the Protagonist ##
pro_algo = TRPO(
    env=env,
    pro_policy=pro_policy,
    adv_policy=adv_policy,
    pro_baseline=pro_baseline,
    adv_baseline=adv_baseline,
    batch_size=batch_size,
    max_path_length=path_length,
    n_itr=n_pro_itr,
    discount=0.995,
    gae_lambda=gae_lambda,
    step_size=step_size,
    is_protagonist=True
)

## Optimizer for the Adversary ##
adv_algo = TRPO(
    env=env,
    pro_policy=pro_policy,
    adv_policy=adv_policy,
    pro_baseline=pro_baseline,
    adv_baseline=adv_baseline,
    batch_size=batch_size,
folder_name = 'cartpole_split_sanitycheck'
segmentation_num = 2
load_path_from_file = False
load_metric_from_file = False
split_percentage = 0.2

# generate data
baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=10000,
    max_path_length=env.horizon,
    n_itr=5,
    discount=0.995,
    step_size=0.01,
    gae_lambda=0.97,
)
algo.init_opt()

if not load_path_from_file:
    init_param = policy.get_param_values()
    init_param_obj = copy.deepcopy(policy.get_params())

    from rllab.sampler import parallel_sampler
    parallel_sampler.initialize(n_parallel=2)
def average_error(env, policy, batch_size, gt_gradient):
    np.random.seed(0)
    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)
    init_param = policy.get_param_values()

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        max_path_length=env.horizon,
        n_itr=5,
        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
    )

    gradients_vanilla = []
    gradients_randwalk = []
    gradient_error_vanilla = []
    gradient_error_randwalk = []

    env.wrapped_env.env.env.perturb_MP = True
    algo.start_worker()
    algo.init_opt()
    for i in range(20):
        policy.set_param_values(init_param)  # reset the policy parameters
        paths = algo.sampler.obtain_samples(0)
        samples_data = algo.sampler.process_samples(0, paths)
        samples_data = algo.sampler.process_samples(0, paths)
        grad = get_gradient(algo, samples_data)
        gradients_randwalk.append(grad)
        gradient_error_randwalk.append(np.linalg.norm(grad - gt_gradient))
    algo.shutdown_worker()

    env.wrapped_env.env.env.perturb_MP = False
    algo.start_worker()
    algo.init_opt()
    for i in range(20):
        policy.set_param_values(init_param)  # reset the policy parameters
        paths = algo.sampler.obtain_samples(0)
        samples_data = algo.sampler.process_samples(0, paths)
        samples_data = algo.sampler.process_samples(0, paths)
        grad = get_gradient(algo, samples_data)
        gradients_vanilla.append(grad)
        gradient_error_vanilla.append(np.linalg.norm(grad - gt_gradient))
    algo.shutdown_worker()

    print(np.std(gradients_vanilla, axis=0).shape)
    print(np.linalg.norm(np.mean(gradients_vanilla, axis=0)),
          np.mean(np.std(gradients_vanilla, axis=0)))
    print(np.mean(gradient_error_vanilla))
    print('randwalk')
    print(np.linalg.norm(np.mean(gradients_randwalk, axis=0)),
          np.mean(np.std(gradients_randwalk, axis=0)))
    print(np.mean(gradient_error_randwalk))

    return np.mean(gradient_error_vanilla), np.mean(gradient_error_randwalk)
from madrl_environments import StandardizedEnv
from madrl_environments.pursuit import MAWaterWorld
from rllabwrapper import RLLabEnv

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy

env = StandardizedEnv(MAWaterWorld(3, 10, 2, 5))
env = RLLabEnv(env)

policy = GaussianGRUPolicy(env_spec=env.spec, hidden_sizes=(32,))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=8000,
    max_path_length=200,
    n_itr=500,
    discount=0.99,
    step_size=0.01,
    mode='decentralized',
)
algo.train()
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from examples.point_env import PointEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(PointEnv())
policy = GaussianMLPPolicy(
    env_spec=env.spec,
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
)
algo.train()
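# The minimal example above calls algo.train() directly. Several other snippets
# in this collection instead wrap training in run_experiment_lite so it runs
# under rllab's experiment launcher. A sketch of that pattern for the same
# PointEnv setup, reusing the imports above (exp_prefix is an illustrative
# name, not taken from the original):
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())  # turn the calls below into a serialized call description

env = normalize(PointEnv())
policy = GaussianMLPPolicy(env_spec=env.spec)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(env=env, policy=policy, baseline=baseline)

run_experiment_lite(
    algo.train(),
    exp_prefix="trpo_point",  # illustrative experiment name
    n_parallel=1,             # number of parallel sampling workers
    snapshot_mode="last",     # only keep the last iteration's snapshot
    seed=1,
)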
def train(env, policy, policy_init, num_episodes, episode_cap, horizon, **alg_args):
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)
        env = normalize(env_rllab_class())
    else:
        raise Exception('Only working for RLLAB envs')

    # Policy initialization
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    elif policy_init == 'normal':
        initializer = LI.Normal()
    else:
        raise Exception('Unrecognized policy initialization.')

    # Setting the policy type
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Creating the policy
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim
    mean_network = MLP(
        input_shape=(obs_dim,),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        mean_network=mean_network,
        log_weights=True,
    )

    # Creating baseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Adding max_episodes constraint. If -1, this is unbounded
    if episode_cap:
        alg_args['max_episodes'] = num_episodes

    # Run algorithm
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=horizon * num_episodes,
        whole_paths=True,
        max_path_length=horizon,
        **alg_args
    )
    algo.train()

    print('----- ENDING ------')
    print(policy.get_param_values())
def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" sim_params = SumoParams(sim_step=0.2, render=True) # note that the vehicles are added sequentially by the scenario, # so place the merging vehicles after the vehicles in the ring vehicles = VehicleParams() # Inner ring vehicles vehicles.add(veh_id="human", acceleration_controller=(IDMController, { "noise": 0.2 }), lane_change_controller=(SimLaneChangeController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=6, car_following_params=SumoCarFollowingParams(minGap=0.0, tau=0.5), lane_change_params=SumoLaneChangeParams()) # A single learning agent in the inner ring vehicles.add(veh_id="rl", acceleration_controller=(RLController, {}), lane_change_controller=(SimLaneChangeController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=1, car_following_params=SumoCarFollowingParams( minGap=0.01, tau=0.5, speed_mode="no_collide"), lane_change_params=SumoLaneChangeParams()) # Outer ring vehicles vehicles.add(veh_id="merge-human", acceleration_controller=(IDMController, { "noise": 0.2 }), lane_change_controller=(SimLaneChangeController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=10, car_following_params=SumoCarFollowingParams(minGap=0.0, tau=0.5), lane_change_params=SumoLaneChangeParams()) env_params = EnvParams(horizon=HORIZON, additional_params={ "max_accel": 3, "max_decel": 3, "target_velocity": 10, "n_preceding": 2, "n_following": 2, "n_merging_in": 2, }) additional_net_params = ADDITIONAL_NET_PARAMS.copy() additional_net_params["ring_radius"] = 50 additional_net_params["inner_lanes"] = 1 additional_net_params["outer_lanes"] = 1 additional_net_params["lane_length"] = 75 net_params = NetParams(no_internal_links=False, additional_params=additional_net_params) initial_config = InitialConfig(x0=50, spacing="uniform", additional_params={"merge_bunching": 0}) scenario = TwoLoopsOneMergingScenario(name=exp_tag, vehicles=vehicles, net_params=net_params, initial_config=initial_config) env_name = "TwoLoopsMergePOEnv" pass_params = (env_name, sim_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=64 * 3 * horizon, max_path_length=horizon, # whole_paths=True, n_itr=1000, discount=0.999, # step_size=0.01, ) algo.train()
""" Returns a Space object """ low = np.array( [0, -np.pi / 2, -np.pi / 2, 0, -np.pi, -np.pi, 0, -np.pi, -np.pi]) high = np.array([ 100, np.pi / 2, np.pi / 2, 1000, np.pi, np.pi, 1000, np.pi, -np.pi ]) return Box(low=low, high=high) def log_diagnostics(self, paths): pass if __name__ == "__main__": from rllab.algos.trpo import TRPO from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from rllab.envs.normalized_env import normalize from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy env = normalize(FlightEnv()) policy = GaussianMLPPolicy(env_spec=env.spec, ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO(env=env, policy=policy, baseline=baseline, max_path_length=400, batch_size=4000, gae_lambda=0.7) algo.train()
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res'] # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) tf_session = tf.Session() inner_env = normalize(AntEnv()) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], append_transformed_obs=v['append_transformed_obs'], append_extra_info=v['append_extra_info'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) if v['baseline'] == 'g_mlp': baseline = GaussianMLPBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) # GAN logger.log("Instantiating the GAN...") gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key} for key, value in gan_configs.items(): if value is tf.train.AdamOptimizer: gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize']) if value is tflearn.initializations.truncated_normal: gan_configs[key] = tflearn.initializations.truncated_normal( stddev=gan_configs[key + '_stddev']) gan = StateGAN( state_size=v['goal_size'], evaluater_size=v['num_labels'], state_range=v['goal_range'], state_center=v['goal_center'], state_noise_level=v['goal_noise_level'], generator_layers=v['gan_generator_layers'], discriminator_layers=v['gan_discriminator_layers'], noise_size=v['gan_noise_size'], tf_session=tf_session, configs=gan_configs, ) # log first samples form the GAN initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals']) logger.log("Labeling the goals") labels = label_states(initial_goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(initial_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) report.new_row() all_goals = StateCollection(distance_threshold=v['coll_eps']) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon']) labels = np.ones((feasible_goals.shape[0], 2)).astype(np.float32) # make them all good goals plot_labeled_states(feasible_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], summary_string_base='On-policy Goals:\n') if v['only_on_policy']: goals 
= feasible_goals[np.random.choice( feasible_goals.shape[0], v['num_new_goals'], replace=False), :] else: logger.log("Training the GAN") gan.pretrain(feasible_goals, v['gan_outer_iters']) # Sample GAN logger.log("Sampling goals from the GAN") raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals']) if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0: old_goals = all_goals.sample(v['num_old_goals']) goals = np.vstack([raw_goals, old_goals]) else: goals = raw_goals with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") env.update_goal_generator( UniformListStateGenerator( goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, plot=False, ) trpo_paths = algo.train() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [goals, labels] = label_states_from_paths( trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=True, env=env) paths = [path for paths in trpo_paths for path in paths] else: logger.log("labeling starts manually") labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', full_path=True) with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) logger.dump_tabular(with_prefix=False) report.new_row() # append new goals to list of all goals (replay buffer): Not the low reward ones!! filtered_raw_goals = [ goal for goal, label in zip(goals, labels) if label[0] == 1 ] # this is not used if no replay buffer all_goals.append(filtered_raw_goals) if v['add_on_policy']: logger.log("sampling on policy") feasible_goals = generate_initial_goals( env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon']) # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:] all_goals.append(feasible_goals)
def main(): parser = argparse.ArgumentParser() parser.add_argument('env_fname', type=str, help='config file with environment arguments') parser.add_argument('transformers_fname', type=str) parser.add_argument('mean_network_type', type=str, choices=['conv', 'siamese']) parser.add_argument('--conv_filters', nargs='*', type=int, default=[16, 32]) parser.add_argument('--hidden_sizes', nargs='*', type=int, default=[16]) parser.add_argument('--init_std', type=float, default=1.0) parser.add_argument('--n_itr', type=int, default=100) parser.add_argument('--step_size', type=float, default=0.01) parser.add_argument('--batch_size', type=int, default=10000) parser.add_argument('--use_static_car', action='store_true') parser.add_argument('--use_init_heuristic', action='store_true') args = parser.parse_args() with open(args.env_fname) as yaml_string: env_config = yaml.load(yaml_string) if issubclass(env_config['class'], envs.RosEnv): import rospy rospy.init_node("generate_data") env = from_config(env_config) if args.use_static_car: env.car_env.speed_offset_space.low = \ env.car_env.speed_offset_space.high = np.array([0.0, 4.0]) # transformers with open(args.transformers_fname) as transformers_file: transformers_config = yaml.load(transformers_file) transformers = dict() for data_name, transformer_config in transformers_config.items(): if data_name == 'action': replace_config = {'space': env.action_space} elif data_name in env.observation_space.spaces: replace_config = {'space': env.observation_space.spaces[data_name]} else: replace_config = {} transformers[data_name] = from_config(transformers_config[data_name], replace_config=replace_config) env = ServoingEnv(env) env = RllabEnv(env, transformers=transformers) env = normalize(env) network_kwargs = dict( input_shape=env.observation_space.shape, output_dim=env.action_space.flat_dim, conv_filters=args.conv_filters, conv_filter_sizes=[3] * len(args.conv_filters), conv_strides=[2] * len(args.conv_filters), conv_pads=[0] * len(args.conv_filters), hidden_sizes=args.hidden_sizes, hidden_nonlinearity=LN.rectify, output_nonlinearity=None, name="mean_network", ) if args.mean_network_type == 'conv': mean_network = ConvNetwork(**network_kwargs) elif args.mean_network_type == 'siamese': mean_network = SiameseQuadraticErrorNetwork(**network_kwargs) else: raise NotImplementedError policy = GaussianConvPolicy( env_spec=env.spec, init_std=args.init_std, mean_network=mean_network, ) if args.use_init_heuristic: W_var = policy.get_params()[0] W = W_var.get_value() W[:, 3:, :, :] = -W[:, :3, :, :] W_var.set_value(W) baseline = GaussianConvBaseline( env_spec=env.spec, regressor_args=dict( use_trust_region=True, step_size=args.step_size, normalize_inputs=True, normalize_outputs=True, hidden_sizes=args.hidden_sizes, conv_filters=args.conv_filters, conv_filter_sizes=[3] * len(args.conv_filters), conv_strides=[2] * len(args.conv_filters), conv_pads=[0] * len(args.conv_filters), batchsize=args.batch_size * 10, )) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=args.batch_size, max_path_length=100, n_itr=args.n_itr, discount=0.9, step_size=args.step_size, ) algo.train() import IPython as ipy ipy.embed()
def run_task(vv, log_dir=None, exp_name=None): global policy global baseline trpo_stepsize = 0.01 trpo_subsample_factor = 0.2 # Check if variant is available if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']: raise ValueError('Unrecognized model type for simulating robot') if vv['robot_type'] not in ['MRZR', 'RCCar']: raise ValueError('Unrecognized robot type') # Load environment if not vv['use_ros']: env = StraightEnv(target_velocity=vv['target_velocity'], dt=vv['dt'], model_type=vv['model_type'], robot_type=vv['robot_type'], mu_s=vv['mu_s'], mu_k=vv['mu_k']) else: from aa_simulation.envs.straight.straight_env_ros import StraightEnvROS env = StraightEnvROS(target_velocity=vv['target_velocity'], dt=vv['dt'], model_type=vv['model_type'], robot_type=vv['robot_type']) # Save variant information for comparison plots variant_file = logger.get_snapshot_dir() + '/variant.json' logger.log_variant(variant_file, vv) # Set variance for each action component separately for exploration # Note: We set the variance manually because we are not scaling our # action space during training. init_std_speed = vv['target_velocity'] / 4 init_std_steer = np.pi / 6 init_std = [init_std_speed, init_std_steer] # Build policy and baseline networks # Note: Mean of policy network set to analytically computed values for # faster training (rough estimates for RL to fine-tune). if policy is None or baseline is None: target_velocity = vv['target_velocity'] target_steering = 0 output_mean = np.array([target_velocity, target_steering]) hidden_sizes = (32, 32) # In mean network, allow output b values to dominate final output # value by constraining the magnitude of the output W matrix. This is # to allow faster learning. These numbers are arbitrarily chosen. W_gain = min(vv['target_velocity'] / 5, np.pi / 15) mean_network = MLP(input_shape=(env.spec.observation_space.flat_dim, ), output_dim=env.spec.action_space.flat_dim, hidden_sizes=hidden_sizes, hidden_nonlinearity=LN.tanh, output_nonlinearity=None, output_W_init=LI.GlorotUniform(gain=W_gain), output_b_init=output_mean) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32), init_std=init_std, mean_network=mean_network) baseline = LinearFeatureBaseline(env_spec=env.spec, target_key='returns') # Reset variance to re-enable exploration when using pre-trained networks else: policy._l_log_std = ParamLayer( policy._mean_network.input_layer, num_units=env.spec.action_space.flat_dim, param=LI.Constant(np.log(init_std)), name='output_log_std', trainable=True) obs_var = policy._mean_network.input_layer.input_var mean_var, log_std_var = L.get_output( [policy._l_mean, policy._l_log_std]) policy._log_std_var = log_std_var LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std]) policy._f_dist = ext.compile_function(inputs=[obs_var], outputs=[mean_var, log_std_var]) safety_baseline = LinearFeatureBaseline(env_spec=env.spec, target_key='safety_returns') safety_constraint = StraightSafetyConstraint(max_value=1.0, baseline=safety_baseline) if vv['algo'] == 'TRPO': algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=600, max_path_length=env.horizon, n_itr=600, discount=0.99, step_size=trpo_stepsize, plot=False, ) else: algo = CPO(env=env, policy=policy, baseline=baseline, safety_constraint=safety_constraint, batch_size=600, max_path_length=env.horizon, n_itr=600, discount=0.99, step_size=trpo_stepsize, gae_lambda=0.95, safety_gae_lambda=1, optimizer_args={'subsample_factor': trpo_subsample_factor}, plot=False) algo.train()
mdp = NormalizedEnv(CartpoleSwingupEnvX())
for seed in seeds:

    policy = GaussianMLPPolicy(
        env_spec=mdp.spec,
        hidden_sizes=(64, 32),
    )

    baseline = LinearFeatureBaseline(mdp.spec, )

    batch_size = 50000
    algo = TRPO(
        env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=500,
        n_itr=10000,
        step_size=0.01,
        subsample_factor=1.0,
    )

    run_experiment_lite(
        algo.train(),
        exp_prefix="trpo",
        n_parallel=4,
        snapshot_mode="last",
        seed=seed,
        mode="local"
    )
def run_task(_): """Implement the run_task method needed to run experiments with rllab.""" sumo_params = SumoParams(sumo_binary="sumo-gui", sim_step=0.2, restart_instance=True) # RL vehicles constitute 5% of the total number of vehicles vehicles = Vehicles() vehicles.add(veh_id="human", acceleration_controller=(IDMController, { "noise": 0.2 }), speed_mode="no_collide", num_vehicles=5) vehicles.add(veh_id="rl", acceleration_controller=(RLController, {}), speed_mode="no_collide", num_vehicles=0) # Vehicles are introduced from both sides of merge, with RL vehicles # entering from the highway portion as well inflow = InFlows() inflow.add(veh_type="human", edge="inflow_highway", vehs_per_hour=(1 - RL_PENETRATION) * FLOW_RATE, departLane="free", departSpeed=10) inflow.add(veh_type="rl", edge="inflow_highway", vehs_per_hour=RL_PENETRATION * FLOW_RATE, departLane="free", departSpeed=10) inflow.add(veh_type="human", edge="inflow_merge", vehs_per_hour=100, departLane="free", departSpeed=7.5) additional_env_params = { "target_velocity": 25, "num_rl": NUM_RL, "max_accel": 1.5, "max_decel": 1.5 } env_params = EnvParams(horizon=HORIZON, sims_per_step=5, warmup_steps=0, additional_params=additional_env_params) additional_net_params = ADDITIONAL_NET_PARAMS.copy() additional_net_params["merge_lanes"] = 1 additional_net_params["highway_lanes"] = 1 additional_net_params["pre_merge_length"] = 500 net_params = NetParams(in_flows=inflow, no_internal_links=False, additional_params=additional_net_params) initial_config = InitialConfig(spacing="uniform", lanes_distribution=float("inf")) scenario = MergeScenario(name="merge-rl", generator_class=MergeGenerator, vehicles=vehicles, net_params=net_params, initial_config=initial_config) env_name = "WaveAttenuationMergePOEnv" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) env = normalize(env) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(32, 32, 32), ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=HORIZON * N_ROLLOUTS, max_path_length=HORIZON, n_itr=1000, # whole_paths=True, discount=0.999, ) algo.train(),
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(Arm3dKeyEnv(ctrl_cost_coeff=v['ctrl_cost_coeff'])) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) fixed_start_generator = FixedStateGenerator(state=v['start_goal']) env = GoalStartExplorationEnv( env=inner_env, start_generator=fixed_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[-1 * v['goal_size']: ], # the goal are the last 9 coords terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], inner_weight=v['inner_weight'], goal_weight=v['goal_weight'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=v['policy_hidden_sizes'], # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) if v['baseline'] == 'linear': baseline = LinearFeatureBaseline(env_spec=env.spec) elif v['baseline'] == 'g_mlp': baseline = GaussianMLPBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) # load the state collection from data_upload load_dir = 'data_upload/state_collections/' all_feasible_starts = pickle.load( open( osp.join(config.PROJECT_PATH, load_dir, 'all_feasible_states.pkl'), 'rb')) # all_feasible_starts = pickle.load( # open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_04_230000.pkl'), 'rb')) # all_feasible_starts = pickle.load( # open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_med_rad4.pkl'), 'rb')) all_feasible_starts2 = pickle.load( open( osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_min_rad4.pkl'), 'rb')) all_feasible_starts3 = pickle.load( open( osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_max_rad2.pkl'), 'rb')) print("we have %d feasible starts" % all_feasible_starts.size) all_starts = StateCollection(distance_threshold=v['coll_eps']) brownian_starts = StateCollection( distance_threshold=v['regularize_starts']) logger.log( 'Generating seed starts from the goal (horizon 10, subsample 600 of them)' ) with algo.env.set_kill_outside(radius=v['kill_radius']): seed_starts = generate_starts( env, starts=[v['start_goal']], horizon=10, # this is smaller as they are seeds! 
variance=v['brownian_variance'], subsample=v['num_new_starts']) # , animated=True, speedup=10) # seed_starts = all_feasible_starts.states # with env.set_kill_outside(radius=0.4): # find_all_feasible_states(env, seed_starts, distance_threshold=0.1, brownian_variance=1, animate=False) # # show where these states are: # shuffled_starts = np.array(all_feasible_starts.state_list) # np.random.shuffle(shuffled_starts) # generate_starts(env, starts=shuffled_starts, horizon=100, variance=v['brownian_variance'], # zero_action=True, animated=True, speedup=10) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") with algo.env.set_kill_outside(radius=v['kill_radius']): starts = generate_starts(algo.env, starts=seed_starts, horizon=v['brownian_horizon'], variance=v['brownian_variance']) # regularization of the brownian starts brownian_starts.empty() brownian_starts.append(starts) starts = brownian_starts.sample(size=v['num_new_starts']) if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: old_starts = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, old_starts]) with ExperimentLogger(log_dir, 50 * (outer_iter // 50 + 1), snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") algo.env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) # algo.start_worker() logger.log("Training the algorithm") algo.current_itr = 0 trpo_paths = algo.train(already_init=outer_iter > 1) # import pdb; pdb.set_trace() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [starts, labels] = label_states_from_paths( trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=False, env=algo.env) paths = [path for paths in trpo_paths for path in paths] else: logger.log("labeling starts manually") labels, paths = label_states(starts, algo.env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached', full_path=True) with logger.tabular_prefix("OnStarts_"): algo.env.log_diagnostics(paths) logger.record_tabular('brownian_starts', brownian_starts.size) start_classes, text_labels = convert_label(labels) total_starts = labels.shape[0] logger.record_tabular('GenStarts_evaluated', total_starts) start_class_frac = OrderedDict( ) # this needs to be an ordered dict!! 
(for the log tabular) for k in text_labels.keys(): frac = np.sum(start_classes == k) / total_starts logger.record_tabular('GenStart_frac_' + text_labels[k], frac) start_class_frac[text_labels[k]] = frac labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.log("Labeling on uniform starts") with logger.tabular_prefix("Uniform_4med_"): unif_starts = all_feasible_starts.sample(500) unif_starts = np.pad(unif_starts, ((0, v['start_size'] - unif_starts.shape[1])), 'constant') mean_reward, paths = evaluate_states(unif_starts, algo.env, policy, v['horizon'], n_traj=1, key='goal_reached', as_goals=False, full_path=True) algo.env.log_diagnostics(paths) # with logger.tabular_prefix("Uniform_4med_bis_"): # unif_starts = all_feasible_starts.sample(200) # unif_starts1bis = np.pad(unif_starts, ((0, v['start_size'] - unif_starts.shape[1])), 'constant') # mean_reward1bis, paths1bis = evaluate_states(unif_starts1bis, algo.env, policy, v['horizon'], n_traj=1, # key='goal_reached', as_goals=False, full_path=True) # algo.env.log_diagnostics(paths1bis) # with logger.tabular_prefix("Uniform_4min_"): # unif_starts2 = all_feasible_starts2.sample(200) # unif_starts2 = np.pad(unif_starts2, ((0, v['start_size'] - unif_starts2.shape[1])), 'constant') # mean_reward2, paths2 = evaluate_states(unif_starts2, algo.env, policy, v['horizon'], n_traj=1, # key='goal_reached', as_goals=False, full_path=True) # algo.env.log_diagnostics(paths2) # with logger.tabular_prefix("Uniform_2max_"): # unif_starts3 = all_feasible_starts3.sample(200) # unif_starts3 = np.pad(unif_starts3, ((0, v['start_size'] - unif_starts3.shape[1])), 'constant') # mean_reward3, paths3 = evaluate_states(unif_starts3, algo.env, policy, v['horizon'], n_traj=1, # key='goal_reached', as_goals=False, full_path=True) # algo.env.log_diagnostics(paths3) logger.dump_tabular(with_prefix=True) # append new states to list of all starts (replay buffer): logger.log("Appending good goals to replay and generating seeds") filtered_raw_starts = [ start for start, label in zip(starts, labels) if label[0] == 1 ] all_starts.append(filtered_raw_starts) if v['seed_with'] == 'only_goods': if len(filtered_raw_starts) > 0: seed_starts = filtered_raw_starts elif np.sum(start_classes == 0) > np.sum( start_classes == 1): # if more low reward than high reward seed_starts = all_starts.sample( 300) # sample them from the replay else: # add a tone of noise if all the states I had ended up being high_reward! with algo.env.set_kill_outside(radius=v['kill_radius']): seed_starts = generate_starts( algo.env, starts=starts, horizon=int(v['horizon'] * 10), subsample=v['num_new_starts'], variance=v['brownian_variance'] * 10) elif v['seed_with'] == 'all_previous': all_starts.append(starts) seed_starts = starts elif v['seed_with'] == 'on_policy': with algo.env.set_kill_outside(radius=v['kill_radius']): seed_starts = generate_starts(algo.env, policy, horizon=v['horizon'], subsample=v['num_new_starts'])
    hidden_sizes=(42, 42))

baseline = LinearFeatureBaseline(env_spec=env.spec)

vg = instrument.VariantGenerator()
vg.add("seed", [1, 2, 3, 4, 5])
variants = vg.variants()

for variant in variants:

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=90000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.1,
        optimizer_args={'cg_iters': 100},
        plot=True,
    )

    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=8,
        plot=True,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used,
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! if log_dir is None: log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug" debug = True else: debug = False report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'])) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, #obs2goal_transform=lambda x: x[:int(len(x) / 2)], obs2goal_transform=lambda x: x[:v['goal_size']], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], goal_weight=v['goal_weight'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) outer_iter = 0 if not debug and not v['fast_mode']: logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) report.new_row() sagg_riac = SaggRIAC(state_size=v['goal_size'], state_range=v['goal_range'], state_center=v['goal_center'], max_goals=v['max_goals'], max_history=v['max_history']) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) raw_goals = sagg_riac.sample_states(num_samples=v['num_new_goals']) goals = raw_goals with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") env.update_goal_generator( UniformListStateGenerator( goals, persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) all_paths = algo.train() if v['use_competence_ratio']: [goals, rewards ] = compute_rewards_from_paths(all_paths, key='competence', as_goal=True, env=env, terminal_eps=v['terminal_eps']) else: [goals, rewards] = compute_rewards_from_paths(all_paths, key='rewards', as_goal=True, env=env) [goals_with_labels, labels] = label_states_from_paths(all_paths, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(goals_with_labels, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) 
sagg_riac.plot_regions_interest(maze_id=v['maze_id'], report=report) sagg_riac.plot_regions_states(maze_id=v['maze_id'], report=report) logger.log("Updating SAGG-RIAC") sagg_riac.add_states(goals, rewards) # Find final states "accidentally" reached by the agent. final_goals = compute_final_states_from_paths(all_paths, as_goal=True, env=env) sagg_riac.add_accidental_states(final_goals, v['extend_dist_rew']) logger.dump_tabular(with_prefix=False) report.new_row()
# Created by Xingyu Lin, 10/06/2018
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from envs.square2d.square2d_nongoal import Square2dEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.misc.instrument import run_experiment_lite

env = normalize(Square2dEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=1000,
    discount=0.99,
    step_size=0.01,
)
algo.train()
env = normalize(CartpoleEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=step_size,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)

run_experiment_lite(
    algo.train(),
    exp_prefix="first_exp",
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
stub(globals())

# Param ranges
seeds = range(2)
# SwimmerGather hierarchical task
mdp_classes = [SwimmerGatherEnv]
mdps = [NormalizedEnv(env=mdp_class()) for mdp_class in mdp_classes]
param_cart_product = itertools.product(mdps, seeds)

for mdp, seed in param_cart_product:

    policy = GaussianMLPPolicy(env_spec=mdp.spec, hidden_sizes=(64, 32))

    baseline = LinearFeatureBaseline(mdp.spec)

    batch_size = 50000
    algo = TRPO(
        env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=500,
        n_itr=10000,
        step_size=0.01,
        subsample_factor=1.0,
    )

    run_experiment_lite(
        algo.train(),
        exp_prefix="trpo",
        n_parallel=4,
        snapshot_mode="last",
        seed=seed,
        mode="local"
    )
"mean_network": None, "hidden_sizes": (100, 50, 25), "hidden_nonlinearity": NL.tanh, "optimizer": base_line_optimizer, "use_trust_region": True, "step_size": 0.01, "learn_std": True, "init_std": 1.0, "adaptive_std": False, "std_share_network": False, "std_hidden_sizes": (32, 32), "std_nonlinearity": None, "normalize_inputs": True, "normalize_outputs": True, }) algo = TRPO( env=env, policy=policy, baseline=baseline, n_itr=total_iter, max_path_length=max_path_length, experiment_spec=experiment_spec, save_policy_every=save_policy_every, batch_size=batch_size, discount=0.995, gae_lambda=0.98, ) algo.train(),
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy
from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = normalize(CartpoleEnv())

policy = GaussianGRUPolicy(
    env_spec=env.spec,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=10,
    discount=0.99,
    step_size=0.01,
    optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
)

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    seed=1,
)
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true', default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)
    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--conv', action='store_true', default=False)
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str, help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False,
                        help='Whether to only print the tabular log information (in a horizontal format)')

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)
    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.sample_maps:
        map_pool = np.load(args.map_file)
    else:
        if args.map_type == 'rectangle':
            env_map = TwoDMaps.rectangle_map(*map(int, args.rectangle.split(',')))
        elif args.map_type == 'complex':
            env_map = TwoDMaps.complex_map(*map(int, args.rectangle.split(',')))
        else:
            raise NotImplementedError()
        map_pool = [env_map]

    env = PursuitEvade(map_pool,
                       n_evaders=args.n_evaders,
                       n_pursuers=args.n_pursuers,
                       obs_range=args.obs_range,
                       n_catch=args.n_catch,
                       train_pursuit=args.train_pursuit,
                       urgency_reward=args.urgency,
                       surround=args.surround,
                       sample_maps=args.sample_maps,
                       constraint_window=args.constraint_window,
                       flatten=args.flatten,
                       reward_mech=args.reward_mech,
                       catchr=args.catchr,
                       term_pursuit=args.term_pursuit)

    env = RLLabEnv(
        StandardizedEnv(env, scale_reward=args.reward_scale, enable_obsnorm=False),
        mode=args.control)

    if args.recurrent:
        if args.conv:
            feature_network = ConvNetwork(
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16, 16),
                conv_filter_sizes=(3, 3, 3),
                conv_strides=(1, 1, 1),
                conv_pads=('VALID', 'VALID', 'VALID'),
                hidden_sizes=(64,),
                hidden_nonlinearity=NL.rectify,
                output_nonlinearity=NL.softmax)
        else:
            feature_network = MLP(
                input_shape=(env.spec.observation_space.flat_dim +
                             env.spec.action_space.flat_dim,),
                output_dim=5,
                hidden_sizes=(128, 128, 128),
                hidden_nonlinearity=NL.tanh,
                output_nonlinearity=None)
        if args.recurrent == 'gru':
            policy = CategoricalGRUPolicy(env_spec=env.spec,
                                          feature_network=feature_network,
                                          # GRU hidden dimension taken from the first policy hidden size
                                          hidden_dim=args.hidden_sizes[0])
    elif args.conv:
        feature_network = ConvNetwork(
            input_shape=env.spec.observation_space.shape,
            output_dim=5,
            conv_filters=(8, 16, 16),
            conv_filter_sizes=(3, 3, 3),
            conv_strides=(1, 1, 1),
            conv_pads=('valid', 'valid', 'valid'),
            hidden_sizes=(64,),
            hidden_nonlinearity=NL.rectify,
            output_nonlinearity=NL.softmax)
        policy = CategoricalMLPPolicy(env_spec=env.spec, prob_network=feature_network)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        mode=args.control,
    )

    algo.train()
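The excerpt above ends at algo.train() and does not show how main() is invoked; presumably the script uses a standard entry-point guard along these lines (a minimal sketch, not from the original source):

if __name__ == '__main__':
    main()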
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    v_enter = 10
    inner_length = 300
    long_length = 100
    short_length = 300
    n = 3
    m = 3
    num_cars_left = 1
    num_cars_right = 1
    num_cars_top = 1
    num_cars_bot = 1
    tot_cars = (num_cars_left + num_cars_right) * m \
        + (num_cars_bot + num_cars_top) * n

    grid_array = {
        "short_length": short_length,
        "inner_length": inner_length,
        "long_length": long_length,
        "row_num": n,
        "col_num": m,
        "cars_left": num_cars_left,
        "cars_right": num_cars_right,
        "cars_top": num_cars_top,
        "cars_bot": num_cars_bot
    }

    sumo_params = SumoParams(sim_step=1, render=True)

    vehicles = Vehicles()
    vehicles.add(veh_id="idm",
                 acceleration_controller=(SumoCarFollowingController, {}),
                 sumo_car_following_params=SumoCarFollowingParams(
                     min_gap=2.5,
                     tau=1.1,
                     max_speed=v_enter),
                 routing_controller=(GridRouter, {}),
                 num_vehicles=tot_cars,
                 speed_mode="all_checks")

    tl_logic = TrafficLights(baseline=False)

    additional_env_params = {
        "target_velocity": 50,
        "switch_time": 3.0,
        "num_observed": 2,
        "discrete": False,
        "tl_type": "controlled"
    }
    env_params = EnvParams(additional_params=additional_env_params)

    additional_net_params = {
        "speed_limit": 35,
        "grid_array": grid_array,
        "horizontal_lanes": 1,
        "vertical_lanes": 1
    }
    initial_config, net_params = get_flow_params(10, 300, n, m,
                                                 additional_net_params)

    scenario = SimpleGridScenario(name="grid-intersection",
                                  vehicles=vehicles,
                                  net_params=net_params,
                                  initial_config=initial_config,
                                  traffic_lights=tl_logic)

    env_name = "PO_TrafficLightGridEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=800,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
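run_task above follows rllab's deferred-execution pattern, where the function is handed to a launcher rather than called directly. A minimal launcher sketch, assuming rllab's run_experiment_lite is importable; the seed, n_parallel, and exp_prefix values here are illustrative, not taken from the original:

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,               # the run_task defined above
    n_parallel=1,           # illustrative: number of parallel sampling workers
    snapshot_mode="last",   # keep only the final iteration's snapshot
    seed=0,                 # illustrative seed
    exp_prefix="grid-intersection",  # hypothetical experiment name
)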
    )
baseline = LinearFeatureBaseline(env_spec=env.spec)

if DEBUG:
    n_itr = 5
else:
    n_itr = config.num_iter

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=config.batch_size,
    max_path_length=env.horizon,
    n_itr=n_itr,
    discount=config.discount,
    step_size=config.step_size,
    gae_lambda=config.gae_lambda,
    num_workers=config.num_workers,
    plot_learning_curve=config.plot_learning_curve,
    trial=agent_num,
)
avg_rewards, std_rewards = algo.train()
print("training completed!")
saveModel(algo.policy,
          'policy_{}_config_{}_agent_{}'.format(dynamic_environments[args.env_ind],
                                                args.config_num, agent_num))

# save rewards per model over the iterations
# also plot the rewards
if config.plot_learning_curve:
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    tf_session = tf.Session()

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'],
                                                    bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')
    plot_policy_means(policy, env, sampling_res=2, report=report,
                      limit=v['start_range'], center=v['start_center'])
    # test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
    #                      itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'])

    # GAN
    logger.log("Instantiating the GAN...")
    gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key}
    for key, value in gan_configs.items():
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(
                stddev=gan_configs[key + '_stddev'])

    gan = StateGAN(
        state_size=v['start_size'],
        evaluater_size=v['num_labels'],
        state_range=v['start_range'],
        state_center=v['start_center'],
        state_noise_level=v['start_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )

    logger.log("pretraining the GAN...")
    if v['smart_init']:
        feasible_starts = generate_starts(
            env, starts=[v['ultimate_goal']], horizon=50)  # without giving the policy it does brownian motion
        labels = np.ones((feasible_starts.shape[0], 2)).astype(np.float32)  # make them all good goals
        plot_labeled_states(feasible_starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'])
        dis_loss, gen_loss = gan.pretrain(states=feasible_starts, outer_iters=v['gan_outer_iters'])
        print("Loss of Gen and Dis: ", gen_loss, dis_loss)
    else:
        gan.pretrain_uniform(outer_iters=500, report=report)  # v['gan_outer_iters'])

    # log first samples from the GAN
    initial_starts, _ = gan.sample_states_with_noise(v['num_new_starts'])

    logger.log("Labeling the starts")
    labels = label_states(initial_starts, env, policy, v['horizon'], as_goals=False,
                          n_traj=v['n_traj'], key='goal_reached')

    plot_labeled_states(initial_starts, labels, report=report, itr=outer_iter,
                        limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'])
    report.new_row()

    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)

        # Sample GAN
        logger.log("Sampling starts from the GAN")
        raw_starts, _ = gan.sample_states_with_noise(v['num_new_starts'])

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([raw_starts, old_starts])
        else:
            starts = raw_starts

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(), persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
                as_goal=False, env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts, env, policy, v['horizon'], as_goals=False,
                                         n_traj=v['n_traj'], key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'])

        logger.log('Generating the Heatmap...')
        plot_policy_means(policy, env, sampling_res=2, report=report,
                          limit=v['start_range'], center=v['start_center'])
        test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'],
                             sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=outer_iter, report=report,
                             limit=v['goal_range'], center=v['goal_center'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Training the GAN")
        if np.any(labels):
            gan.train(
                starts, labels,
                v['gan_outer_iters'],
            )

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): not the low reward ones!!
        filtered_raw_start = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        all_starts.append(filtered_raw_start)
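The end of each outer iteration collapses the two label columns into a single success flag and keeps only the starts labeled 1 for the replay buffer. A self-contained numpy illustration of that filtering step on toy data (the values are made up):

import numpy as np

starts = np.array([[0.0, 0.1], [0.5, 0.5], [0.9, 0.2]])  # toy start states
labels = np.array([[1, 1], [1, 0], [0, 0]])               # toy per-criterion labels

# same reduction as above: a start counts as "good" only if both label columns are 1
labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

filtered = [start for start, label in zip(starts, labels) if label[0] == 1]
print(filtered)  # only the first toy start survives the filter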
def train(num_experiments, thread_id, queue):

    ############ DEFAULT PARAMETERS ############

    env_name = None                       # Name of adversarial environment
    path_length = 1000                    # Maximum episode length
    layer_size = tuple([100, 100, 100])   # Layer definition
    ifRender = False                      # Should we render?
    afterRender = 100                     # After how many iterations to animate
    n_exps = 1                            # Number of training instances to run
    n_itr = 25                            # Number of iterations of the alternating optimization
    n_pro_itr = 1                         # Number of iterations for the protagonist
    n_adv_itr = 1                         # Number of iterations for the adversary
    batch_size = 4000                     # Number of training samples for each iteration
    ifSave = True                         # Should we save?
    save_every = 100                      # Save checkpoint every save_every iterations
    n_process = 1                         # Number of parallel threads for sampling environment
    adv_fraction = 0.25                   # Fraction of maximum adversarial force to be applied
    step_size = 0.01                      # KL step size for TRPO
    gae_lambda = 0.97                     # gae_lambda for learner
    save_dir = './results'                # Folder to save results in

    ############ ENV SPECIFIC PARAMETERS ############

    env_name = 'Walker2dAdv-v1'
    layer_size = tuple([64, 64])
    step_size = 0.1
    gae_lambda = 0.97
    batch_size = 25000
    n_exps = num_experiments
    n_itr = 500
    ifSave = False
    n_process = 4
    adv_fraction = 5.0
    save_dir = './../results/StaticWalker'

    args = [
        env_name, path_length, layer_size, ifRender, afterRender, n_exps,
        n_itr, n_pro_itr, n_adv_itr, batch_size, save_every, n_process,
        adv_fraction, step_size, gae_lambda, save_dir
    ]

    ############ ADVERSARIAL POLICY LOAD ############

    filepath = './../initial_results/Walker/env-Walker2dAdv-v1_Exp1_Itr1500_BS25000_Adv0.25_stp0.01_lam0.97_507500.p'
    res_D = pickle.load(open(filepath, 'rb'))
    pretrained_adv_policy = res_D['adv_policy']

    ############ MAIN LOOP ############

    ## Initializing summaries for the tests ##
    const_test_rew_summary = []
    rand_test_rew_summary = []
    step_test_rew_summary = []
    rand_step_test_rew_summary = []
    adv_test_rew_summary = []

    ## Preparing file to save results in ##
    save_prefix = 'static_env-{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}_{}'.format(
        env_name, n_exps, n_itr, batch_size, adv_fraction, step_size,
        gae_lambda, random.randint(0, 1000000))
    save_name = save_dir + '/' + save_prefix

    ## Looping over experiments to carry out ##
    for ne in range(n_exps):
        ## Environment definition ##
        ## The second argument in GymEnv defines the relative magnitude of adversary. For testing we set this to 1.0.
        env = normalize(GymEnv(env_name, adv_fraction))
        env_orig = normalize(GymEnv(env_name, 1.0))

        ## Protagonist policy definition ##
        pro_policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=layer_size,
                                       is_protagonist=True)
        pro_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Zero Adversary for the protagonist training ##
        zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                                is_protagonist=False,
                                                constant_val=0.0)

        ## Adversary policy definition ##
        adv_policy = pretrained_adv_policy
        adv_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Initializing the parallel sampler ##
        parallel_sampler.initialize(n_process)

        ## Optimizer for the Protagonist ##
        pro_algo = TRPO(env=env,
                        pro_policy=pro_policy,
                        adv_policy=adv_policy,
                        pro_baseline=pro_baseline,
                        adv_baseline=adv_baseline,
                        batch_size=batch_size,
                        max_path_length=path_length,
                        n_itr=n_pro_itr,
                        discount=0.995,
                        gae_lambda=gae_lambda,
                        step_size=step_size,
                        is_protagonist=True)

        ## Setting up summaries for testing for a specific training instance ##
        pro_rews = []
        adv_rews = []
        all_rews = []
        const_testing_rews = []
        const_testing_rews.append(
            test_const_adv(env_orig, pro_policy, path_length=path_length))
        rand_testing_rews = []
        rand_testing_rews.append(
            test_rand_adv(env_orig, pro_policy, path_length=path_length))
        step_testing_rews = []
        step_testing_rews.append(
            test_step_adv(env_orig, pro_policy, path_length=path_length))
        rand_step_testing_rews = []
        rand_step_testing_rews.append(
            test_rand_step_adv(env_orig, pro_policy, path_length=path_length))
        adv_testing_rews = []
        adv_testing_rews.append(
            test_learnt_adv(env, pro_policy, adv_policy, path_length=path_length))

        ## Beginning alternating optimization ##
        for ni in range(n_itr):
            logger.log('\n\nThread: {} Experiment: {} Iteration: {}\n'.format(
                thread_id, ne, ni,))

            ## Train Protagonist
            pro_algo.train()
            pro_rews += pro_algo.rews
            all_rews += pro_algo.rews
            logger.log('Protag Reward: {}'.format(
                np.array(pro_algo.rews).mean()))

            ## Test the learnt policies
            const_testing_rews.append(
                test_const_adv(env, pro_policy, path_length=path_length))
            rand_testing_rews.append(
                test_rand_adv(env, pro_policy, path_length=path_length))
            step_testing_rews.append(
                test_step_adv(env, pro_policy, path_length=path_length))
            rand_step_testing_rews.append(
                test_rand_step_adv(env, pro_policy, path_length=path_length))
            adv_testing_rews.append(
                test_learnt_adv(env, pro_policy, adv_policy, path_length=path_length))

            if ni % afterRender == 0 and ifRender == True:
                test_const_adv(env, pro_policy, path_length=path_length,
                               n_traj=1, render=True)

            if ni != 0 and ni % save_every == 0 and ifSave == True:
                ## SAVING CHECKPOINT INFO ##
                pickle.dump(
                    {
                        'args': args,
                        'pro_policy': pro_policy,
                        'adv_policy': adv_policy,
                        'zero_test': [const_testing_rews],
                        'rand_test': [rand_testing_rews],
                        'step_test': [step_testing_rews],
                        'rand_step_test': [rand_step_testing_rews],
                        'iter_save': ni,
                        'exp_save': ne,
                        'adv_test': [adv_testing_rews]
                    },
                    open(save_name + '_' + str(ni) + '.p', 'wb'))

        ## Shutting down the optimizer ##
        pro_algo.shutdown_worker()

        ## Updating the test summaries over all training instances
        const_test_rew_summary.append(const_testing_rews)
        rand_test_rew_summary.append(rand_testing_rews)
        step_test_rew_summary.append(step_testing_rews)
        rand_step_test_rew_summary.append(rand_step_testing_rews)
        adv_test_rew_summary.append(adv_testing_rews)

    queue.put([
        const_test_rew_summary, rand_test_rew_summary, step_test_rew_summary,
        rand_step_test_rew_summary, adv_test_rew_summary
    ])

    ############ SAVING MODEL ############
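train() above expects a thread id and a queue and pushes its five reward summaries into the queue when it finishes, so it is presumably launched from a separate driver. A hypothetical launcher sketch using multiprocessing (the original entry point is not shown; the experiment count and thread id are illustrative):

import multiprocessing as mp

if __name__ == '__main__':
    queue = mp.Queue()
    worker = mp.Process(target=train, args=(1, 0, queue))  # 1 experiment, thread_id 0
    worker.start()
    # blocks until train() calls queue.put([...]) with the reward summaries
    summaries = queue.get()
    worker.join()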
def run_task(*_):
    v_enter = 30
    inner_length = 800
    long_length = 100
    short_length = 800
    n = 1
    m = 5
    num_cars_left = 3
    num_cars_right = 3
    num_cars_top = 15
    num_cars_bot = 15
    tot_cars = (num_cars_left + num_cars_right) * m \
        + (num_cars_bot + num_cars_top) * n

    grid_array = {
        "short_length": short_length,
        "inner_length": inner_length,
        "long_length": long_length,
        "row_num": n,
        "col_num": m,
        "cars_left": num_cars_left,
        "cars_right": num_cars_right,
        "cars_top": num_cars_top,
        "cars_bot": num_cars_bot
    }

    sumo_params = SumoParams(sim_step=1, sumo_binary="sumo-gui")

    vehicles = Vehicles()
    vehicles.add(veh_id="idm",
                 acceleration_controller=(SumoCarFollowingController, {}),
                 sumo_car_following_params=SumoCarFollowingParams(
                     minGap=2.5,
                     max_speed=v_enter,
                 ),
                 routing_controller=(GridRouter, {}),
                 num_vehicles=tot_cars,
                 speed_mode="all_checks")

    additional_env_params = {
        "target_velocity": 50,
        "num_steps": 500,
        "control-length": 150,
        "switch_time": 3.0
    }
    env_params = EnvParams(additional_params=additional_env_params)

    additional_net_params = {
        "speed_limit": 35,
        "grid_array": grid_array,
        "horizontal_lanes": 1,
        "vertical_lanes": 1,
        "traffic_lights": True
    }
    initial_config, net_params = get_non_flow_params(10, additional_net_params)

    scenario = SimpleGridScenario(name="grid-intersection",
                                  generator_class=SimpleGridGenerator,
                                  vehicles=vehicles,
                                  net_params=net_params,
                                  initial_config=initial_config)

    env_name = "GreenWaveEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=800,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
from humanoidopt.env import HumanoidOptEnv

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(HumanoidOptEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
)
algo.train()
copyparams.update(modeparams)
copyparams['layer'] = layer
mdp = normalize(GymEnv(params['env'], **copyparams))
for seed in seeds:
    policy = GaussianMLPPolicy(env_spec=mdp.spec,
                               hidden_sizes=(32, 32),
                               init_std=10)

    baseline = LinearFeatureBaseline(mdp.spec)

    batch_size = 50 * 250
    algo = TRPO(env=mdp,
                policy=policy,
                baseline=baseline,
                batch_size=batch_size,
                whole_paths=True,
                max_path_length=50,
                n_itr=200,
                step_size=0.01,
                subsample_factor=1.0,
                **copyparams)

    run_experiment_lite(
        algo.train(),
        exp_prefix="r-inception-same-strike-std2",
        n_parallel=4,
        # dry=True,
        snapshot_mode="all",
        seed=seed,
        mode="ec2_mujoco",
        # terminate_machine=False
    )
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--gae_lambda', type=float, default=0.99)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true', default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--buffer_size', type=int, default=1)
    parser.add_argument('--n_good', type=int, default=3)
    parser.add_argument('--n_hostage', type=int, default=5)
    parser.add_argument('--n_bad', type=int, default=5)
    parser.add_argument('--n_coop_save', type=int, default=2)
    parser.add_argument('--n_coop_avoid', type=int, default=2)
    parser.add_argument('--n_sensors', type=int, default=20)
    parser.add_argument('--sensor_range', type=float, default=0.2)
    parser.add_argument('--save_reward', type=float, default=3)
    parser.add_argument('--hit_reward', type=float, default=-1)
    parser.add_argument('--encounter_reward', type=float, default=0.01)
    parser.add_argument('--bomb_reward', type=float, default=-10.)
    parser.add_argument('--recurrent', action='store_true', default=False)
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str, help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False,
                        help='Whether to only print the tabular log information (in a horizontal format)')

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)
    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))
    centralized = True if args.control == 'centralized' else False

    env = ContinuousHostageWorld(args.n_good,
                                 args.n_hostage,
                                 args.n_bad,
                                 args.n_coop_save,
                                 args.n_coop_avoid,
                                 n_sensors=args.n_sensors,
                                 sensor_range=args.sensor_range,
                                 save_reward=args.save_reward,
                                 hit_reward=args.hit_reward,
                                 encounter_reward=args.encounter_reward,
                                 bomb_reward=args.bomb_reward)

    env = RLLabEnv(StandardizedEnv(env), mode=args.control)

    if args.buffer_size > 1:
        env = ObservationBuffer(env, args.buffer_size)

    if args.recurrent:
        policy = GaussianGRUPolicy(env_spec=env.spec, hidden_sizes=args.hidden_sizes)
    else:
        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        step_size=args.max_kl,
        mode=args.control,
    )

    algo.train()
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'],
                                                   bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[:int(len(x) / 2)],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res,
                         n_traj=v['n_traj'], itr=outer_iter, report=report,
                         center=v['goal_center'], limit=v['goal_range'])
    report.new_row()

    all_goals = StateCollection(distance_threshold=v['coll_eps'])
    total_rollouts = 0

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)

        logger.log("Sampling goals")
        goals = np.array([]).reshape((-1, v['goal_size']))
        k = 0
        while goals.shape[0] < v['num_new_goals']:
            print('good goals collected: ', goals.shape[0])
            logger.log("Sampling and labeling the goals: %d" % k)
            k += 1
            unif_goals = sample_unif_feas(env, samples_per_cell=samples_per_cell)
            labels = label_states(unif_goals, env, policy, v['horizon'],
                                  n_traj=v['n_traj'], key='goal_reached')
            logger.log("Converting the labels")
            init_classes, text_labels = convert_label(labels)
            goals = np.concatenate([goals, unif_goals[init_classes == 2]]).reshape(
                (-1, v['goal_size']))

        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            old_goals = all_goals.sample(v['num_old_goals'])  # todo: replay noise?
            goals = np.vstack([goals, old_goals])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(), persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            algo.train()

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res,
                             n_traj=v['n_traj'], itr=outer_iter, report=report,
                             center=v['goal_center'], limit=v['goal_range'])

        logger.log("Labeling the goals")
        labels = label_states(goals, env, policy, v['horizon'],
                              n_traj=v['n_traj'], key='goal_reached')

        plot_labeled_states(goals, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # rollouts used for labeling (before TRPO itrs):
        num_empty_spaces = len(unwrap_maze(env).find_empty_space())
        logger.record_tabular('LabelingRollouts',
                              k * v['n_traj'] * samples_per_cell * num_empty_spaces)
        total_rollouts += k * v['n_traj'] * samples_per_cell * num_empty_spaces
        logger.record_tabular('TotalLabelingRollouts', total_rollouts)

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): not the low reward ones!!
        filtered_raw_goals = [
            goal for goal, label in zip(goals, labels) if label[0] == 1
        ]
        all_goals.append(filtered_raw_goals)
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    sim_params = AimsunParams(sim_step=0.5, render=False, seed=0)

    vehicles = VehicleParams()
    vehicles.add(veh_id="rl",
                 acceleration_controller=(RLController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 num_vehicles=1)
    vehicles.add(veh_id="idm",
                 acceleration_controller=(IDMController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 num_vehicles=21)

    additional_env_params = {
        "target_velocity": 8,
        "ring_length": None,
        "max_accel": 1,
        "max_decel": 1
    }
    env_params = EnvParams(horizon=HORIZON,
                           additional_params=additional_env_params,
                           warmup_steps=1500)

    additional_net_params = {
        "length": 230,
        "lanes": 1,
        "speed_limit": 30,
        "resolution": 40
    }
    net_params = NetParams(additional_params=additional_net_params)

    initial_config = InitialConfig(spacing="uniform", bunching=50)

    print("XXX name", exp_tag)
    scenario = LoopScenario(exp_tag,
                            vehicles,
                            net_params,
                            initial_config=initial_config)

    env_name = "WaveAttenuationPOEnv"
    simulator = 'aimsun'
    pass_params = (env_name, sim_params, vehicles, env_params, net_params,
                   initial_config, scenario, simulator)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(3, 3),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=15000,
        max_path_length=horizon,
        n_itr=500,
        # whole_paths=True,
        discount=0.999,
        # step_size=v["step_size"],
    )
    algo.train()
env = normalize(CartpoleEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy has three hidden layers with 100, 50, and 25 hidden units.
    hidden_sizes=(100, 50, 25)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=50000,
    max_path_length=500,
    n_itr=500,
    discount=0.99,
    step_size=0.1,
)
rets_per_episode_batchwise = algo.train()
rets_per_episode = [x for lst in rets_per_episode_batchwise for x in lst]

print('mean return over all episodes', np.mean(rets_per_episode))

plt.plot(rets_per_episode, alpha=0.3)
plt.savefig('/tmp/upsi/test_rllab/trpo_cartpole.png')
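The script above saves its plot to a fixed path under /tmp; if that directory does not exist or the script runs headless, plotting and savefig can fail. A small defensive sketch that reuses the same output path; the backend choice is an assumption, not part of the original:

import os
import matplotlib
matplotlib.use('Agg')  # headless-safe backend; chosen here as an assumption
import matplotlib.pyplot as plt

out_dir = '/tmp/upsi/test_rllab'
os.makedirs(out_dir, exist_ok=True)  # ensure the target directory exists before savefig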