def run_task(v):
    record_video = False
    import mujoco_envs.pomdp
    main_env = GymEnv('Peg3d-v0', record_video=record_video)
    # main_env = MultiagentEnv(GymEnv("Swimmer-v1", record_video=record_video))

    # replace raw shadow_envs with wrapped envs
    main_env._shadow_envs = [TfEnv(ProxyEnv(env)) for env in main_env.shadow_envs]
    # main_env._shadow_envs = [TfEnv(normalize(env)) for env in main_env.shadow_envs]

    sub_policies = [AutoMLPPolicy(
        # sub_policies = [BottleneckAutoMLPPolicy(
        name="sub-policy-%s" % i,
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)  # 32)
    ) for i, env in enumerate(main_env.shadow_envs)]

    # reduces the initialization, to discourage pre-committing to an action
    # for sp in sub_policies:
    #     import ipdb; ipdb.set_trace()
    #     sp.get_params()[-3].set_value(sp.get_params()[-3].get_value()*0.01)

    policy = MultiMLPPolicy(
        name="policy",
        env_spec=[env.spec for env in main_env.shadow_envs],
        policies=sub_policies
    )

    baselines = [LinearFeatureBaseline(env_spec=env.spec)
                 for env in main_env.shadow_envs]

    # TODO(cathywu) Start with large batch sizes 100-1000 trajectories
    algo = MultiTRPO(
        env=main_env,
        policy=policy,
        baselines=baselines,
        batch_size=25000,
        whole_paths=True,
        max_path_length=250,
        n_itr=700,
        discount=0.995,
        step_size=v["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
        # NPO_cls=ConsensusNPO,
        NPO_cls=NPO,
        sample_processor_cls=MultiSampleProcessor,
        n_vectorized_envs=40,
    )
    algo.train()
def run_task(v):
    record_video = True
    import multiagent.envs as multiagent_envs  # required for multiagent envs
    # main_env = GymEnv("MultiagentSimple-v0", record_video=record_video)
    # main_env = GymEnv("MultiagentSimpleIndependent-v0", record_video=record_video)
    main_env = GymEnv("MultiagentSimpleSpeakerListener-v0", record_video=record_video)

    # replace raw shadow_envs with wrapped envs
    main_env._shadow_envs = [
        TfEnv(normalize(env)) for env in main_env.shadow_envs
    ]

    sub_policies = [
        AutoMLPPolicy(
            name="sub-policy-%s" % i,
            env_spec=env.spec,
            # The neural network policy should have two hidden layers, each with 32 hidden units.
            hidden_sizes=(32, 32))
        for i, env in enumerate(main_env.shadow_envs)
    ]

    policy = MultiMLPPolicy(
        name="policy",
        env_spec=[env.spec for env in main_env.shadow_envs],
        policies=sub_policies)

    baselines = [
        LinearFeatureBaseline(env_spec=env.spec)
        for env in main_env.shadow_envs
    ]

    algo = MultiTRPO(
        env=main_env,
        policy=policy,
        baselines=baselines,
        batch_size=4000,
        max_path_length=100,
        n_itr=2,
        discount=0.99,
        step_size=v["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
        sample_processor_cls=MultiSampleProcessor,
        n_vectorized_envs=2,
    )
    algo.train()
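# A minimal launcher sketch for the run_task(v) variants above, assuming the
# standard rllab instrument API (rllab.misc.instrument). The exp_prefix, seed,
# and step_size sweep values are illustrative placeholders, not values from the
# original experiments.
from rllab.misc.instrument import VariantGenerator, run_experiment_lite

vg = VariantGenerator()
vg.add("step_size", [0.01, 0.05])  # sweep over the TRPO step size used as v["step_size"]

for v in vg.variants():
    run_experiment_lite(
        run_task,
        exp_prefix="multiagent_trpo",  # hypothetical experiment name
        n_parallel=1,
        snapshot_mode="last",
        seed=1,
        variant=v,
    )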
def run_task(*_): env = normalize( GymEnv("DartWalker3d-v1", record_log=False, record_video=False)) policy = GaussianMLPAuxPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(100, 50, 25), aux_pred_step=3, aux_pred_dim=7, ) #policy = joblib.load('data/local/experiment/walker_aux/policy.pkl') baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPOAux( env=env, policy=policy, baseline=baseline, batch_size=50000, max_path_length=env.horizon, n_itr=500, discount=0.995, step_size=0.01, epopt_epsilon=1.0, epopt_after_iter=0, gae_lambda=0.97, aux_pred_step=3, aux_pred_dim=7, # Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, ) algo.train()
def main(args):
    logger.set_snapshot_dir(args.snapshot_dir)
    logger.set_snapshot_mode("none")
    logger.add_tabular_output(os.path.join(args.snapshot_dir, "tabular.csv"))

    env = GymEnv(args.env_id)

    # Load the AI policy.
    with open(args.ai_policy, "rb") as f:
        env.env.unwrapped.ai_policy = pickle.load(f)

    # If the user provided a starting policy, use it. Otherwise, we start with
    # a fresh policy.
    if args.input_policy is not None:
        with open(args.input_policy, "rb") as f:
            policy = pickle.load(f)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=args.hidden_sizes)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=env.horizon,
        n_itr=args.n_itr,
        discount=args.discount,
        step_size=args.step_size,
        gae_lambda=args.gae_lambda,
    )
    algo.train()

    with open(args.output_policy, "wb") as f:
        pickle.dump(policy, f)
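# A hypothetical argparse driver for the main(args) entry point above. The flag
# names mirror the attributes accessed on args in that function; the defaults
# are illustrative guesses, not the original script's values.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--snapshot-dir", required=True)
    parser.add_argument("--env-id", required=True)
    parser.add_argument("--ai-policy", required=True)
    parser.add_argument("--input-policy", default=None)
    parser.add_argument("--output-policy", required=True)
    parser.add_argument("--hidden-sizes", type=int, nargs="+", default=[32, 32])
    parser.add_argument("--batch-size", type=int, default=4000)
    parser.add_argument("--n-itr", type=int, default=500)
    parser.add_argument("--discount", type=float, default=0.99)
    parser.add_argument("--step-size", type=float, default=0.01)
    parser.add_argument("--gae-lambda", type=float, default=1.0)
    main(parser.parse_args())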
def run_task(*_):
    n_itr = 1000
    env = VaryMassEnv(GymEnv("MyPendulum-v0", record_video=False),
                      m0=0.2, mf=0.3, iters=n_itr)

    policy = GaussianMLP2Policy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 4 hidden units.
        hidden_sizes=(4, 4),  # (128, 128, 128, 128, 128, 128),
        hidden_nonlinearity=tf.nn.relu,  # linearized_tanh # tf.nn.relu # relu_tanh
        # output_nonlinearity=tf.nn.sigmoid
        # idea: define new tf nonlinearity that is a cap, made up of two relus
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=n_itr,
        discount=0.99,
        step_size=0.0075,  # 0.01
        sampler_cls=VectorizedVaryingSampler,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianGRUPolicy(env_spec=env.spec, hidden_sizes=(64, )) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = PPO( env=env, policy=policy, baseline=baseline, batch_size=horizon * 32 * 2, max_path_length=horizon, # whole_paths=True, n_itr=400, discount=0.999, # step_size=0.01, ) algo.train()
def main(num_examples=50, discount=0.99):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/pendulum', n=num_examples)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=2000,
        max_path_length=100,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_traj'):
        with tf.Session():
            algo.train()
def main(eval_reward=False):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    n_experts = 10
    experts = load_latest_experts('plotting/pendulum_final', n=n_experts)
    dirname = 'data/pendulum'  # dir to save logs and images

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        eval_reward=True,
        fig_dir=dirname,
    )

    # with rllab_logdir(algo=algo, dirname='data/pendulum_gcl{}'.format(n_experts)):
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.fig_dirname = dirname
            algo.train()
def run_task(_): env_name = "PlatooningEnv" register( id=env_name+'-v0', entry_point='platooning_env:{}'.format(env_name), max_episode_steps=HORIZON, kwargs={"env_params": ENV_PARAMS} ) env = GymEnv(env_name, record_video=False) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(16, 16, 16), ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=15000, max_path_length=horizon, n_itr=1000, # whole_paths=True, discount=0.999, ) algo.train(),
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)

    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=350,
        epoch_length=350,
        min_pool_size=350,
        n_epochs=600,
        discount=0.99,
        scale_reward=1.0 / 140.0,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )

    data_path = 'data/%s_data_rllab_%s/%s/' % (env_name.replace('-', '_'),
                                               str(algo.__class__.__name__),
                                               exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
def run_task(*_):
    env = TfEnv(
        normalize(GymEnv("Reacher-v1", force_reset=True, record_video=True)))
    # env = TfEnv(normalize(PusherEnv()))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 128 hidden units.
        hidden_sizes=(128, 128))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100 * 500,
        max_path_length=100,
        n_itr=200,
        discount=0.99,
        step_size=0.01,
        force_batch_sampler=True,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
    )
    algo.train()
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/airsim', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=10,
        batch_size=100,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/airsim_gcl'):
        with tf.Session():
            algo.train()
def run_task(vv):
    env = TfEnv(
        normalize(
            GymEnv('HalfCheetah-v1', record_video=False, record_log=False)))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        name="policy")

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=vv["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def test_fric_rob(test_type, file_name, env_name,
                  fric_fractions=np.linspace(0.5, 1.5, 11),
                  fric_bodies=[b'ffoot', b'bfoot'],
                  adv_fraction=1.0,
                  n_traj=5):
    fric_vals = []
    test_rew_summary = []
    test_rew_std_summary = []
    print(file_name)
    res_D = pickle.load(open(file_name, 'rb'))
    P = res_D['pro_policy']
    for ff in fric_fractions:
        env = normalize(GymEnv(env_name, 1.0))
        e = np.array(env.wrapped_env.env.model.geom_friction)
        e = e * ff
        env.wrapped_env.env.model.geom_friction = e
        fric_vals.append(e[0, 0])
        N = np.zeros(n_traj)
        for i in range(n_traj):
            N[i] = test_type(env, P, 1000, 1)
        M = N.mean()
        V = N.std()
        test_rew_summary.append(M)
        test_rew_std_summary.append(V)
    return test_rew_summary, test_rew_std_summary, fric_vals
def run_task(*_): env = normalize( GymEnv("DartHopper-v1", record_log=False, record_video=False)) policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(128, 64), net_mode=0, ) print('trainable parameter size: ', policy.get_param_values(trainable=True).shape) baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0) algo = PPO_Clip_Sym( env=env, policy=policy, baseline=baseline, batch_size=20000, max_path_length=env.horizon, n_itr=200, discount=0.99, step_size=0.02, gae_lambda=0.97, whole_paths=False, observation_permutation=np.array( [0.0001, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), action_permutation=np.array([0.0001, 1, 2]), sym_loss_weight=0.0, ) algo.train()
def run_task(*_):
    # Please note that different environments with different action spaces may require different
    # policies. For example with a Box action space, a GaussianMLPPolicy works, but for a Discrete
    # action space you may need to use a CategoricalMLPPolicy (see the trpo_gym_cartpole.py example)
    env = normalize(GymEnv("Pendulum-v0", record_video=False, force_reset=True))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5, symmetric=False)),
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_): env = normalize(GymEnv("DartWalker3dRestricted-v1") ) #, record_log=False, record_video=False)) policy = GaussianHMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32), subnet_split1=[8, 9, 10, 11, 12, 13, 29, 30, 31, 32, 33, 34], subnet_split2=[14, 15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 40], sub_out_dim=6, option_dim=2, hlc_output_dim=3, ) #policy = joblib.load('data/local/experiment/Walker3d_waist_onlyconcatoption3/policy.pkl') baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=50000, max_path_length=env.horizon, n_itr=500, discount=0.995, step_size=0.01, epopt_epsilon=1.0, epopt_after_iter=0, gae_lambda=0.97, # Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, ) algo.train()
def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example with a Discrete action space, a
    # CategoricalMLPPolicy works, but for a Box action space you may need to use
    # a GaussianMLPPolicy (see the trpo_gym_pendulum.py example)
    env = normalize(GymEnv("CartPole-v0"))

    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 8 hidden units.
        hidden_sizes=(8, 8))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def main(exp_name, ent_wt=0.1, visible_gpus='0', discount=0.99):
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)
    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))

    with tf.Session(config=tf_config) as sess:
        algo = TRPO(
            env=env,
            policy=policy,
            n_itr=3000,
            batch_size=20000,
            max_path_length=1000,
            discount=discount,
            store_paths=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            step_size=0.01,
            entropy_weight=ent_wt,
            sess=sess,
            exp_name=exp_name,
        )

        with rllab_logdir(algo=algo, dirname='data/swimmer'):
            algo.train(sess)
def run_task(*_): env_name = "BottleneckEnv" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=20000, max_path_length=horizon, # whole_paths=True, n_itr=400, discount=0.995, # step_size=0.01, ) algo.train()
def run_task(*_):
    # Please note that different environments with different action spaces may require different
    # policies. For example with a Box action space, a GaussianMLPPolicy works, but for a Discrete
    # action space you may need to use a CategoricalMLPPolicy (see the trpo_gym_cartpole.py example)
    env = TfEnv(GymEnv("MyPendulum-v1", record_video=False))

    policy = GaussianConvPolicy(
        name="policy",
        env_spec=env.spec,
        conv_filters=[3],  # how many conv layers, e.g. this is one layer with 3 filters (I think)
        conv_filter_sizes=[5, 5, 5],
        conv_strides=[3, 3, 3],
        conv_pads=['SAME', 'SAME', 'SAME'],
        # The fully connected part has two hidden layers with 16 and 4 hidden units.
        hidden_sizes=(16, 4),  # (128, 128, 128, 128, 128, 128),
        hidden_nonlinearity=tf.nn.relu,  # linearized_tanh
        output_nonlinearity=None,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=5,  # 4000,
        max_path_length=env.horizon,
        n_itr=2,  # 1000,
        discount=0.99,
        step_size=0.0075,  # 0.01
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    # env = normalize(HalfCheetahEnv())
    env = GymEnv(env_name="MountainCarContinuous-v0", force_reset=True)

    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    baseline = ZeroBaseline(env_spec=env.spec)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100,
        max_path_length=100,
        n_itr=10000,
        discount=0.99,
        optimizer_args=dict(
            learning_rate=0.01,
        )
    )
    algo.train()
def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example with a Discrete action space, a
    # CategoricalMLPPolicy works, but for a Box action space you may need to use
    # a GaussianMLPPolicy (see the trpo_gym_pendulum.py example)
    env = normalize(
        GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))

    # policy = CategoricalMLPPolicy(
    #     env_spec=env.spec,
    #     # The neural network policy should have two hidden layers, each with 32 hidden units.
    #     hidden_sizes=(32, 32)
    # )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 64 hidden units.
        hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    # max_path_length = env.horizon

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=300,
        n_itr=10000,
        discount=0.99,
        # step_size=0.02,
        truncate_local_is_ratio=0.2,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_): env = normalize( GymEnv(env_name="MountainCarContinuous-v0", force_reset=True)) max_path_length = 300 policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers hidden_sizes=(64, 64)) es = OUStrategy(env_spec=env.spec) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64)) algo = DDPG( env=env, policy=policy, es=es, qf=qf, batch_size=100, n_updates_per_sample=1, max_path_length=max_path_length, epoch_length=900, min_pool_size=800, replay_pool_size=5000, n_epochs=1000, discount=0.99, scale_reward=0.1, qf_learning_rate=1e-3, policy_learning_rate=1e-4, ) algo.train()
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    experts = load_latest_experts('data/swimmer', n=5, visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts,
                     state_only=False, fusion=fusion, max_itrs=10)

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=1000,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/swimmer_airl_state_action'):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
def gym_env(name):
    from rllab.envs.gym_env import GymEnv
    return GymEnv(
        name,
        record_video=False,
        log_dir='/tmp/gym-test',  # Ignore gym log.
        record_log=False)
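# Example use of the gym_env helper above; "CartPole-v0" is just an illustrative
# environment id, not one required by the helper.
env = gym_env("CartPole-v0")
print(env.spec.observation_space, env.spec.action_space)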
def run_task(*_): env = normalize(GymEnv("DartWalker2d-v1")) policy = GaussianHMLPPropPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(64, 16), #subnet_split1=[5, 6, 7, 8, 9, 21, 22, 23, 24, 25], #subnet_split2=[10, 11, 12, 13, 14, 26, 27, 28, 29, 30], #sub_out_dim=6, #option_dim=4, sub_out_dim=3, option_dim=2, ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=50000, max_path_length=env.horizon, n_itr=1000, discount=0.99, step_size=0.01, epopt_epsilon=1.0, epopt_after_iter=0, # Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, ) algo.train()
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    ### VGG 11/29/18: Added support for CSV files
    ## this method loads expert data saved as a pickle file
    # experts = load_latest_experts('data/airsim_final', n=1)
    # this one uses csv:
    experts = load_experts('data/airsim_human_data/log.csv', pickle_format=False)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=5000,
        batch_size=60,
        max_path_length=60,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=100,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        n_parallel=0)

    with rllab_logdir(algo=algo, dirname='data/airsim_gail'):
        with tf.Session():
            algo.train()
def main():
    env = TfEnv(GymEnv('Ant-v1', record_video=False, record_log=False))
    experts = load_latest_experts('data/ant', n=50)

    irl_model = GCLDiscrim(
        env_spec=env.spec,
        expert_trajs=experts,
        discrim_arch=disentangled_net)

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=1000,
        discount=0.995,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/ant_airl'):
        with tf.Session():
            algo.train()