env=mdp, policy=policy, baseline=baseline, batch_size=batch_size, whole_paths=True, max_path_length=200, n_itr=2000, step_size=0.01, subsample_factor=1.0, optimizer_args={'num_slices': 10}, ) algorithm = ICM( mdp, algo, "/home/fred/box3d/trpo_box3d_state_v10_tf_icm_cos_%d" % seed, feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=0.1, external_reward_weight=0.0, inverse_tanh=True, init_learning_rate=1e-4, ) run_experiment_lite(algorithm.train(), exp_prefix='trpo_box3d_state_v10_tf_icm_cos', n_parallel=1, snapshot_mode="gap", snapshot_gap=200, seed=seed, mode="local")
# Launch fragment for the "trpo_box3d_pixel_v8_tf_idle_normalize" experiment.
# `env`, `policy`, `baseline` and `seed` are defined outside this excerpt.

# NOTE(review): batch_size is not referenced again in the visible lines --
# confirm whether anything else in the file still reads it.
batch_size = 2400

# Idle takes the place of the learner that is handed to ICM below.
idle = Idle(
    env=env,
    policy=policy,
    baseline=baseline,
)

# ICM receives the environment, the Idle instance and a per-seed path.
# NOTE(review): the path is presumably a TensorBoard/log directory; it is a
# hard-coded absolute path under a user's home directory -- confirm it exists
# on the machine running this.
icm = ICM(
    env,
    idle,
    '/home/fshentu/box3d/trpo_box3d_pixel_v8_tf_idle_normalize_fw_%d' % seed,
    feature_dim=256,
    forward_weight=0.2,
    inverse_tanh=True,
    init_learning_rate=1e-4,
    icm_batch_size=128,
    replay_pool_size=1000000,
    n_updates_per_iter=200,
    obs_dtype='uint8',
    normalize_input=True,
)

# NOTE(review): icm.train() is evaluated before run_experiment_lite is called;
# presumably the module is stubbed so this yields a deferred call -- confirm.
run_experiment_lite(icm.train(),
    exp_prefix='trpo_box3d_pixel_v8_tf_idle_normalize',
    n_parallel=1,
    snapshot_mode="gap",
    snapshot_gap=200,
    seed=seed,
    mode="local")
def main():
    """Launch the Box3dReach-v17 TRPO + ICM experiment for seeds 0 and 1.

    Command-line options (all optional):
      --fw_ratio      float, default 0.1; passed to ICM as ``forward_weight``
                      and embedded in the run name.
      --init_lr       float, default 5e-4; passed to ICM as
                      ``init_learning_rate`` and embedded in the run name.
      --tfboard_path  str, default '/tmp/tfboard'; prefix of the per-seed
                      path given to ICM as its third positional argument.
      --gpu_ratio     float, default 0.99; parsed but not read anywhere in
                      this function.

    For each seed an environment, policy, baseline, TRPO instance and ICM
    wrapper are constructed, and ``algorithm.train()`` is handed to
    ``run_experiment_lite``.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters, registered from a (flag, type, default) table.
    cli_options = (
        ('--fw_ratio', float, 0.1),
        ('--init_lr', float, 5e-4),
        ('--tfboard_path', str, '/tmp/tfboard'),
        ('--gpu_ratio', float, 0.99),
    )
    for flag, value_type, default in cli_options:
        parser.add_argument(flag, type=value_type, default=default)
    args = parser.parse_args()

    # The run name depends only on the CLI arguments, so build it once.
    name = 'trpo-state-v17-tf-icm-fw{}-initlr-{}-norm'.format(
        args.fw_ratio, args.init_lr)

    # Param ranges
    for seed in range(2):
        gym_env = GymEnv(
            'Box3dReach-v17',
            record_video=False,
            log_dir='/tmp/gym_test',
            record_log=False,
        )
        mdp = TfEnv(normalize(env=gym_env, normalize_obs=True))

        policy = GaussianMLPPolicy(
            "mlp_policy",
            env_spec=mdp.spec,
            hidden_sizes=(64, 64, 32),
            output_nonlinearity=tf.nn.tanh,
            clip_action=False,
        )
        baseline = LinearFeatureBaseline(mdp.spec)

        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            batch_size=50000,
            whole_paths=True,
            max_path_length=1000,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
            sampler_cls=BatchSampler,
        )

        # Per-seed path: "<tfboard_path>/<name>_<seed>".
        tfboard_dir = args.tfboard_path + "/%s_%d" % (name, seed)
        algorithm = ICM(
            mdp,
            algo,
            tfboard_dir,
            feature_dim=mdp.spec.observation_space.flat_dim,
            forward_weight=args.fw_ratio,
            external_reward_weight=0.0,
            replay_pool_size=1000000,
            init_learning_rate=args.init_lr,
            n_updates_per_iter=1000,
        )

        # NOTE(review): algorithm.train() is evaluated here, before
        # run_experiment_lite is called; presumably the module is stubbed so
        # this yields a deferred call -- confirm.
        run_experiment_lite(
            algorithm.train(),
            exp_prefix=name,
            n_parallel=8,
            snapshot_mode="gap",
            snapshot_gap=200,
            seed=seed,
            mode="local",
        )
) batch_size = 2400 idle = Idle( env=env, policy=policy, baseline=baseline, ) icm = ICM( env, idle, '/home/fred/box3d/trpo_box3d_pixel_v8_tf_idle_%d'%seed, feature_dim=256, forward_weight=0.2, inverse_tanh=True, init_learning_rate=1e-4, icm_batch_size=128, replay_pool_size=1000000, n_updates_per_iter=200, obs_dtype='uint8', ) run_experiment_lite( icm.train(), exp_prefix='trpo_box3d_pixel_v8_tf_idle', n_parallel=1, snapshot_mode="gap", snapshot_gap=200, seed=seed, mode="local"
) baseline = LinearFeatureBaseline(mdp.spec, ) batch_size = 50000 idle = Idle( env=mdp, policy=policy, baseline=baseline, n_itr=2000, ) algorithm = ICM( mdp, idle, "/x/mujoco/tfboard_box3d/trpo_box3d_state_v10_tf_icm_idle_%d" % seed, feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=0.1, external_reward_weight=0.0, init_learning_rate=1e-4, n_updates_per_iter=500, ) run_experiment_lite(algorithm.train(), exp_prefix='trpo_box3d_state_v10_tf_icm_idle', n_parallel=8, snapshot_mode="gap", snapshot_gap=200, seed=seed, mode="local")
batch_size=batch_size, whole_paths=True, max_path_length=200, n_itr=1000, step_size=0.01, subsample_factor=1.0, optimizer_args={'num_slices': 10}, sampler_cls=BatchSampler, ) algorithm = ICM( mdp, algo, "/home/fred/box3d/trpo_box3d_state_v10_tf_icm_cos_new_ext0.95_%d" % seed, feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=0.1, external_reward_weight=0.95, inverse_tanh=True, init_learning_rate=1e-4, n_updates_per_iter=500) run_experiment_lite( algorithm.train(), exp_prefix='trpo_box3d_state_v10_tf_icm_cos_new_ext0.95', n_parallel=8, snapshot_mode="gap", snapshot_gap=200, seed=seed, mode="local")
step_size=0.01, subsample_factor=0.2, sampler_cls=BatchSampler, optimizer_args={ 'num_slices': 4, }) icm = ICM( env, algo, '/z/dianchen/box3d/trpo_box3d_pixel_v11_tf_icm_pretrained_cnn_norew_fw0.01_%d' % seed, forward_weight=0.01, external_reward_weight=0.0, init_learning_rate=1e-4, forward_cos=True, replay_pool_size=100000, n_updates_per_iter=1000, normalize_input=True, obs_dtype='uint8', pretrained_icm=True, pretrained_icm_path= '/z/dianchen/tfmodel_box3d/icm_supervised_box3dpixel_v11_box_dense_2e3_fw_0.01_lr_5e-4.pkl', ) run_experiment_lite( icm.train(), exp_prefix='trpo_box3d_pixel_v11_tf_icm_pretrained_cnn_norew_fw0.01', n_parallel=12, snapshot_mode="gap", snapshot_gap=200,
baseline=baseline, batch_size=batch_size, whole_paths=True, max_path_length=1000, n_itr=1000, step_size=0.01, subsample_factor=1.0, sampler_cls=BatchSampler, ) algorithm = ICM( mdp, algo, "/media/4tb/box3d/trpo_box3d_state_v12_tf_icm_frozen_fw0.1_frozen_%d" % seed, feature_dim=mdp.spec.observation_space.flat_dim, pretrained_icm=True, pretrained_icm_path= "/home/fred/rllab/data/local/trpo-state-v12-tf-icm-fw0.1-initlr-0.001/trpo-state-v12-tf-icm-fw0.1-initlr-0.001_2017_07_16_22_12_20_0001/itr_1000.pkl", freeze_icm=True, external_reward_weight=0.0, ) run_experiment_lite(algorithm.train(), exp_prefix='trpo_box3d_state_v12_tf_icm_frozen_fw0.1', n_parallel=8, snapshot_mode="gap", snapshot_gap=200, seed=seed, mode="local")
baseline=baseline, batch_size=batch_size, whole_paths=True, max_path_length=1000, n_itr=10000, step_size=0.01, subsample_factor=1.0, sampler_cls=BatchSampler, ) algorithm = ICM( mdp, algo, "/x/mujoco/tfboard_box3d/trpo_box3d_state_v12_tf_icm_fw0.3_5e-4_%d"%seed, feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=0.3, external_reward_weight=0.0, replay_pool_size=100000000, init_learning_rate=5e-4, n_updates_per_iter=2000, ) run_experiment_lite( algorithm.train(), exp_prefix='trpo_box3d_state_v12_tf_icm_fw0.3_5e-4', n_parallel=8, snapshot_mode="gap", snapshot_gap=200, seed=seed, mode="local" )
env=mdp, policy=policy, baseline=baseline, batch_size=batch_size, whole_paths=True, max_path_length=200, n_itr=1000, step_size=0.01, subsample_factor=1.0, sampler_cls=BatchSampler, ) algorithm = ICM( mdp, algo, "/z/dianchen/box3d/trpo_box3d_state_v4_tf_icm_ext0.9995_%d" % seed, feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=0.1, external_reward_weight=0.9995, replay_pool_size=500000, init_learning_rate=5e-4, n_updates_per_iter=1000) run_experiment_lite(algorithm.train(), exp_prefix='trpo_box3d_state_v4_tf_icm_ext0.9995', n_parallel=6, snapshot_mode="gap", snapshot_gap=100, seed=seed, mode="local")
env=mdp, policy=policy, baseline=baseline, batch_size=batch_size, whole_paths=True, max_path_length=500, n_itr=1000, step_size=0.01, subsample_factor=1.0, ) algorithm = ICM( mdp, algo, "/data0/dianchen/box3d/trpo_box3d_state_v4_tf_icm", feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=0.2, external_reward_weight=0.0, inverse_tanh=True, init_learning_rate=1e-4, ) run_experiment_lite( algorithm.train(), exp_prefix='trpo_box3d_state_v4_tf_icm', n_parallel=1, snapshot_mode="gap", snapshot_gap=100, seed=seed, mode="local" )
mdp.spec, ) batch_size = 5000 algo = Idle( env=mdp, policy=policy, baseline=baseline, ) algorithm = ICM( mdp, algo, "/home/dianchen/box3d/trpo_box3d_state_v4_tf_icm_idle", no_encoder=False, feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=0.2, external_reward_weight=0.99, inverse_tanh=True, init_learning_rate=1e-3, ) run_experiment_lite( algorithm.train(), exp_prefix='trpo_box3d_state_v4_tf_icm_idle', n_parallel=1, snapshot_mode="gap", snapshot_gap=100, seed=seed, mode="local" )
policy=policy, baseline=baseline, batch_size=batch_size, whole_paths=True, max_path_length=200, n_itr=2000, step_size=0.01, subsample_factor=1.0, ) icm = ICM( env, trpo, '/home/fred/box3d/trpo_box3d_pixel_v7_tf_icm_%d' % seed, feature_dim=256, forward_weight=0.2, inverse_tanh=True, init_learning_rate=1e-4, icm_batch_size=128, replay_pool_size=1000000, n_updates_per_iter=200, ) run_experiment_lite(icm.train(), exp_prefix='trpo_box3d_pixel_v7_tf_icm', n_parallel=1, snapshot_mode="gap", snapshot_gap=200, seed=seed, mode="local")
baseline=baseline, batch_size=batch_size, whole_paths=True, max_path_length=200, n_itr=1000, step_size=0.01, subsample_factor=1.0, sampler_cls=BatchSampler, ) algorithm = ICM( mdp, algo, "/home/fred/box3d/trpo_box3d_state_v11_tf_icm_cos_ext0.995_%d"%seed, feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=0.1, external_reward_weight=0.995, init_learning_rate=1e-4, replay_pool_size=500000, n_updates_per_iter=500 ) run_experiment_lite( algorithm.train(), exp_prefix='trpo_box3d_state_v11_tf_icm_cos_ext0.995', n_parallel=8, snapshot_mode="gap", snapshot_gap=200, seed=seed, mode="local" )