# NOTE(review): fragment is truncated at BOTH ends — the call that this first
# keyword argument belongs to (a network/policy constructor defined above this
# chunk) and the closing of the ICM(...) call are outside the visible span.
    output_nonlinearity=tf.nn.tanh,
)
# Neural-network value baseline fed by the CNN feature network `cnn`
# (defined outside the visible span — TODO confirm).
baseline = NNBaseline(
    env_spec=env_spec,
    feature_network=cnn,
    hidden_sizes=(128, 64),
    hidden_nonlinearity=tf.nn.relu,
    init_lr=0.001,
    n_itr=5,
)
batch_size = 2400  # NOTE(review): unused in the visible span — confirm downstream use
# Idle "algorithm" — presumably collects rollouts without optimizing the
# policy; verify against the Idle implementation.
idle = Idle(
    env=env,
    policy=policy,
    baseline=baseline,
)
# ICM (presumably an Intrinsic Curiosity Module) wrapping the idle algorithm;
# first string argument is the per-seed log/tensorboard directory.
icm = ICM(
    env,
    idle,
    '/home/fshentu/box3d/trpo_box3d_pixel_v8_tf_idle_normalize_fw_%d' % seed,
    feature_dim=256,
    forward_weight=0.2,
    inverse_tanh=True,
    init_learning_rate=1e-4,
    icm_batch_size=128,
    replay_pool_size=1000000,
    n_updates_per_iter=200,
# NOTE(review): fragment is truncated at the end — the run_experiment_lite(...)
# call continues past this chunk. `seed` is defined outside the visible span.
# Gym Box3dReach-v10 task, wrapped for TF with observation normalization.
mdp = TfEnv(normalize(env=GymEnv('Box3dReach-v10',record_video=False, \
    log_dir='/tmp/gym_test',record_log=False)))
# Gaussian MLP policy with tanh-squashed mean output.
policy = GaussianMLPPolicy(
    "mlp_policy",
    env_spec=mdp.spec,
    hidden_sizes=(64, 64, 32),
    output_nonlinearity=tf.nn.tanh,
)
baseline = LinearFeatureBaseline(mdp.spec, )
batch_size = 50000  # NOTE(review): unused in the visible span — confirm downstream use
# Idle algorithm — presumably samples rollouts without policy optimization.
idle = Idle(
    env=mdp,
    policy=policy,
    baseline=baseline,
    n_itr=2000,
)
# ICM with external_reward_weight=0.0, i.e. reward comes entirely from the
# intrinsic signal; feature space sized to the flat observation dimension.
algorithm = ICM(
    mdp,
    idle,
    "/x/mujoco/tfboard_box3d/trpo_box3d_state_v10_tf_icm_idle_%d" % seed,
    feature_dim=mdp.spec.observation_space.flat_dim,
    forward_weight=0.1,
    external_reward_weight=0.0,
    init_learning_rate=1e-4,
    n_updates_per_iter=500,
)
run_experiment_lite(algorithm.train(),
# NOTE(review): fragment is truncated at the end — the run_experiment_lite(...)
# call continues past this chunk. `seed` is defined outside the visible span.
# Gym Box3dReach-v2 task, wrapped for TF with observation normalization.
mdp = TfEnv(normalize(env=GymEnv('Box3dReach-v2',record_video=False, \
    log_dir='/tmp/gym_test',record_log=False)))
# Gaussian MLP policy with tanh-squashed mean output.
policy = GaussianMLPPolicy(
    "mlp_policy",
    env_spec=mdp.spec,
    hidden_sizes=(64, 32),
    output_nonlinearity=tf.nn.tanh,
)
baseline = LinearFeatureBaseline(mdp.spec, )
batch_size = 5000  # NOTE(review): unused in the visible span — confirm downstream use
# Idle algorithm — presumably samples rollouts without policy optimization.
algo = Idle(
    env=mdp,
    policy=policy,
    baseline=baseline,
)
# ICM with external_reward_weight=0.99 — mostly environment reward, a small
# intrinsic component; feature space sized to the flat observation dimension.
algorithm = ICM(
    mdp,
    algo,
    "/home/dianchen/box3d/trpo_box3d_state_v2_tf_idle_%d" % seed,
    feature_dim=mdp.spec.observation_space.flat_dim,
    forward_weight=0.2,
    external_reward_weight=0.99,
    inverse_tanh=True,
    init_learning_rate=1e-4,
)
run_experiment_lite(algorithm.train(),
stub(globals()) # Param ranges seeds = range(5) for seed in seeds: mdp = TfEnv(normalize(env=GymEnv('Box3dReach-v8',record_video=False, \ log_dir='/tmp/gym_test',record_log=False))) policy = UniformControlPolicy(mdp.spec) baseline = LinearFeatureBaseline(mdp.spec, ) batch_size = 5000 algo = Idle(mdp, policy, baseline, n_itr=1000) algorithm = ICM( mdp, algo, "/home/dianchen/box3d/trpo_box3d_state_v8_tf_idle_%d" % seed, feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=0.2, external_reward_weight=0.99, inverse_tanh=True, init_learning_rate=1e-4, ) run_experiment_lite(algorithm.train(), exp_prefix='trpo_box3d_state_v8_tf_idle', n_parallel=1,
# Experiment wiring (log path suggests the Box3dReach v17 state task):
# Gaussian MLP policy + linear baseline + Idle sampler, wrapped in an ICM
# module. `mdp` and `seed` are defined in the enclosing scope, outside this
# fragment.

# Policy: Gaussian MLP whose mean is squashed through tanh; raw actions are
# not clipped.
policy = GaussianMLPPolicy(
    "mlp_policy",
    env_spec=mdp.spec,
    hidden_sizes=(64, 64, 32),
    output_nonlinearity=tf.nn.tanh,
    clip_action=False,
)

# Linear feature baseline over the environment's observation spec.
baseline = LinearFeatureBaseline(mdp.spec)

batch_size = 50000

# Idle algorithm configured with an explicit BatchSampler; presumably it only
# gathers samples (no policy optimization) — verify against Idle.
idle = Idle(
    env=mdp,
    policy=policy,
    baseline=baseline,
    n_itr=1000,
    max_path_length=1000,
    batch_size=batch_size,
    sampler_cls=BatchSampler,
)

# ICM wrapper: external_reward_weight is 0.0, so the training signal comes
# entirely from the intrinsic reward; the feature dimension matches the flat
# observation dimension.
algorithm = ICM(
    mdp,
    idle,
    "/home/fred/tfboard_path/trpo_box3d_state_v17_tf_icm_idle_%d" % seed,
    feature_dim=mdp.spec.observation_space.flat_dim,
    forward_weight=0.3,
    external_reward_weight=0.0,
    init_learning_rate=1e-4,
    n_updates_per_iter=500,
)