Example No. 1
        output_nonlinearity=tf.nn.tanh,
    )

    baseline = NNBaseline(
        env_spec=env_spec,
        feature_network=cnn,
        hidden_sizes=(128, 64),
        hidden_nonlinearity=tf.nn.relu,
        init_lr=0.001,
        n_itr=5,
    )

    batch_size = 2400
    idle = Idle(
        env=env,
        policy=policy,
        baseline=baseline,
    )

    icm = ICM(
        env,
        idle,
        '/home/fshentu/box3d/trpo_box3d_pixel_v8_tf_idle_normalize_fw_%d' %
        seed,
        feature_dim=256,
        forward_weight=0.2,
        inverse_tanh=True,
        init_learning_rate=1e-4,
        icm_batch_size=128,
        replay_pool_size=1000000,
        n_updates_per_iter=200,
    )
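
All of these snippets assume the rllab TensorFlow sandbox stack. A minimal import sketch follows; the rllab paths are the library's standard ones, while the modules providing ICM, Idle, NNBaseline, and UniformControlPolicy are specific to this project, so those paths are only placeholders.

import tensorflow as tf

from rllab.misc.instrument import stub, run_experiment_lite
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy

# Project-specific modules; adjust these paths to the repository's layout.
# from <project>.algos.icm import ICM
# from <project>.algos.idle import Idle
# from <project>.baselines.nn_baseline import NNBaseline
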
Example No. 2
    mdp = TfEnv(normalize(env=GymEnv(
        'Box3dReach-v10',
        record_video=False,
        log_dir='/tmp/gym_test',
        record_log=False,
    )))

    policy = GaussianMLPPolicy(
        "mlp_policy",
        env_spec=mdp.spec,
        hidden_sizes=(64, 64, 32),
        output_nonlinearity=tf.nn.tanh,
    )

    baseline = LinearFeatureBaseline(mdp.spec)

    batch_size = 50000
    idle = Idle(
        env=mdp,
        policy=policy,
        baseline=baseline,
        n_itr=2000,
    )

    algorithm = ICM(
        mdp,
        idle,
        "/x/mujoco/tfboard_box3d/trpo_box3d_state_v10_tf_icm_idle_%d" % seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.1,
        external_reward_weight=0.0,
        init_learning_rate=1e-4,
        n_updates_per_iter=500,
    )

    run_experiment_lite(algorithm.train())
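
With external_reward_weight=0.0, the agent above trains on intrinsic (curiosity) reward alone. The sketch below shows the reward mixing this parameter presumably controls; the function and variable names are illustrative, not this repository's internals.

def mixed_reward(r_external, r_intrinsic, external_reward_weight):
    # external_reward_weight = 0.0 -> purely curiosity-driven exploration;
    # values near 1.0 mostly pass the environment's own reward through.
    return (external_reward_weight * r_external
            + (1.0 - external_reward_weight) * r_intrinsic)
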
Example No. 3
    mdp = TfEnv(normalize(env=GymEnv(
        'Box3dReach-v2',
        record_video=False,
        log_dir='/tmp/gym_test',
        record_log=False,
    )))

    policy = GaussianMLPPolicy(
        "mlp_policy",
        env_spec=mdp.spec,
        hidden_sizes=(64, 32),
        output_nonlinearity=tf.nn.tanh,
    )

    baseline = LinearFeatureBaseline(mdp.spec)

    batch_size = 5000
    algo = Idle(
        env=mdp,
        policy=policy,
        baseline=baseline,
    )

    algorithm = ICM(
        mdp,
        algo,
        "/home/dianchen/box3d/trpo_box3d_state_v2_tf_idle_%d" % seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.2,
        external_reward_weight=0.99,
        inverse_tanh=True,
        init_learning_rate=1e-4,
    )

    run_experiment_lite(algorithm.train())
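
forward_weight here plausibly plays the role of beta in Pathak et al.'s ICM: it trades the forward-model loss (predicting next-state features) off against the inverse-model loss (predicting the action between states). A sketch of that objective, with assumed names:

def icm_loss(forward_loss, inverse_loss, forward_weight):
    # forward_loss: error predicting phi(s') from (phi(s), a)
    # inverse_loss: error predicting a from (phi(s), phi(s'))
    return forward_weight * forward_loss + (1.0 - forward_weight) * inverse_loss
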
Example No. 4
stub(globals())

# Param ranges
seeds = range(5)

for seed in seeds:
    mdp = TfEnv(normalize(env=GymEnv(
        'Box3dReach-v8',
        record_video=False,
        log_dir='/tmp/gym_test',
        record_log=False,
    )))

    policy = UniformControlPolicy(mdp.spec)

    baseline = LinearFeatureBaseline(mdp.spec)

    batch_size = 5000
    algo = Idle(mdp, policy, baseline, n_itr=1000)

    algorithm = ICM(
        mdp,
        algo,
        "/home/dianchen/box3d/trpo_box3d_state_v8_tf_idle_%d" % seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.2,
        external_reward_weight=0.99,
        inverse_tanh=True,
        init_learning_rate=1e-4,
    )

    run_experiment_lite(algorithm.train(),
                        exp_prefix='trpo_box3d_state_v8_tf_idle',
                        n_parallel=1)
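
stub(globals()) swaps the imported classes for stubs, so every constructor call above records a serializable call specification instead of building a live object; run_experiment_lite then replays the recorded algorithm.train() call inside a managed experiment process. A fuller invocation might look like the sketch below; the extra keyword arguments are standard rllab options, shown as an assumption about what the truncated call contained.

run_experiment_lite(
    algorithm.train(),
    exp_prefix='trpo_box3d_state_v8_tf_idle',  # groups runs under one name
    n_parallel=1,           # number of parallel sampler workers
    seed=seed,              # fix the RNG seed for reproducibility
    snapshot_mode='last',   # keep only the most recent parameter snapshot
)
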
Example No. 5
    policy = GaussianMLPPolicy(
        "mlp_policy",
        env_spec=mdp.spec,
        hidden_sizes=(64, 64, 32),
        output_nonlinearity=tf.nn.tanh,
        clip_action=False,
    )

    baseline = LinearFeatureBaseline(mdp.spec)

    batch_size = 50000
    idle = Idle(
        env=mdp,
        policy=policy,
        baseline=baseline,
        n_itr=1000,
        max_path_length=1000,
        batch_size=batch_size,
        sampler_cls=BatchSampler,
    )

    algorithm = ICM(
        mdp,
        idle,
        "/home/fred/tfboard_path/trpo_box3d_state_v17_tf_icm_idle_%d" % seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.3,
        external_reward_weight=0.0,
        init_learning_rate=1e-4,
        n_updates_per_iter=500,
    )