Example #1
def experiment(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    epoch_discount_schedule = epoch_discount_schedule_class(
        **variant['epoch_discount_schedule_params'])
    algorithm = DDPG(env,
                     exploration_strategy=es,
                     qf=qf,
                     policy=policy,
                     epoch_discount_schedule=epoch_discount_schedule,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
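For reference, this experiment function pulls everything it needs from the variant dictionary: the environment class, the epoch discount schedule class with its parameters, and the DDPG keyword arguments. A minimal sketch of such a dictionary follows; HalfCheetahEnv, ConstantSchedule, and the contents of algo_params are illustrative assumptions, not fixed by the example itself.

variant = dict(
    env_class=HalfCheetahEnv,                         # assumed environment class
    epoch_discount_schedule_class=ConstantSchedule,   # assumed schedule class
    epoch_discount_schedule_params=dict(value=0.99),  # assumed schedule kwargs
    algo_params=dict(),                               # DDPG keyword arguments go here
)
experiment(variant)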
Example #2
def experiment(variant):
    # env = HalfCheetahEnv()
    # env = PointEnv()
    env = gym_env("Pendulum-v0")
    # env = HopperEnv()
    horizon = variant['algo_params']['max_path_length']
    env = TimeLimitedEnv(env, horizon)
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = MultiStepDdpg(env,
                              exploration_strategy=es,
                              qf=qf,
                              policy=policy,
                              **variant['algo_params'])
    algorithm.train()
    return algorithm.final_score
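The variant here only needs algo_params, and max_path_length does double duty: it is read off first to set the TimeLimitedEnv horizon, then passed through to MultiStepDdpg. An illustrative call is sketched below; any keys beyond max_path_length depend on the MultiStepDdpg constructor and are assumptions.

variant = dict(
    algo_params=dict(
        max_path_length=200,   # also becomes the TimeLimitedEnv horizon
    ),
)
final_score = experiment(variant)   # this version of the function returns algorithm.final_score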
Example #3
def example(variant):
    env = CartpoleEnv()
    env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        **variant['qf_params'],
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
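Unlike the other examples, the Q-function's hidden sizes come from the variant (qf_params is unpacked after the observation and action dimensions) while the policy sizes stay hardcoded at 400 and 300. A hypothetical configuration is sketched below; the keyword names inside qf_params must match FeedForwardQFunction's actual signature and are assumptions here.

variant = dict(
    qf_params=dict(
        observation_hidden_size=400,   # assumed keyword name
        embedded_hidden_size=300,      # assumed keyword name
    ),
    algo_params=dict(),                # DDPG keyword arguments
)
example(variant)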
Example #4
def example(variant):
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
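This variant adds a normalize flag that decides whether HalfCheetahEnv is wrapped before training, alongside the usual algo_params. A minimal sketch:

variant = dict(
    normalize=True,        # wrap the environment with normalize()
    algo_params=dict(),    # DDPG keyword arguments
)
example(variant)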
Example #5
def experiment(variant):
    env = variant['env_class'](**variant['env_params'])
    env = normalize(env)
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_params']
    )
    algo_class = variant['algo_class']
    algo_params = variant['algo_params']
    hidden_size = variant['hidden_size']
    if algo_class == DDPG:
        # algo_params.pop('naf_policy_learning_rate')
        qf = FeedForwardQFunction(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        algorithm = DDPG(
            env,
            exploration_strategy=es,
            qf=qf,
            policy=policy,
            **variant['algo_params']
        )
    elif algo_class == NAF:
        # NAF trains a single NafPolicy rather than separate Q-function and
        # policy networks, so the extra Q-function learning rate is dropped
        algo_params.pop('qf_learning_rate')
        # algo_params.pop('policy_learning_rate')
        qf = NafPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
        )
        algorithm = NAF(
            env,
            policy=qf,
            exploration_strategy=es,
            **variant['algo_params']
        )
    else:
        raise Exception("Invalid algo class: {}".format(algo_class))
    algorithm.to(ptu.device)
    algorithm.train()
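One variant drives both branches: algo_class selects between DDPG and NAF, hidden_size sets the network widths, and algo_params should hold the superset of keyword arguments, since the NAF branch removes qf_learning_rate before constructing the algorithm. A sketch, where the environment class and the OUStrategy and learning-rate keyword names are assumptions:

variant = dict(
    env_class=HalfCheetahEnv,       # assumed environment class
    env_params=dict(),
    es_params=dict(max_sigma=0.3),  # assumed OUStrategy kwargs
    algo_class=DDPG,                # or NAF
    algo_params=dict(
        qf_learning_rate=1e-3,      # popped in the NAF branch
    ),
    hidden_size=300,
)
experiment(variant)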
Example #6
def experiment(variant):
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))

    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    plotter = QFPolicyPlotter(
        qf=qf,
        # policy=policy,
        policy=exploration_policy,
        obs_lst=np.array([[-2.5, 0.0],
                          [0.0, 0.0],
                          [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        render_eval_paths=True,
        plotter=plotter,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
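Here only algo_params comes from the variant; the MultiGoalEnv cost coefficients, the 100-unit hidden layers, and the QFPolicyPlotter (which evaluates the Q-function at three fixed 2-D observations) are all fixed inside the function. An illustrative call:

variant = dict(
    algo_params=dict(),   # DDPG keyword arguments; render_eval_paths and plotter are already set above
)
experiment(variant)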
Example #7
def example(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = DDPG(env,
                     exploration_strategy=es,
                     qf=qf,
                     policy=policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
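This last example is the minimal pattern: the variant only has to name the environment class and the DDPG keyword arguments. A sketch, with HalfCheetahEnv as an assumed stand-in:

variant = dict(
    env_class=HalfCheetahEnv,   # assumed environment class
    algo_params=dict(),         # DDPG keyword arguments
)
example(variant)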