Example #1
import numpy as np

from d3rlpy.dataset import Episode, TransitionMiniBatch
from d3rlpy.metrics.scorer import soft_opc_scorer


# parametrized pytest case from d3rlpy's test suite; DummyAlgo is a stub
# algorithm defined alongside the tests, not part of the public API
def test_soft_opc_scorer(observation_shape, action_size, n_episodes,
                         episode_length, threshold):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.matmul(observations, A).astype('f4')
        rewards = np.random.random((episode_length, 1)).astype('f4')
        episode = Episode(observation_shape, action_size,
                          observations.astype('f4'), actions, rewards)
        episodes.append(episode)

    algo = DummyAlgo(A, 0.0)
    success_values = []
    all_values = []
    for episode in episodes:
        is_success = episode.compute_return() >= threshold
        batch = TransitionMiniBatch(episode.transitions)
        values = algo.predict_value(batch.observations, batch.actions)
        if is_success:
            success_values += values.tolist()
        all_values += values.tolist()

    scorer = soft_opc_scorer(threshold)
    score = scorer(algo, episodes)
    assert np.allclose(score, np.mean(success_values) - np.mean(all_values))
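The assertion at the end spells out what soft_opc_scorer measures: the mean estimated value over transitions from episodes whose return clears the threshold, minus the mean estimated value over all transitions. Below is a minimal standalone restatement of that formula (an illustrative sketch mirroring the test above, not the library's implementation; it assumes `algo` exposes d3rlpy's `predict_value(observations, actions)` and `episodes` is a list of d3rlpy `Episode` objects).

import numpy as np

from d3rlpy.dataset import TransitionMiniBatch


def soft_opc_by_hand(algo, episodes, threshold):
    # mean value on transitions from "successful" episodes minus mean value
    # on all transitions; a larger gap means the value estimates separate
    # successful episodes from the rest more clearly
    success_values, all_values = [], []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        values = algo.predict_value(batch.observations, batch.actions)
        if episode.compute_return() >= threshold:
            success_values += values.tolist()
        all_values += values.tolist()
    return float(np.mean(success_values) - np.mean(all_values))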
Example #2
# imports assumed; all of these scorers live in d3rlpy.metrics.scorer
from d3rlpy.metrics.scorer import (
    average_value_estimation_scorer, continuous_action_diff_scorer,
    discounted_sum_of_advantage_scorer, discrete_action_match_scorer,
    initial_state_value_estimation_scorer, soft_opc_scorer,
    td_error_scorer, value_estimation_std_scorer)


# build the evaluation scorers for fit(); soft_opc uses 80% of the best
# return observed in the dataset as its success threshold
def _get_scorers(discrete_action, dataset_stats):
    scorers = {}
    scorers["td_error"] = td_error_scorer
    scorers["discounted_sum_of_advantage"] = discounted_sum_of_advantage_scorer
    scorers["value_scale"] = average_value_estimation_scorer
    scorers["value_standard_deviation"] = value_estimation_std_scorer
    scorers["initial_state_value"] = initial_state_value_estimation_scorer
    scorers["soft_opc"] = soft_opc_scorer(0.8 * dataset_stats["return"]["max"])
    if discrete_action:
        scorers["action_match"] = discrete_action_match_scorer
    else:
        scorers["action_difference"] = continuous_action_diff_scorer
    return scorers
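A dict like this plugs straight into fit() through the scorers argument. The sketch below is an assumed usage, not from the original project: it takes dataset_stats from MDPDataset.compute_stats() (consistent with the dataset_stats["return"]["max"] lookup above) and reuses the hopper dataset from example #3 below.

from sklearn.model_selection import train_test_split

from d3rlpy.algos import CQL
from d3rlpy.datasets import get_pybullet

dataset, env = get_pybullet('hopper-bullet-mixed-v0')
# assumption: dataset_stats has the layout returned by compute_stats()
dataset_stats = dataset.compute_stats()

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

cql = CQL(n_epochs=100, use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers=_get_scorers(discrete_action=False,
                             dataset_stats=dataset_stats))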
Example #3
# additional imports (assumed) so the snippet runs standalone
from sklearn.model_selection import train_test_split

from d3rlpy.algos import CQL
from d3rlpy.datasets import get_pybullet
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer
from d3rlpy.metrics.scorer import soft_opc_scorer
from d3rlpy.ope import FQE

dataset, env = get_pybullet('hopper-bullet-mixed-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# train algorithm
cql = CQL(n_epochs=100, use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env),
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })

# or load the trained model
# cql = CQL.from_json('<path-to-json>/params.json')
# cql.load_model('<path-to-model>/model.pt')

# evaluate the trained policy
fqe = FQE(algo=cql,
          n_epochs=200,
          q_func_factory='qr',
          learning_rate=1e-4,
          use_gpu=True,
          encoder_params={'hidden_units': [1024, 1024, 1024, 1024]})
fqe.fit(dataset.episodes,
        eval_episodes=dataset.episodes,
        # the listing is truncated here; a typical completion passes the
        # same scorers used above, e.g.:
        scorers={
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })
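The 600 passed to soft_opc_scorer is a hand-picked return threshold for the hopper task. Example #2 above derives the threshold from the data instead; a short sketch of that approach, assuming the stats come from MDPDataset.compute_stats():

# derive the success threshold from the dataset instead of hard-coding it
stats = dataset.compute_stats()
threshold = 0.8 * stats['return']['max']
soft_opc = soft_opc_scorer(threshold)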
Example #4
File: fqe_atari.py  Project: wx-b/d3rlpy
# imports assumed so the snippet runs standalone
from sklearn.model_selection import train_test_split

from d3rlpy.algos import DiscreteCQL
from d3rlpy.datasets import get_atari
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer
from d3rlpy.metrics.scorer import soft_opc_scorer
from d3rlpy.ope import DiscreteFQE

dataset, env = get_atari('breakout-expert-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# train algorithm
cql = DiscreteCQL(n_epochs=100,
                  scaler='pixel',
                  q_func_factory='qr',
                  n_frames=4,
                  use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env, epsilon=0.05),
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(70)
        })

# or load the trained model
# cql = DiscreteCQL.from_json('<path-to-json>/params.json')
# cql.load_model('<path-to-model>/model.pt')

# evaluate the trained policy
fqe = DiscreteFQE(algo=cql,
                  n_epochs=100,
                  q_func_factory='qr',
                  learning_rate=1e-4,
                  scaler='pixel',
                  n_frames=4,
                  discrete_action=True,
                  use_gpu=True)
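The listing stops at the DiscreteFQE constructor. By analogy with example #3, the next step would fit FQE on the logged episodes while tracking the same scorers; the call below is an assumed continuation, not part of the original file.

# assumed continuation, mirroring example #3
fqe.fit(dataset.episodes,
        eval_episodes=dataset.episodes,
        scorers={
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(70)
        })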