def test_double_dqn_loss(setup):
    """Double-DQN loss from the algorithm should match a manual computation."""
    algo, env, buff, _, batch_size = setup

    algo._double_q = True
    trainer = Trainer(snapshot_config)
    trainer.setup(algo, env)

    paths = trainer.obtain_episodes(0, batch_size=batch_size)
    buff.add_episode_batch(paths)
    timesteps = buff.sample_timesteps(algo._buffer_batch_size)
    timesteps_copy = copy.deepcopy(timesteps)

    observations = np_to_torch(timesteps.observations)
    rewards = np_to_torch(timesteps.rewards).reshape(-1, 1)
    actions = np_to_torch(timesteps.actions)
    next_observations = np_to_torch(timesteps.next_observations)
    terminals = np_to_torch(timesteps.terminals).reshape(-1, 1)

    next_inputs = next_observations
    inputs = observations
    with torch.no_grad():
        # Double-Q loss: the online network picks the greedy next action...
        selected_actions = torch.argmax(algo._qf(next_inputs), dim=1)
        # ...and the target network evaluates the chosen action.
        selected_actions = selected_actions.long().unsqueeze(1)
        best_qvals = torch.gather(algo._target_qf(next_inputs),
                                  dim=1,
                                  index=selected_actions)

    rewards_clipped = rewards
    y_target = (rewards_clipped +
                (1.0 - terminals) * algo._discount * best_qvals)
    y_target = y_target.squeeze(1)

    # Recompute the TD loss the same way the algorithm's optimizer does.
    qvals = algo._qf(inputs)
    selected_qs = torch.sum(qvals * actions, dim=1)
    qval_loss = F.smooth_l1_loss(selected_qs, y_target)

    algo_loss, algo_targets, algo_selected_qs = algo._optimize_qf(
        timesteps_copy)
    env.close()

    assert (qval_loss.detach() == algo_loss).all()
    assert (y_target == algo_targets).all()
    assert (selected_qs == algo_selected_qs).all()
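# The two tests in this file differ only in how the bootstrap target is
# formed. A minimal side-by-side sketch of the two target rules: `qf` and
# `target_qf` stand for any discrete Q-networks mapping a batch of
# observations to per-action Q-values. This helper is illustrative only and
# not part of the garage API.
def _target_rule_sketch(qf, target_qf, next_obs):
    # Double DQN (van Hasselt et al., 2016): the online network selects the
    # greedy action; the target network evaluates it.
    selected = torch.argmax(qf(next_obs), dim=1, keepdim=True)
    double_q = torch.gather(target_qf(next_obs), dim=1, index=selected)

    # Vanilla DQN: the target network both selects and evaluates.
    vanilla, _ = torch.max(target_qf(next_obs), dim=1)
    return double_q.squeeze(1), vanilla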
def test_dqn_loss(setup):
    """Vanilla DQN loss from the algorithm should match a manual computation."""
    algo, env, buff, _, batch_size = setup

    trainer = Trainer(snapshot_config)
    trainer.setup(algo, env)

    paths = trainer.obtain_episodes(0, batch_size=batch_size)
    buff.add_episode_batch(paths)
    timesteps = buff.sample_timesteps(algo._buffer_batch_size)
    timesteps_copy = copy.deepcopy(timesteps)

    observations = np_to_torch(timesteps.observations)
    rewards = np_to_torch(timesteps.rewards).reshape(-1, 1)
    actions = np_to_torch(timesteps.actions)
    next_observations = np_to_torch(timesteps.next_observations)
    terminals = np_to_torch(timesteps.terminals).reshape(-1, 1)

    next_inputs = next_observations
    inputs = observations
    with torch.no_grad():
        # Vanilla DQN: the target network both selects and evaluates the
        # best next action.
        target_qvals = algo._target_qf(next_inputs)
        best_qvals, _ = torch.max(target_qvals, 1)
        best_qvals = best_qvals.unsqueeze(1)

    rewards_clipped = rewards
    y_target = (rewards_clipped +
                (1.0 - terminals) * algo._discount * best_qvals)
    y_target = y_target.squeeze(1)

    # Recompute the TD loss the same way the algorithm's optimizer does.
    qvals = algo._qf(inputs)
    selected_qs = torch.sum(qvals * actions, dim=1)
    qval_loss = F.smooth_l1_loss(selected_qs, y_target)

    algo_loss, algo_targets, algo_selected_qs = algo._optimize_qf(
        timesteps_copy)
    env.close()

    assert (qval_loss.detach() == algo_loss).all()
    assert (y_target == algo_targets).all()
    assert (selected_qs == algo_selected_qs).all()
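# For reference: these tests assume module-level imports and a `setup`
# fixture defined elsewhere in the file. The sketch below shows one way to
# provide them using garage's public API. The environment, network sizes,
# and hyperparameters are illustrative assumptions, not values taken from
# this file; in the real module the imports would sit at the top.
import copy

import pytest
import torch
import torch.nn.functional as F

from garage.envs import GymEnv
from garage.experiment.deterministic import set_seed
from garage.np.exploration_policies import EpsilonGreedyPolicy
from garage.replay_buffer import PathBuffer
from garage.sampler import LocalSampler
from garage.torch import np_to_torch
from garage.torch.algos import DQN
from garage.torch.policies import DiscreteQFArgmaxPolicy
from garage.torch.q_functions import DiscreteMLPQFunction
from garage.trainer import Trainer

from tests.fixtures import snapshot_config


@pytest.fixture
def setup():
    """Yield (algo, env, replay_buffer, exploration_policy, batch_size)."""
    set_seed(24)
    batch_size = 512
    env = GymEnv('CartPole-v0')
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
    policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
    exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
                                             policy=policy,
                                             total_timesteps=100 * batch_size,
                                             max_epsilon=1.0,
                                             min_epsilon=0.02)
    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length)
    # double_q defaults off here so test_double_dqn_loss can flip it on.
    algo = DQN(env_spec=env.spec,
               policy=policy,
               qf=qf,
               replay_buffer=replay_buffer,
               sampler=sampler,
               exploration_policy=exploration_policy,
               double_q=False,
               steps_per_epoch=10,
               buffer_batch_size=64,
               min_buffer_size=int(1e3),
               discount=0.99)
    return algo, env, replay_buffer, exploration_policy, batch_size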