import numpy as np
import torch
import gym
from collections import deque

# Policy, PPO, Storage, test_env, device and the hyperparameters (env_name, seed, n_eps,
# n_steps, clip_param, ppo_epoch, mini_batch_size, value_loss_coef, entropy_coef,
# learning_rate, max_grad_norm) are defined elsewhere in the project.


def main():
    torch.set_num_threads(1)
    torch.manual_seed(0)
    env = gym.make(env_name)
    env.seed(seed)
    print('New model')
    policy = Policy('actor_critic', env.observation_space.shape[0], env.action_space.n)
    policy.to(device)
    optimizer = PPO(policy, clip_param, ppo_epoch, mini_batch_size, value_loss_coef,
                    entropy_coef, learning_rate, max_grad_norm)

    episode_rewards = deque(maxlen=50)
    for eps in range(0, n_eps + 1):
        state = env.reset()
        storage = Storage(device=device)

        # Evaluate the current policy on a fresh environment and report a running average.
        policy.eval()
        episode_rewards.append(test_env(policy, gym.make(env_name)))
        if eps % 5 == 0:
            print('Avg reward', np.mean(episode_rewards))

        # Collect a rollout of n_steps transitions with the current policy.
        for step in range(n_steps):
            state = torch.FloatTensor(state).to(device)
            with torch.no_grad():
                value, action, log_prob = policy.act(state)
            next_state, reward, done, _ = env.step(action.item())
            storage.push(state, action, log_prob, value, reward, done)
            state = next_state
            if done:
                state = env.reset()

        # Bootstrap from the value of the last observed state and compute returns.
        next_state = torch.FloatTensor(next_state).to(device)
        with torch.no_grad():
            next_value = policy.get_value(next_state).detach()
        storage.compute(next_value)

        # Run the PPO update on the collected rollout and log the losses.
        policy.train()
        value_loss, action_loss, dist_entropy = optimizer.update(storage)
        with open('metrics.csv', 'a') as metrics:
            metrics.write('{},{},{}\n'.format(value_loss, action_loss, dist_entropy))
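# The loop above depends on a Storage rollout buffer whose implementation is not shown in
# this excerpt. The class below is only an illustrative sketch of what push()/compute()
# might do, assuming plain discounted returns with a hypothetical `gamma` argument; the
# actual class may instead compute GAE advantages and batch tensors for the PPO update.
class Storage:
    def __init__(self, device, gamma=0.99):
        self.device = device
        self.gamma = gamma
        self.states, self.actions, self.log_probs = [], [], []
        self.values, self.rewards, self.dones = [], [], []
        self.returns = []

    def push(self, state, action, log_prob, value, reward, done):
        # Store one transition from the rollout.
        self.states.append(state)
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.values.append(value)
        self.rewards.append(reward)
        self.dones.append(done)

    def compute(self, next_value):
        # Walk the rollout backwards, bootstrapping from next_value and resetting the
        # return at episode boundaries (done == True).
        R = next_value
        self.returns = [None] * len(self.rewards)
        for t in reversed(range(len(self.rewards))):
            R = self.rewards[t] + self.gamma * R * (1.0 - float(self.dones[t]))
            self.returns[t] = R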
import argparse
import logging
from typing import Dict, Tuple

# ParallelAgentsWrapper and Policy are imported from the project's own modules.


def play_full_episode(agents: ParallelAgentsWrapper, policy: Policy, step: int, params: argparse.Namespace,
                      is_train: bool) \
        -> Tuple[ParallelAgentsWrapper, int, bool, bool, float, int, Dict[str, float]]:
    eval_required = False
    checkpoint_reached = False
    epoch_reward = 0
    # Restart all the agents.
    rewards, terminals, states, terminals_due_to_timeout, success = agents.perform_actions(
        ['new game' for _ in range(params.number_of_agents)], is_train)
    log_dict = {}
    start_step = step
    successful_agents = [0 for _ in range(params.number_of_agents)]
    while not all([t or t is None for t in terminals]):  # Loop ends only when all agents have terminated.
        action = policy.get_action(states, is_train)
        rewards, terminals, states, terminals_due_to_timeout, success = agents.perform_actions(action, is_train)

        # rewards is a list. Passing it to update_observation changes its values, hence all references should be
        # performed prior to calling update_observation.
        for idx, reward in enumerate(rewards):
            if reward is not None:
                epoch_reward += reward
                if success[idx]:
                    successful_agents[idx] = 1

        logging.debug('step: %s, reward: %s, terminal: %s, terminal_due_to_timeout: %s, success: %s',
                      step, rewards, terminals, terminals_due_to_timeout, success)
        policy.update_observation(rewards, terminals, terminals_due_to_timeout, success, is_train)
        if is_train:
            single_log_dict = policy.train(states)
        else:
            single_log_dict = {}

        step += 1
        if step % params.eval_frequency == 0:
            eval_required = True
        if step % params.checkpoint_interval == 0:
            checkpoint_reached = True

        # Accumulate per-step training statistics.
        for item in single_log_dict:
            if item in log_dict:
                log_dict[item] = log_dict[item] + single_log_dict[item]
            else:
                log_dict[item] = single_log_dict[item]

    # Average the accumulated statistics over the number of steps taken this episode.
    for item in log_dict:
        log_dict[item] = log_dict[item] * 1.0 / (step - start_step)

    return agents, step, eval_required, checkpoint_reached, epoch_reward, sum(successful_agents), log_dict
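# A possible driver for play_full_episode (not part of the original code): `run_training`,
# `params.max_steps` and the checkpoint handling below are assumptions used purely for
# illustration of how the returned flags might be consumed.
def run_training(agents, policy, params):
    step = 0
    while step < params.max_steps:
        agents, step, eval_required, checkpoint_reached, epoch_reward, n_success, log_dict = \
            play_full_episode(agents, policy, step, params, is_train=True)
        logging.info('step %d | epoch reward %.2f | successful agents %d', step, epoch_reward, n_success)
        if eval_required:
            # Run one episode without training updates to measure current performance.
            agents, step, _, _, eval_reward, _, _ = play_full_episode(agents, policy, step, params, is_train=False)
            logging.info('eval reward %.2f', eval_reward)
        if checkpoint_reached:
            pass  # persist the policy here (project-specific)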