Example #1
from typing import Optional

import numpy as np
from tqdm import tqdm

# Assumed project-local imports (not shown in this excerpt): Env, Agent,
# OnlineAgent, interact, compute_return, specific_evaluation, info, log,
# log_video.
def evaluate(  # noqa: C901
        dt: float,
        epoch: int,
        env: Env,
        agent: Agent,
        eval_gap: float,
        time_limit: Optional[float] = None,
        eval_return: bool = False,
        progress_bar: bool = False,
        video: bool = False,
        no_log: bool = False,
        test: bool = False,
        eval_policy: bool = True) -> Optional[float]:
    """Evaluate agent in environment.

    :args dt: time discretization
    :args epoch: index of the current epoch
    :args env: environment
    :args agent: interacting agent
    :args eval_gap: number of normalized epochs (epochs multiplied by dt)
        between two evaluations
    :args time_limit: maximal physical time (number of steps multiplied
        by dt) spent in the environment
    :args eval_return: evaluate the return over a rollout? If False, only
        the specific evaluation is performed.
    :args progress_bar: use a progress bar?
    :args video: log a video of the interaction?
    :args no_log: if True, do not log results
    :args test: log to a different test summary
    :args eval_policy: if the exploitation policy is noisy,
        remove the noise before evaluating

    :return: the evaluated return, or None if no return was evaluated
    """
    log_gap = int(eval_gap / dt)
    agent.eval()
    if not eval_policy and isinstance(agent, OnlineAgent):
        agent.noisy_eval()
    agent.reset()
    R = None
    if eval_return:
        rewards, dones = [], []
        imgs = []
        time_limit = time_limit if time_limit else 10
        nb_steps = int(time_limit / dt)
        info(f"eval> evaluating on a physical time {time_limit}"
             f" ({nb_steps} steps in total)")
        obs = env.reset()
        iter_range = tqdm(range(nb_steps)) if progress_bar else range(nb_steps)
        for _ in iter_range:
            obs, reward, done = interact(env, agent, obs)
            rewards.append(reward)
            dones.append(done)
            if video:
                imgs.append(env.render(mode='rgb_array'))
        R = compute_return(np.stack(rewards, axis=0), np.stack(dones, axis=0))
        tag = "noisy " if not eval_policy else ""
        info(f"eval> At epoch {epoch}, {tag}return: {R}")
        if not no_log:
            if not eval_policy:
                log("Return_noisy", R, epoch)
            elif not video:  # don't log when outputting video
                if not test:
                    log("Return", R, epoch)
                else:
                    log("Return_test", R, epoch)
        if video:
            log_video("demo", epoch, np.stack(imgs, axis=0))

    if not no_log:
        specific_evaluation(epoch, log_gap, dt, env, agent)
    return R
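
A minimal usage sketch for the function above, assuming an environment and a trained agent have already been constructed elsewhere; the env and agent variables and all numeric values here are illustrative, not taken from the source.

# Hypothetical call site (env and agent are assumed to exist).
ret = evaluate(
    dt=0.02,            # environment time discretization
    epoch=10,           # current epoch index
    env=env,            # project Env instance (assumed)
    agent=agent,        # trained Agent instance (assumed)
    eval_gap=1.0,       # normalized epochs between evaluations
    time_limit=10.0,    # physical time to spend in the environment
    eval_return=True,   # compute and log the return
    progress_bar=True)
if ret is not None:
    print(f"Evaluated return: {ret}")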
Example #2
            # Initialise the latent belief, posterior state and action to zeros.
            belief = torch.zeros(1, args.belief_size, device=args.device)
            posterior_state = torch.zeros(1, args.state_size, device=args.device)
            action = torch.zeros(1, env.action_size, device=args.device)
            pbar = tqdm(range(args.max_episode_length // args.action_repeat))
            for t in pbar:
                belief, posterior_state, action, observation, reward, done = update_belief_and_act(
                    args, env, planner, transition_model, encoder,
                    belief, posterior_state, action,
                    observation.to(device=args.device))
                total_reward += reward
                if args.render:
                    env.render()
                if done:
                    pbar.close()
                    break
    print('Average Reward:', total_reward / args.test_episodes)
    env.close()
    quit()
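
# The test loop above relies on update_belief_and_act from the surrounding
# script. Below is a rough, hypothetical sketch of what such a step does in
# a PlaNet-style agent; observe_step is an assumed name, and the real
# transition model's inputs and outputs may differ.
def update_belief_and_act(args, env, planner, transition_model, encoder,
                          belief, posterior_state, action, observation):
    # One filtering step: fold the encoded observation and the last action
    # into the belief and posterior state (observe_step is assumed).
    belief, posterior_state = transition_model.observe_step(
        belief, posterior_state, action, encoder(observation))
    # Plan the next action from the updated latent state (e.g. via CEM).
    action = planner(belief, posterior_state)
    # Execute the first action of the plan in the environment.
    next_observation, reward, done = env.step(action[0].cpu())
    return belief, posterior_state, action, next_observation, reward, done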

# Training (and testing)
for episode in tqdm(range(metrics['episodes'][-1] + 1, args.episodes + 1),
                    total=args.episodes, initial=metrics['episodes'][-1] + 1):
    # Model fitting
    losses = []
    for s in tqdm(range(args.collect_interval)):
        # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
        observations, actions, rewards, nonterminals = D.sample(args.batch_size,