import os
import time

import numpy as np
import matplotlib.pyplot as plt
from gym.wrappers import Monitor

## Minimum replay memory size before learning starts. The value here is an
## assumption; the DQN paper uses 50,000.
REPLAY_START_SIZE = 50000


def test_one(agent, dir_record, itr):
    agent.env.seed(itr)
    env_record = Monitor(agent.env, directory=dir_record)
    ob = env_record.reset()
    ## Reset the frame window between episodes (mirrors train_one; without
    ## this, stale frames from a previous episode would leak into fs1)
    agent.frame_sequence.reset()
    agent.frame_sequence.insert(atari_img_preprocess(ob))
    while True:
        fs1 = agent.frame_sequence.memory_as_array()
        ## Find next action
        action = agent.next_action()
        ob, reward, done, _ = env_record.step(action)
        agent.frame_sequence.insert(atari_img_preprocess(ob))
        fs2 = agent.frame_sequence.memory_as_array()
        ## Save results into the replay memory, with rewards clipped to [-1, 1]
        agent.replay_memory.insert(fs1, action, np.clip(reward, -1, 1), fs2, done)
        if done:
            break
    #end
    total_reward = env_record.get_episode_rewards()[0]
    env_record.close()
    return total_reward
#end
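## test_one (and train_one below) call atari_img_preprocess, which is not
## defined in this section. The sketch below is a hypothetical stand-in,
## assuming the standard DQN preprocessing: RGB -> grayscale, then resize
## to 84x84 with OpenCV.
import cv2


def atari_img_preprocess(frame):
    ## Convert the raw RGB Atari frame to grayscale
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    ## Downsample to the 84x84 input size used by DQN-style networks
    ## (cv2.resize takes (width, height))
    return cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)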
def train_one(agent, dir_record, seed=None):
    if seed is not None:
        agent.env.seed(seed)
    env_record = Monitor(agent.env, directory=dir_record)
    ob = env_record.reset()
    agent.frame_sequence.reset()
    agent.frame_sequence.insert(atari_img_preprocess(ob))
    while True:
        fs1 = agent.frame_sequence.memory_as_array()
        ## Find next action
        action = agent.next_action()
        ob, reward, done, _ = env_record.step(action)
        agent.frame_sequence.insert(atari_img_preprocess(ob))
        fs2 = agent.frame_sequence.memory_as_array()
        ## Save results into the replay memory
        agent.replay_memory.insert(fs1, action, reward, fs2, done)
        ## Perform learning once the replay memory is warm enough
        if len(agent.replay_memory.memory) >= REPLAY_START_SIZE:
            agent.learn()
        ## If done == True, then this game is finished
        if done:
            break
    #end

    ## Save the model
    agent.save_model(os.path.join(dir_record, 'model.ckpt'))
    total_reward = env_record.get_episode_rewards()[0]
    env_record.close()

    ## Save cost graph per iteration (agent.costs holds (iteration, cost) pairs)
    iterations, costs = zip(*agent.costs)
    fig = plt.figure()
    plt.plot(iterations, costs)
    plt.title('Costs during training the agent')
    plt.xlabel('Iteration')
    plt.ylabel('Cost')
    fig.savefig(os.path.join(dir_record, 'costs.png'))
    plt.close(fig)

    ## Save error graph per iteration
    iterations, errors = zip(*agent.errors)
    fig = plt.figure()
    plt.plot(iterations, errors)
    plt.title('Errors during training the agent')
    plt.xlabel('Iteration')
    plt.ylabel('Error')
    fig.savefig(os.path.join(dir_record, 'errors.png'))
    plt.close(fig)

    return total_reward
#end
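## The agent's frame_sequence is used above only through insert(), reset(),
## and memory_as_array(). A minimal sketch of such a buffer, assuming a fixed
## window over the last 4 preprocessed frames, zero-padded after reset:
from collections import deque


class FrameSequence:
    def __init__(self, length=4, frame_shape=(84, 84)):
        self.length = length
        self.frame_shape = frame_shape
        self.reset()

    def reset(self):
        ## Start each episode from an all-zero window
        self.memory = deque(
            [np.zeros(self.frame_shape, dtype=np.float32)] * self.length,
            maxlen=self.length)

    def insert(self, frame):
        ## Appending to a full deque drops the oldest frame automatically
        self.memory.append(frame)

    def memory_as_array(self):
        ## Stack the window along the last axis: shape (H, W, length)
        return np.stack(self.memory, axis=-1)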
def run(seed, episodes, batch_size, gamma, inverting_gradients,
        initial_memory_threshold, replay_memory_size, epsilon_steps,
        tau_actor, tau_actor_param, use_ornstein_noise, learning_rate_actor,
        learning_rate_actor_param, title, epsilon_final, clip_grad, beta,
        scale_actions, split, indexed, zero_index_gradients,
        action_input_layer, evaluation_episodes, multipass, weighted,
        average, random_weighted, update_ratio, save_freq, save_dir, layers):
    if save_freq > 0 and save_dir:
        save_dir = os.path.join(save_dir, title + "{}".format(str(seed)))
        os.makedirs(save_dir, exist_ok=True)

    env = make_env(scale_actions)
    log_dir = os.path.join(save_dir, title)
    env = Monitor(env, directory=os.path.join(log_dir, str(seed)),
                  video_callable=False, write_upon_reset=False, force=True)
    # env.seed(seed)  # doesn't work on HFO
    np.random.seed(seed)

    from agents.pdqn_nstep import PDQNNStepAgent
    from agents.pdqn_split_nstep import PDQNNStepSplitAgent
    from agents.pdqn_multipass_nstep import MultiPassPDQNNStepAgent

    ## Select the agent variant; split and multipass are mutually exclusive
    assert not (split and multipass)
    agent_class = PDQNNStepAgent
    if split:
        agent_class = PDQNNStepSplitAgent
    elif multipass:
        agent_class = MultiPassPDQNNStepAgent
    assert action_input_layer >= 0
    if action_input_layer > 0:
        assert split

    agent = agent_class(
        env.observation_space, env.action_space,
        actor_kwargs={
            'hidden_layers': layers,
            'action_input_layer': action_input_layer,
            'activation': 'leaky_relu',
            'output_layer_init_std': 0.01,
        },
        actor_param_kwargs={
            'hidden_layers': layers,
            'activation': 'leaky_relu',
            'output_layer_init_std': 0.01,
        },
        batch_size=batch_size,
        learning_rate_actor=learning_rate_actor,              # 0.0001
        learning_rate_actor_param=learning_rate_actor_param,  # 0.001
        epsilon_steps=epsilon_steps,
        epsilon_final=epsilon_final,
        gamma=gamma,  # 0.99
        tau_actor=tau_actor,
        tau_actor_param=tau_actor_param,
        clip_grad=clip_grad,
        beta=beta,
        indexed=indexed,
        weighted=weighted,
        average=average,
        random_weighted=random_weighted,
        initial_memory_threshold=initial_memory_threshold,
        use_ornstein_noise=use_ornstein_noise,
        replay_memory_size=replay_memory_size,
        inverting_gradients=inverting_gradients,
        zero_index_gradients=zero_index_gradients,
        seed=seed)
    print(agent)

    network_trainable_parameters = sum(
        p.numel() for p in agent.actor.parameters() if p.requires_grad)
    network_trainable_parameters += sum(
        p.numel() for p in agent.actor_param.parameters() if p.requires_grad)
    print("Total Trainable Network Parameters: %d" % network_trainable_parameters)

    max_steps = 15000
    total_reward = 0.
    returns = []
    timesteps = []
    goals = []
    start_time_train = time.time()

    for i in range(episodes):
        if save_freq > 0 and save_dir and i % save_freq == 0:
            agent.save_models(os.path.join(save_dir, str(i)))
        info = {'status': "NOT_SET"}
        state = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        episode_reward = 0.
        agent.start_episode()
        transitions = []
        for j in range(max_steps):
            next_state, reward, terminal, info = env.step(action)
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            # status = info['status']
            # if status != 'IN_GAME':
            #     print(status)

            next_act, next_act_param, next_all_action_parameters = agent.act(next_state)
            next_action = pad_action(next_act, next_act_param)
            ## Buffer the transition; n-step returns are computed after the episode
            transitions.append([
                state,
                np.concatenate(([act], all_action_parameters.data)).ravel(),
                reward,
                next_state,
                np.concatenate(([next_act], next_all_action_parameters.data)).ravel(),
                terminal])

            act, act_param, all_action_parameters = \
                next_act, next_act_param, next_all_action_parameters
            action = next_action
            state = next_state

            episode_reward += reward
            # env.render()

            if terminal:
                break
        agent.end_episode()

        ## Calculate n-step returns and push the whole episode into replay memory
        n_step_returns = compute_n_step_returns(transitions, gamma)
        for t, nsr in zip(transitions, n_step_returns):
            agent.replay_memory.append(state=t[0], action=t[1], reward=t[2],
                                       next_state=t[3], next_action=t[4],
                                       terminal=t[5], time_steps=None,
                                       n_step_return=nsr)

        ## Number of gradient updates proportional to episode length
        n_updates = int(update_ratio * j)
        for _ in range(n_updates):
            agent._optimize_td_loss()

        returns.append(episode_reward)
        timesteps.append(j)
        goals.append(info['status'] == 'GOAL')

        total_reward += episode_reward
        if i % 100 == 0:
            print('{0:5s} R:{1:.4f} r100:{2:.4f}'.format(
                str(i + 1), total_reward / (i + 1),
                np.array(returns[-100:]).mean()))
    end_time_train = time.time()

    if save_freq > 0 and save_dir:
        agent.save_models(os.path.join(save_dir, str(i)))

    returns = env.get_episode_rewards()
    np.save(os.path.join(log_dir, title + "{}".format(str(seed))),
            np.column_stack((returns, timesteps, goals)))

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        ## Disable exploration for evaluation
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        agent.actor.eval()
        agent.actor_param.eval()
        start_time_eval = time.time()
        evaluation_results = evaluate(env, agent, evaluation_episodes)  # returns, timesteps, goals
        end_time_eval = time.time()
        print("Ave. evaluation return =",
              sum(evaluation_results[:, 0]) / evaluation_results.shape[0])
        print("Ave. timesteps =",
              sum(evaluation_results[:, 1]) / evaluation_results.shape[0])
        goal_timesteps = evaluation_results[:, 1][evaluation_results[:, 2] == 1]
        if len(goal_timesteps) > 0:
            ## Average over episodes that ended in a goal, not all episodes
            print("Ave. timesteps per goal =",
                  sum(goal_timesteps) / len(goal_timesteps))
        else:
            print("Ave. timesteps per goal = N/A (no goals scored)")
        print("Ave. goal prob. =",
              sum(evaluation_results[:, 2]) / evaluation_results.shape[0])
        np.save(os.path.join(log_dir, title + "{}e".format(str(seed))),
                evaluation_results)
        print("Evaluation time: %.2f seconds" % (end_time_eval - start_time_eval))
    print("Training time: %.2f seconds" % (end_time_train - start_time_train))
    print(agent)
    env.close()
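## compute_n_step_returns and pad_action are used in run() but not defined in
## this section. Below is a hypothetical sketch of the former, assuming each
## transition is [state, action, reward, next_state, next_action, terminal]
## and that the "n-step return" is the gamma-discounted return-to-go within
## the episode, accumulated backwards from the final step:


def compute_n_step_returns(transitions, gamma):
    returns = np.zeros(len(transitions))
    g = 0.
    for t in reversed(range(len(transitions))):
        ## G_t = r_t + gamma * G_{t+1}, starting from the episode's last step
        g = transitions[t][2] + gamma * g
        returns[t] = g
    return returns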