    agent.load_models()
    np.random.seed(0)

    score_history = []
    for i in range(200):
        obs = env.reset()
        done = False
        score = 0
        step = 0
        while not done:
            step += 1
            act = agent.choose_action(obs)
            new_state, reward, done, info = env.step(act)
            agent.remember(obs, act, reward, new_state, int(done))
            agent.learn()
            score += reward
            obs = new_state
            env.render()
        score_history.append(score)

        # if i % 25 == 0:
        #     agent.save_models()

        print('episode', i, 'score %.2f' % score,
              'trailing 128 games avg %.3f' % np.mean(score_history[-128:]),
              'finished after', step, 'steps')
    env.close()
    agent.save_models()
    filename = 'MountainCar-alpha000025-beta00025-400-300.png'
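The trailing `filename` suggests the score history was plotted after training; a minimal matplotlib sketch (illustrative only, not part of the original example):

import matplotlib.pyplot as plt

# plot per-episode scores plus a trailing 100-episode average
running_avg = [np.mean(score_history[max(0, i - 99):i + 1])
               for i in range(len(score_history))]
plt.plot(score_history, alpha=0.4, label='score')
plt.plot(running_avg, label='100-episode average')
plt.xlabel('episode')
plt.ylabel('score')
plt.legend()
plt.savefig(filename)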
Example #2

import os
from collections import deque
from time import gmtime, strftime

import numpy as np
import torch

# `Agent` is assumed to be the DDPG agent class shipped alongside this
# example (e.g. a ddpg_agent module); it is not defined here.
def ddpg(env,
         state_size,
         action_size,
         num_agents,
         brain_name,
         n_episodes=1000,       # maximum number of training episodes
         max_t=1000,            # maximum environment steps per episode
         print_every=10,        # episodes between logging / checkpointing
         title=None,            # experiment name; used for the save directory
         batch_size=128,        # replay-buffer minibatch size
         gamma=0.99,            # discount factor
         tau=1e-3,              # soft-update rate for the target networks
         lr_actor=1e-4,         # actor learning rate
         lr_critic=1e-3,        # critic learning rate
         weight_decay=0,        # L2 penalty for the critic optimizer
         device="cuda:0",       # torch device to train on
         fc1_units=128,         # hidden-layer sizes of the actor/critic nets
         fc2_units=64,
         n_updates=10,          # gradient updates per learning step
         update_intervals=20):  # environment steps between learning steps

    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=2,
                  num_agents=num_agents,
                  batch_size=batch_size,
                  gamma=gamma,
                  tau=tau,
                  lr_actor=lr_actor,
                  lr_critic=lr_critic,
                  weight_decay=weight_decay,
                  device=device,
                  fc1_units=fc1_units,
                  fc2_units=fc2_units)
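
    # (the Agent is assumed to own the local/target actor and critic networks,
    #  the replay buffer, and the exploration noise that agent.reset() clears)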

    # create save directory
    if title is None:
        title = "experiment"
    current_time = strftime("%Y-%m-%d_%H-%M-%S", gmtime())  # avoid ':' in paths (invalid on Windows)
    title = title + "_" + current_time

    # write a new file
    os.makedirs("experiments/{}".format(title), exist_ok=True)
    f = open("experiments/{}/scores.txt".format(title), "w")
    f.close()

    scores_deque = deque(maxlen=100)
    mean_scores = []

    for i_episode in range(1, n_episodes + 1):

        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations

        scores = np.zeros(num_agents)
        agent.reset()

        for t in range(max_t):
            # 1. Observe states and act with the current policy mu_theta + exploration noise
            actions = agent.act(states)

            # 2. Execute the actions in the environment and observe (s, a, r, s', d)
            env_info = env.step(actions)[
                brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished

            # 3. save experiences to the replay buffer
            agent.remember(states, actions, rewards, next_states, dones)

            # 4. learn by sampling from the replay buffer
            # if it is time to update, for however many updates
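            # (agent.update is assumed to gate on the step counter, roughly
            #  `if t % update_intervals == 0`, then sample a minibatch and run
            #  the DDPG update n_updates times)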
            agent.update(n_updates, update_intervals, t)

            scores += rewards  # update the score (for each agent)
            states = next_states  # roll over states to next time step

            if np.any(dones):
                break
        scores_deque.append(np.mean(scores))
        print('\rEpisode {}\tAverage score (last 100): {:.2f}'.format(
            i_episode, np.mean(scores_deque)),
              end="")

        # save score and model every print_every
        if i_episode % print_every == 0:
            f = open("experiments/{}/scores.txt".format(title), "a")
            f.write("{},{}\n".format(i_episode, np.mean(scores_deque)))
            f.close()
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
            mean_scores.append(np.mean(scores_deque))
            # save if best model
            if np.mean(scores_deque) == max(mean_scores):
                torch.save(agent.actor_local.state_dict(),
                           'experiments/{}/checkpoint_actor.pth'.format(title))
                torch.save(
                    agent.critic_local.state_dict(),
                    'experiments/{}/checkpoint_critic.pth'.format(title))

            if np.mean(scores_deque) >= 30:
                print("\rEnvironment solved with average score of 30")
                break

    return mean_scores, title
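
A hedged usage sketch, assuming the Udacity `unityagents` wrapper around a local Reacher build (the file name below is a placeholder):

from unityagents import UnityEnvironment

env = UnityEnvironment(file_name='Reacher.app')  # placeholder path to the build
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# query sizes from an initial reset so they match the environment
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size

mean_scores, title = ddpg(env, state_size, action_size, num_agents, brain_name)
env.close()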