Example #1
def train(episodes=700, max_t=1000):
    try:
        agent = Agent(state_size=33, action_size=4, seed=0)
        scores = []
        scores_window = deque(maxlen=100)
        config = Config()
        eps = config.EPS_START

        for i_episode in range(1, episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations
            agent.reset()
            score = np.zeros(1)
            for _ in range(max_t + 1):
                action = agent.act(state, eps)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations  # get the next state
                reward = env_info.rewards  # get the reward
                done = env_info.local_done  # see if episode has finished
                score += env_info.rewards
                agent.step(state, action, reward, next_state, done)
                state = next_state

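                # Linearly decay the exploration factor eps every step, clamped at EPS_END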
                eps = eps - config.LIN_EPS_DECAY
                eps = np.maximum(eps, config.EPS_END)

                if np.any(done):
                    break

            scores_window.append(score)
            scores.append(score)

            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 2 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))

            mean = np.mean(scores_window)
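            # Snapshot the weights while the 100-episode average sits just above the 30.0 solve threshold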
            if 30.0 < mean <= 31.5:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))
                torch.save(agent.actor_local.state_dict(),
                           'solved_actor_trained_model.pth')
                torch.save(agent.critic_local.state_dict(),
                           'solved_critic_trained_model.pth')

        torch.save(agent.actor_local.state_dict(), 'actor_trained_model.pth')
        torch.save(agent.critic_local.state_dict(), 'critic_trained_model.pth')
        return scores

    except KeyboardInterrupt:
        torch.save(agent.actor_local.state_dict(),
                   'interrupt_actor_trained_model.pth')
        torch.save(agent.critic_local.state_dict(),
                   'interrupt_critic_trained_model.pth')
        plot_score_chart(scores)
        sys.exit(0)
Example #2
def ddpg_dual(n_episodes=5000, max_t=2000, solved_at=0.5):

    sharedActor = Agent(state_size=state_size,
                        action_size=action_size,
                        random_seed=2)

    avg_score = []
    scores_deque = deque(maxlen=100)

    best_score = 0.0
    env_solved = False

    for i_episode in range(1, n_episodes + 1):
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations  # get the current state (for each agent)

        sharedActor.reset()

        for t in range(max_t):
            actions = sharedActor.act(states)

            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished

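            # Feed every agent's transition to the shared DDPG agent one at a time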
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                sharedActor.step(state, action, reward, next_state, done, t)

            states = next_states
            scores += rewards  # update the score (for each agent)

            if np.any(dones):  # exit loop if episode finished
                break

        score = np.max(scores)
        avg_score.append(score)
        scores_deque.append(score)

        print('\rEpisode:{} \tScore:{:.3f} \tAverage Score: {:.3f} solved:{}'.
              format(i_episode, score, np.mean(scores_deque), env_solved),
              end="")
        if i_episode % 10 == 0:
            print("\n")

        if score > best_score and np.mean(scores_deque) >= solved_at:
            if not env_solved:
                env_solved = True
                print(
                    '\nEnv solved in {:d} episodes!\tAverage Score = {:.3f} over last {} episodes'
                    .format(i_episode - 100, np.mean(scores_deque), 100))
            torch.save(sharedActor.actor_local.state_dict(), "actor.pth")
            torch.save(sharedActor.critic_local.state_dict(), "critic.pth")
            best_score = score
            break

    return avg_score
class DdpgDeepleng():
    def __init__(self):
        # Create the Gym environment
        self.env = gym.make('DeeplengDocking-v1')
        rospy.loginfo("Gym environment done")
        self.agent = Agent(state_size=13, action_size=3, random_seed=2)

        # Set the logging system
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/training_results'
        # env = wrappers.Monitor(env, outdir, force=True)
        # rospy.loginfo("Monitor Wrapper started")
        self.max_episodes = 200
        self.max_timesteps = 1000

    def __call__(self, *args, **kwargs):
        scores = []
        for episode in range(1, self.max_episodes + 1):
            state = self.env.reset()
            self.agent.reset()
            score = 0
            print(
                "=========================================================================="
            )
            print("Episode no. {}".format(episode))
            print(
                "=========================================================================="
            )
            for stp in range(1, self.max_timesteps + 1):
                # print("___________________________________________________________________________")
                print("Step no. {}".format(stp))
                # print("Current state: {}".format([round(elem, 2) for elem in state]))
                print("Current state: {}".format(state))
                action = self.agent.act(np.array(state))
                print("Action taken: {}".format(action))
                next_state, reward, done, _ = self.env.step(action)
                print("Reward for action: {}".format(reward))
                print("Next state: {}".format(next_state))
                self.agent.step(state, action, reward, next_state, done)
                state = np.array(next_state)
                score += reward
                if done:
                    # print("Done")
                    break
                print(
                    "___________________________________________________________________________"
                )
            scores.append(score)
            torch.save(
                self.agent.actor_local.state_dict(),
                '/home/dfki.uni-bremen.de/mpatil/Desktop/checkpoint_actor.pth')
            torch.save(
                self.agent.critic_local.state_dict(),
                '/home/dfki.uni-bremen.de/mpatil/Desktop/checkpoint_critic.pth'
            )
        self.env.close()
        return scores
def ddpg(n_episodes=2000, store_every=10):
    scores_deque = deque(maxlen=store_every)
    scores = []

    agents = Agent(state_size=state_size,
                   action_size=action_size,
                   num_agents=num_agents,
                   random_seed=0)

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
        state = env_info.vector_observations
        agents.reset()
        score = np.zeros(num_agents)
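        # Roll out one episode; the environment reports per-agent rewards and done flags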
        while True:
            action = agents.act(state)

            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            agents.step(state, action, rewards, next_state, dones)
            state = next_state
            score += rewards

            if np.any(dones):
                break
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        avg_score = np.mean(scores_deque)

        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}\t {}'.format(
            i_episode, np.mean(scores_deque), np.mean(score),
            strftime("%H:%M:%S", gmtime())),
              end="")
        if i_episode % store_every == 0 or avg_score >= TARGET_SCORE:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, avg_score))

            if avg_score >= TARGET_SCORE:
                torch.save(agents.actor_local.state_dict(),
                           "ckpt/{}".format(ACTOR_CHECKPOINT_NAME))
                torch.save(agents.critic_local.state_dict(),
                           "ckpt/{}".format(CRITIC_CHECKPOINT_NAME))
                break

    return scores
Example #5
def ddpg_train(n_episodes, seed, buffer_size, batch_size, gamma, tau, lr_actor,
               lr_critic, weight_decay):
    scores = []
    scores_deque = deque(maxlen=100)
    agent = Agent(n_agents, state_size, action_size, seed, buffer_size,
                  batch_size, gamma, tau, lr_actor, lr_critic, weight_decay)
    load(agent)
    for i_episode in range(n_episodes):
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations
        agent.reset()  # reset the agent noise
        score = np.zeros(n_agents)
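        # Interact with the environment until any agent reports that the episode is done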
        while True:
            actions = agent.act(states)
            # send the action to the environment
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get the next state
            rewards = env_info.rewards  # get the reward
            dones = env_info.local_done  # see if episode has finished
            agent.step(states, actions, rewards, next_states, dones)
            score += rewards  # update the score
            states = next_states  # roll over the state to next time step
            if np.any(dones):  # exit loop if episode finished
                break
        scores.append(np.mean(score))
        scores_deque.append(np.mean(score))
        print('\rEpisode: \t{} \tScore: \t{:.2f} \tAverage Score: \t{:.2f}'.
              format(i_episode, np.mean(score), np.mean(scores_deque)),
              end="")
        if i_episode % 10 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
        if np.mean(scores_deque) >= 30.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_deque)))
            break
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid()
    ax.plot(np.arange(len(scores)), scores)
    ax.set(xlabel="Episode #", ylabel="Score", title="DDPG Network")
    fig.savefig("ddpg_network.pdf")
Example #6
class Worker(mp.Process):
    def __init__(self, gnet, opt, global_ep, global_ep_r, res_queue, name):
        super(Worker, self).__init__()
        self.name = 'w{}'.format(name)
        self.g_ep, self.g_ep_r, self.res_queue = global_ep, global_ep_r, res_queue
        self.gnet, self.opt = gnet, opt
        self.agent = Agent(state_size, action_size, gnet['actor'], gnet['critic'],
                           opt['actor_optimizer'], opt['critic_optimizer'],
                           random_seed)  # local agent

        self.env = gym.make('LunarLanderContinuous-v2')

    def run(self):
        total_step = 1
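        # Keep collecting episodes until the shared global episode counter reaches MAX_EP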
        while self.g_ep.value < MAX_EP:
            state = self.env.reset()
            ep_r = 0.
            self.agent.reset()

            for t in range(MAX_EP_STEP):
                # if self.name == 'w1':
                #     self.env.render()

                action = self.agent.act(state)
                action = np.clip(action, -1, 1)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done, t)

                if t == MAX_EP_STEP - 1:
                    done = True

                ep_r += reward

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # time to sync

                    if done:  # done and print information
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue,
                               self.name)
                        break

                state = next_state
                total_step += 1
Example #7
def ddpg(n_episodes=1000, max_t=500, print_every=100):
    scores_deque = deque(maxlen=print_every)
    scores = []

    # Create the env and the agent
    terminating_angle = 15
    env = CubeEnv(np.deg2rad(terminating_angle))
    agent = Agent(state_size=3, action_size=1, random_seed=2)

    plotter = LivePlotter(env, max_t, terminating_angle, n_episodes)

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()
        score = 0
        done = False
        plotter.reset()
        while not done:
            # Select the next action and update system
            action = agent.act(state) * 10
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)

            # Update plots and metrics
            state = next_state
            score += reward
            plotter.add_data_from_env(env)

        scores_deque.append(score)
        scores.append(score)
        plotter.add_score(score)
        print('\rEpisode {}\tScore: {}'.format(i_episode, score), end="")

        # Display the plots
        plotter.display()

        # Save model
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

    return scores
Example #8
#  return scores

#scores = ddpg()
n_episodes = 15
max_t = 300
print_every = 100
scores_deque = deque(maxlen=print_every)
scores = []
for i_episode in range(1, n_episodes + 1):
    state = env.reset()
    agent.reset()
    score = 0
    for t in range(max_t):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward

        if done:
            break
    scores_deque.append(score)
    scores.append(score)

    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode,
                                                       np.mean(scores_deque)),
          end="")
    torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
    torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
    if i_episode % print_every == 0:
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)))
Example #9
def ddpg(n_episodes=500,
         max_t=1000,
         start_steps=10,
         learn_frequency=20,
         learn_count=10,
         random_seed=1):
    """Deep Deterministic Policy Gradient (DDPG)

    Params
    ======
        n_episodes (int)      : maximum number of training episodes
        max_t (int)           : maximum number of timesteps per episode
        start_steps (int)     : number of starting steps actions are chosen randomly
        learn_frequency (int) : frequency of learning per timestep
        learn_count (int)     : number of learning steps to do at learning timestep
        random_seed (int)     : random seed for agent's weights
    """

    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=random_seed)  #Initialize the Agent

    avg_scores_episode = []  # list containing scores from each episode
    avg_scores_moving = []  # list containing avg scores from window at each episode
    scores_window = deque(maxlen=100)  # last 100 scores

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset environment
        states = env_info.vector_observations  # get current state for each agent
        scores = np.zeros(num_agents)  # initialize score for each agent
        agent.reset()  # reset noise of the agent

        for t in range(max_t):
            #Randomly sample actions during the starting steps
            if i_episode <= start_steps:
                actions = np.random.randn(
                    num_agents, action_size)  # select an action randomly
                actions = np.clip(actions, -1,
                                  1)  # all actions between -1 and 1
            else:
                actions = agent.act(
                    states, add_noise=True
                )  # select an action according to policy (for each agent)
            env_info = env.step(actions)[
                brain_name]  # send actions to environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode has finished (for each agent)

            # for each agent's experience, save it and learn
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                if t % learn_frequency == 0:  # Learn with frequency
                    agent.step(state,
                               action,
                               reward,
                               next_state,
                               done,
                               learn=True,
                               learn_count=learn_count)
                else:
                    agent.step(state,
                               action,
                               reward,
                               next_state,
                               done,
                               learn=False)  #just add, don't learn

            states = next_states

            scores += rewards  # add the rewards from the timestep to the scores
            if np.any(dones):  # finish episode if any agent has reached a terminal state
                break

        scores_window.append(
            np.mean(scores))  # save the most recent score to scores window

        avg_scores_episode.append(
            np.mean(scores))  # save the most recent score to avg_scores
        avg_scores_moving.append(
            np.mean(scores_window)
        )  # save the most recent score window average to moving averages

        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 1 == 0:  # Print every episode
            print(
                '\rEpisode {}\tAverage Score: {:.2f} \t Current Score: {:.2f}'.
                format(i_episode, np.mean(scores_window), np.mean(scores)))

        #environment is solved
        if np.mean(scores_window) >= 30.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(),
                       "checkpoint_actor.pth")  #Save actors' weights
            torch.save(agent.critic_local.state_dict(),
                       "checkpoint_critic.pth")  #Save critics' weights
            break

    return avg_scores_episode, avg_scores_moving  # Return average score of each episode and moving average at that time
Example #10
def train(
    n_episodes,
    max_t,
    env_fp,
    no_graphics,
    seed,
    save_every_nth,
    buffer_size,
    batch_size,
    gamma,
    tau,
    lr_actor,
    lr_critic,
    weight_decay,
    log,
):
    log.info("#### Initializing environment...")
    # init environment
    env = UnityEnvironment(file_name=env_fp, no_graphics=no_graphics)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    log.info(f"Number of agents: {num_agents}")

    # size of each action
    action_size = brain.vector_action_space_size
    log.info(f"Size of each action: {action_size}")

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    log.info(
        f"There are {states.shape[0]} agents. Each observes a state with length: {state_size}"
    )
    log.info(f"The state for the first agent looks like: {states[0]}")

    agent = Agent(
        num_agents=len(env_info.agents),
        state_size=state_size,
        action_size=action_size,
        buffer_size=buffer_size,
        batch_size=batch_size,
        gamma=gamma,
        tau=tau,
        lr_actor=lr_actor,
        lr_critic=lr_critic,
        weight_decay=weight_decay,
        random_seed=seed,
    )

    log.info("#### Training...")

    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = np.zeros((len(env_info.agents), 1))
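        # Step for at most max_t timesteps; rewards and dones are reshaped into per-agent column vectors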
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            rewards = np.array(rewards).reshape((next_states.shape[0], 1))
            dones = env_info.local_done
            dones = np.array(dones).reshape((next_states.shape[0], 1))
            agent.step(states, actions, rewards, next_states, dones)
            score += rewards
            states = next_states
            if np.any(dones):
                break
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        print(
            "Episode {}\tAverage Score: {:.2f}\tScore: {:.2f}".format(
                i_episode, np.mean(scores_deque), scores[-1]),
            end="\r",
        )

        if i_episode % 100 == 0:
            print("\rEpisode {}\tAverage Score: {:.2f}".format(
                i_episode, np.mean(scores_deque)))
        if i_episode % save_every_nth == 0:
            save_checkpoint(
                state={
                    "episode": i_episode,
                    "actor_state_dict": agent.actor_local.state_dict(),
                    "critic_state_dict": agent.critic_local.state_dict(),
                    "scores_deque": scores_deque,
                    "scores": scores,
                },
                filename="checkpoint.pth",
            )
            plot_scores(
                scores=scores,
                title=f"Avg score over {len(env_info.agents)} agents",
                fname="avg_scores.png",
                savefig=True,
            )

        if np.mean(scores_deque) >= 30:
            torch.save(agent.actor_local.state_dict(), "checkpoint_actor.pth")
            torch.save(agent.critic_local.state_dict(),
                       "checkpoint_critic.pth")
            print(
                "\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}"
                .format(i_episode - 100, np.mean(scores_deque)))
            break
for episode in range(episodes):
    # Reset the environment
    cur_state = env.reset(seed=episode)

    score = 0

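    # Run the episode for a fixed number of steps, breaking early if the environment signals done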
    for i in range(iterationss + 1):

        # Predict the best action for the current state.
        action = agent.act(cur_state, add_noise=True)

        # Action is performed and new state, reward, info are received.
        new_state, reward, done, info = env.step(action)
        print("episode: ", episode, " sample: ", i, " reward: ", reward)
        # current state, action, reward, new state are stored in the experience replay
        agent.step(cur_state, action, reward, new_state, done)

        # roll over new state
        cur_state = new_state

        score += reward
        if done:
            break

    scores_deque.append(score)
    scores.append(score)
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode,
                                                       np.mean(scores_deque)),
          end="")
    torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
    torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
# Train until environment ends the episode
while True:
    for env_agent_idx in range(num_agents):
        # Let deep learning agent act based on states
        actions[env_agent_idx] = agent.act(states[env_agent_idx])
    env_info = env.step(actions)[brain_name]
    next_states = env_info.vector_observations
    rewards = env_info.rewards
    dones = env_info.local_done
    for env_agent_idx in range(num_agents):
        # Save to replay buffer
        agent.memorize(states[env_agent_idx], actions[env_agent_idx],
                       rewards[env_agent_idx], next_states[env_agent_idx],
                       dones[env_agent_idx])
    # Learn
    agent.step()
    states = next_states
    score += np.sum(rewards) / len(rewards)
    if np.any(dones):
        break
# Check and track scores
scores_deque.append(score)
scores.append(score)
average_score = np.mean(scores_deque)
print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
    i_episode, average_score, score),
      end="")
if i_episode % print_every == 0:
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
        i_episode, average_score))
# Save coefficients to file if environment is solved with current network coefficients
Example #13
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    agent_1.reset()
    agent_2.reset()
    for t in range(t_max):
        actions_1 = agent_1.act(np.expand_dims(states[0], 0), True)
        actions_2 = agent_2.act(np.expand_dims(states[1], 0), True)
        # actions_1 = np.clip(actions_1, -1, 1)             # all actions between -1 and 1
        actions = np.concatenate((actions_1, actions_2))
        env_info = env.step(actions)[
            brain_name]  # send all actions to the environment

        next_states, rewards, dones = env_info.vector_observations, env_info.rewards, env_info.local_done
        # for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
        agent_1.step(np.expand_dims(states[0], 0), actions_1, rewards[0],
                     np.expand_dims(next_states[0], 0), dones[0], t)
        agent_2.step(np.expand_dims(states[1], 0), actions_2, rewards[1],
                     np.expand_dims(next_states[1], 0), dones[1], t)

        scores += rewards  # update the score (for each agent)
        states = next_states  # roll over states to next time step
        if np.any(dones):  # exit loop if episode finished
            break

    score.append(np.max(scores))
    ev_score.append(np.mean(scores_deque))
    scores_deque.append(np.max(scores))
    print('Score (max over agents) from episode {}: {:.5f}'.format(
        i_episode, np.max(scores)),
          end='\r')
    if i_episode % print_every == 0 or np.mean(scores_deque) > 0.5:
            # Predict the best action for the current state.
            cur_state1 = np.delete(cur_state, 8)
            cur_state2 = np.delete(cur_state, 7)
            # print(cur_state[5:])
            action1 = agent1.act(cur_state1, add_noise=True)
            action2 = agent2.act(cur_state2, add_noise=True)
            # print(action1,action2)
            # Action is performed and new state, reward, info are received.
            new_state, reward1, reward2, done1, done2, info = env.step(
                action1, action2)

            # current state, action, reward, new state are stored in the experience replay
            new_state1 = np.delete(new_state, 8)
            new_state2 = np.delete(new_state, 7)
            agent1.step(cur_state1, action1, reward1, new_state1, done1)
            agent2.step(cur_state2, action2, reward2, new_state2, done2)
            # roll over new state
            cur_state = new_state

            if info.done1 and info.done2:
                shortfall_hist1 = np.append(shortfall_hist1,
                                            info.implementation_shortfall1)
                shortfall_deque1.append(info.implementation_shortfall1)

                shortfall_hist2 = np.append(shortfall_hist2,
                                            info.implementation_shortfall2)
                shortfall_deque2.append(info.implementation_shortfall2)
                break

        if (episode + 1
Example #15
def ddpg(model_number,
         UPD,
         BUFFER_SIZE,
         BATCH_SIZE,
         LR_ACTOR,
         LR_CRITIC,
         fc1_units,
         fc2_units,
         a_gradient_clipping,
         a_leaky,
         a_dropout,
         c_gradient_clipping,
         c_batch_norm,
         c_leaky,
         c_dropout,
         n_episodes=400,
         max_t=2000,
         print_every=100):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        ...
    """

    agent = Agent(state_size, action_size, UPD, BUFFER_SIZE, BATCH_SIZE,
                  LR_ACTOR, LR_CRITIC, fc1_units, fc2_units,
                  a_gradient_clipping, a_leaky, a_dropout, c_gradient_clipping,
                  c_batch_norm, c_leaky, c_dropout, 0, 12345)
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=print_every)  # last 100 scores

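    # Train, appending each episode's hyperparameters and score to results.txt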
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, a_dropout, a_leaky)

            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward

            if done:
                break

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score

        with open('results.txt', 'a') as output:
            output.writelines(\
            '{}, {}, {:.2f}, {:.2f}, {}, {}, {}, {:.4f}, {:.4f}, {}, {}, {}, {}, {}, {}, {}, {}, {} \n'.format(
            model_number, i_episode, np.mean(scores_window), score,
            UPD, BUFFER_SIZE, BATCH_SIZE,
            LR_ACTOR, LR_CRITIC,
            fc1_units, fc2_units,
            a_gradient_clipping, a_leaky, a_dropout,
            c_gradient_clipping, c_batch_norm, c_leaky, c_dropout))
            output.flush()

        print('\rModel nr: {}, Episode {}, avg. score: {:.2f}, score: {:.2f}'.format\
              (model_number, i_episode, np.mean(scores_window), score), end="")
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
                i_episode, np.mean(scores_window), score))
        if np.mean(scores_window) >= 30.0:
            with open('./models/models_solved.txt', 'a') as solved:
                solved.writelines('{}, {} \n'.format(model_number, i_episode))
                solved.flush()
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_window)))
            torch.save(
                agent.actor_local.state_dict(),
                './models/checkpoint_actor_' + str(model_number) + '.pth')
            torch.save(
                agent.critic_local.state_dict(),
                './models/checkpoint_critic_' + str(model_number) + '.pth')
            break
    return scores
Example #16
scores_deque = deque(maxlen=print_every)
scores_final = []
agent = Agent(state_size, action_size, num_agents, random_seed=2)
# ----------------------- training the agents ----------------------- #
for i_episode in range(n_episodes):
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    while True:
        actions = agent.act(states)  # select an action (for each agent)
        env_info = env.step(actions)[
            brain_name]  # send all actions to the environment
        next_states = env_info.vector_observations  # get next state (for each agent) next_states shape:(2,24)
        rewards = env_info.rewards  # get reward (for each agent)
        dones = env_info.local_done  # see if episode finished
        agent.step(states, actions, rewards, next_states, dones)
        scores += env_info.rewards  # update the score (for each agent)
        states = next_states  # roll over states to next time step
        if np.any(dones):  # exit loop if episode finished
            break
    scores_deque.append(max(scores))  # record the episode score once the episode has ended
    scores_final.append(scores)
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode,
                                                       np.mean(scores)),
          end="")
    if i_episode % 100 == 0:
        print('Total score (averaged over agents) this episode: {}'.format(
            np.mean(scores_deque)))
    if np.mean(scores_deque) > 0.5:
        torch.save(agent.actor_local.state_dict(), './checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), './checkpoint_critic.pth')
agents = Agent(state_size=state_size,
               action_size=action_size,
               num_agents=num_agents,
               random_seed=0)
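# Restore previously trained actor and critic weights from the saved checkpoints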
agents.actor_local.load_state_dict(
    torch.load("ckpt/{}".format(ACTOR_CHECKPOINT_NAME)))
agents.critic_local.load_state_dict(
    torch.load("ckpt/{}".format(CRITIC_CHECKPOINT_NAME)))

for i_episode in range(1, n_episodes + 1):
    print('Starting episode {}'.format(i_episode))
    env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
    state = env_info.vector_observations
    agents.reset()
    score = np.zeros(num_agents)
    while True:
        action = agents.act(state)

        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done

        agents.step(state, action, rewards, next_state, dones)
        state = next_state
        score += rewards

        if np.any(dones):
            print('Score: {}'.format(np.mean(score)))
            break
def ddpg(n_episodes=500, max_t=200, train_mode=True):
    env = UnityEnvironment(file_name='./1_agent/Reacher.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=train_mode)[brain_name]

    states = env_info.vector_observations

    agent = Agent(state_size=states.shape[1],
                  action_size=action_size,
                  random_seed=2)

    brain_name = env.brain_names[0]
    scores = []
    scores_deque = deque(maxlen=100)
    max_score = -np.Inf

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        num_agents = len(env_info.agents)
        #         agent.reset()
        score = 0
        states = env_info.vector_observations
        #         while True:
        for t in range(max_t):
            agent.reset()
            actions = agent.act(states)
            #             actions = np.clip(actions, -1,1)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            #             rewards = [1.0  if x > 0.0 else 0.0 for x in rewards]
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += np.mean(env_info.rewards)
            if np.any(dones):
                break
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
            i_episode, np.mean(scores_deque), score),
              end="")

        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
                i_episode, np.mean(scores_deque), score),
                  end="")
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))

        if np.mean(scores_deque) >= 30.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_deque)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
            break

    env.close()
    return scores
Example #19
total_episodes = []
for i_episode in tqdm(range(1, n_episodes + 1)):
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    t = 1
    states = env_info.vector_observations  # get the current state
    agent.reset()  # Reset noise with different inertia
    scores = np.zeros(num_agents)
    last_non_zeros_in_batch = 0
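    # Play one episode, passing the step index and episode number so the agent can schedule its updates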
    while True:
        actions = agent.act(states)
        env_info = env.step(actions)[
            brain_name]  # send all actions to the environment
        rewards = env_info.rewards  # get reward (for each agent)
        next_states = env_info.vector_observations  # get next state (for each agent)
        dones = env_info.local_done  # see if episode finished
        agent.step(states, actions, rewards, next_states, dones, t, i_episode)
        scores += rewards
        states = next_states

        if any(dones):
            break

        t += 1
    scores_deque.append(np.max(scores))
    scores_by_episode.append(np.max(scores))

    total_episodes.append(i_episode)

    if i_episode % print_every == 0:
        print(
            '\rEpisode {}\tRolling Average: {:.4f}\tScore: {:.2f}\tsteps: {}\t'
Example #20
def train_ddpg(dev,
               weights_file_actor,
               weights_file_critic,
               n_episodes=1000,
               max_t=1000):
    """DDPG Learning.

    Params
    ======
        dev (string): cpu or gpu
        weights_file_actor (string): name of the file to save the weights of the actor
        weights_file_critic (string): name of the file to save the weights of the critic
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    scores = []  # list containing scores from each episode (average of all the agents)
    averages = []  # averages of the scores; position i (1-indexed) has the average of the last min(i, 100) episodes
    scores_window = deque(maxlen=100)  # last 100 averaged scores for all the agents
    env = UnityEnvironment(file_name='./Tennis_Linux/Tennis.x86_64')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]
    action_size = brain.vector_action_space_size
    agent = Agent(state_size, action_size, random_seed=0, device=dev)

    print('Number of agents: {:d}'.format(num_agents))
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()  # reset noise for the actions
        states = env_info.vector_observations
        current_scores = np.zeros(
            num_agents)  # initialize the score for all the agents
        for t in range(max_t):
            actions = agent.act(states)  # process the states of all the agents

            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            current_scores += rewards
            if np.any(dones):
                break
        max_score = np.max(
            current_scores)  # current maximum score of all the agents
        scores.append(max_score)
        scores_window.append(max_score)
        averages.append(np.mean(scores_window))
        if (i_episode % 100 != 0):
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, averages[i_episode - 1]),
                  end="")
        else:
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, averages[i_episode - 1]))
        if (averages[i_episode - 1] >= 0.5):
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'
                .format(i_episode - 100, averages[i_episode - 1]))
            torch.save(agent.actor_local.state_dict(), weights_file_actor)
            torch.save(agent.critic_local.state_dict(), weights_file_critic)
            break

    env.close()
    return scores, averages
def main():
    env = UnityEnvironment(file_name='Reacher.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=3)

    scores_deque = deque(maxlen=100)
    scores = []

    for i_episode in range(1, 1000):
        begin = time.time()
        curr_scores = np.zeros(
            num_agents)  # initialize the score (for each agent)
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)

        agent.reset()

        for t in range(1000):
            actions = agent.act(states)
            env_info = env.step(actions)[
                brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished

            agent.step(states, actions, rewards, next_states, dones, t)

            states = next_states
            curr_scores += rewards

            if np.any(dones):
                break

        curr_score = np.mean(curr_scores)
        scores_deque.append(curr_score)
        average_score = np.mean(scores_deque)
        scores.append(curr_score)

        print(
            '\rEpisode {}\tTime: {:.2f}\tAvg: {:.2f}\tScore: {:.2f}\tMin {:.2f}\tMax {:.2f}'
            .format(i_episode,
                    time.time() - begin, average_score, curr_score,
                    min(curr_scores), max(curr_scores)))
        if i_episode % 10 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
        if average_score >= 30.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, average_score))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
            break

    env.close()

    return
Example #22
def train(config, n_episodes=1000, base_port=5005, save_path=None, name=None):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    writer = SummaryWriter(comment=name)
    env = UnityEnvironment(
        file_name="Reacher_Linux_NoVis/Reacher.x86_64",
        no_graphics=True,
        base_port=base_port)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

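    # Trace the actor network once with a dummy input so its graph shows up in TensorBoard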
    dummy_input = (torch.zeros(1, config.num_agents, config.state_size),)
    agent = Agent(config)
    writer.add_graph(agent.actor_local, dummy_input, True)
    #writer.add_graph(agent.critic_local, dummy_input, True)
    
    num_agents = config.num_agents

    # reset
    env_info = env.reset(train_mode=True)[brain_name]
    
    episode_scores = []                        # list containing scores from each episode
    episode_scores_window = deque(maxlen=100)  # last 100 scores
    
    with trange(n_episodes, desc='episode') as episode_bar:
        for episode in episode_bar:
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations                  # get the current state (for each agent)
            scores = np.zeros(num_agents)                          # initialize the score (for each agent)

            while True:
                actions = agent.act(states)                        # select an action (for each agent)
                env_info = env.step(actions)[brain_name]           # send all actions to the environment
                next_states = env_info.vector_observations         # get next state (for each agent)
                rewards = env_info.rewards                         # get reward (for each agent)
                dones = env_info.local_done                        # see if episode finished
                agent.step(states, actions, rewards, next_states, dones, writer=writer) # learn
                scores += env_info.rewards                         # update the score (for each agent)
                states = next_states                               # roll over states to next time step
                if np.any(dones):                                  # exit loop if episode finished
                    break

            episode_scores_window.append(np.mean(scores))       # save most recent score
            episode_scores.append(np.mean(scores))              # save most recent score
            episode_bar.set_postfix(avg_score=np.mean(episode_scores_window))      
            writer.add_scalar('data/score', np.mean(scores), episode)

        results = pd.Series(episode_scores, name=name)
        if save_path:
            torch.save(agent.actor_local.state_dict(), os.path.join(save_path,'checkpoint_actor.pth'))
            torch.save(agent.critic_local.state_dict(), os.path.join(save_path, 'checkpoint_critic.pth'))
            results.to_csv(os.path.join(save_path,'results.csv'))    
        env.close()
        writer.close()
        return results, agent
Example #23
def training():
    # config parameters

    number_of_episodes = 4000
    episode_length = 1000

    random_seed = 4  #np.random.randint(10000)

    # create env, get essential env info
    env, brain_name, num_agents, action_size, state_size = create_env()

    agent_reward = [[] for _ in range(num_agents)]
    agent_reward_deque = [deque(maxlen=100) for _ in range(num_agents)]
    score_full = []
    score_deque = deque(maxlen=100)

    # create ddpg agent for self play
    agents = Agent(state_size, action_size, random_seed, num_agents)

    for i_episode in range(1, number_of_episodes + 1):
        # reset the environment and get initial observation
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        # reshape states, assume each agent can see global condition
        #states = np.reshape(states,(1,-1))

        # reset ddpg agents
        #for agent in agents:
        #    agent.reset()
        agents.reset()
        episode_scores = np.zeros(num_agents)

        for t in range(episode_length):
            actions = []
            #for ii in range(num_agents):
            #    actions.append(agents.act(states[ii]))
            actions = agents.act(states)
            env_actions = actions  #np.reshape(np.array(actions),(1,-1))

            # play one step
            env_info = env.step(env_actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            episode_scores += rewards

            # store transition, learn if necessary
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                agents.step(state, action, reward, next_state, done, t)

            states = next_states

            if np.any(dones):
                break

        for i in range(num_agents):
            agent_reward[i].append(episode_scores[i])
            agent_reward_deque[i].append(episode_scores[i])

        score_full.append(max(episode_scores))
        score_deque.append(max(episode_scores))

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(score_deque)))

        if np.mean(score_deque) >= 0.5:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(score_deque)))
            #for i in range(num_agents):
            #    torch.save(agents[i].actor_local.state_dict(), 'checkpoint_actor'+str(i) +'.pth')
            #    torch.save(agents[i].critic_local.state_dict(), 'checkpoint_critic'+str(i)+'.pth')
            torch.save(agents.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agents.critic_local.state_dict(),
                       'checkpoint_critic.pth')
            break
    env.close()
    return agents, agent_reward, score_full, random_seed
Example #24
        # determine actions for the unity agents from current state, using noise for exploration
        actions_1 = agent_1.act(states, add_noise=True)
        actions_2 = agent_2.act(states, add_noise=True)

        # send the actions to the unity agents in the environment and receive resultant environment information
        actions = np.concatenate((actions_1, actions_2), axis=0)
        actions = np.reshape(actions, (1, 4))
        env_info = env.step(actions)[brain_name]

        next_states = env_info.vector_observations  # get the next states for each unity agent in the environment
        next_states = np.reshape(next_states, (1, 48))
        rewards = env_info.rewards  # get the rewards for each unity agent in the environment
        dones = env_info.local_done  # see if episode has finished for each unity agent in the environment

        #Send (S, A, R, S') info to the training agent for replay buffer (memory) and network updates
        agent_1.step(states, actions_1, rewards[0], next_states, dones[0])
        agent_2.step(states, actions_2, rewards[1], next_states, dones[1])

        # set new states to current states for determining next actions
        states = next_states
        #print(states)
        # Update episode score for each unity agent
        agent_scores += rewards

        # If any unity agent indicates that the episode is done,
        # then exit episode loop, to begin new episode
        if np.any(dones):
            break

    # Add episode score to Scores and...
    # Calculate mean score over last 100 episodes
Example #25
for ep in range(0, episodes):
	env_info = env.reset(train_mode=True)[brain_name]
	states = env_info.vector_observations

	# state = env.reset()
	agent.reset()  # Resets the noise in the agent
	scores = np.zeros(num_agents)
	# Step through time steps and learn the actor and critic
	for t in range(max_time):

		actions = agent.act(states)									# Get actions from policy (for each agent)
		env_info = env.step(actions)[brain_name]           			# Perform actions in environment
		next_states = env_info.vector_observations					# get next state (for each agent)
		rewards = env_info.rewards                         			# get reward (for each agent)
		dones = env_info.local_done									# get dones (for each agent)
		agent.step(states, actions, rewards, next_states, dones)	# Add experience to buffer
		states = next_states										# Reset states
		scores += env_info.rewards									# Accumulate rewards
		if np.any(dones):
			break 

	scores_deque.append(np.mean(scores))
	all_scores.append(np.mean(scores))  # Add to total list of scores

	# Print results as they are computed
	mn_score = np.mean(scores_deque)
	print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(ep+1, mn_score, np.mean(scores)), end="")
	if (ep + 1) % 100 == 0 or mn_score > max_score:
		torch.save({'local': agent.actor_local.state_dict(),
					'target': agent.actor_target.state_dict(),
					'opt': agent.actor_optimizer.state_dict()}, 'cc_actor.pth')
def multi_agent_ddpg(env, brain_name, title, n_episodes, action_size,
                     state_size, num_agents, print_every, n_updates,
                     update_intervals, device):

    # create save dir for this experiment
    if title is None:
        title = "experiment"
    current_time = strftime("%Y-%m-%d_%H:%M:%S", gmtime())
    title = title + "_" + current_time

    # write a new file
    os.makedirs("experiments/{}".format(title), exist_ok=True)
    f = open("experiments/{}/scores.txt".format(title), "w")
    f.close()

    all_agents_statesize = state_size * num_agents

    agent1 = Agent(state_size=all_agents_statesize,
                   action_size=action_size,
                   num_agents=1,
                   random_seed=123,
                   device=device)
    agent2 = Agent(state_size=all_agents_statesize,
                   action_size=action_size,
                   num_agents=1,
                   random_seed=123,
                   device=device)

    scores_deque = deque(maxlen=100)
    mean_scores = []

    for i_episode in range(1, n_episodes + 1):

        env_info = env.reset(train_mode=True)[brain_name]

        states = env_info.vector_observations
        states = np.reshape(
            states,
            (1, all_agents_statesize
             ))  # reshape so we can feed both agents states to each agent

        # reset
        agent1.reset()
        agent2.reset()

        # place to store scores
        agent_scores = np.zeros(num_agents)
        t = 0
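        # Both agents act on the concatenated observation of the pair and learn from their own reward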
        while True:
            # two agents actions
            actions_1 = agent1.act(states, add_noise=True)
            actions_2 = agent2.act(states, add_noise=True)

            # step environment for two agents and get next states
            actions = np.concatenate((actions_1, actions_2), axis=0)
            actions = np.reshape(actions, (1, 4))
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            next_states = np.reshape(next_states, (1, all_agents_statesize))
            rewards = env_info.rewards
            dones = env_info.local_done

            # update the agents accordingly (ddpg)
            agent1.step(states, actions_1, rewards[0], next_states, dones[0],
                        n_updates, update_intervals, t)
            agent2.step(states, actions_2, rewards[1], next_states, dones[1],
                        n_updates, update_intervals, t)

            states = next_states
            agent_scores += rewards

            if np.any(dones):
                break
            t += 1

        scores_deque.append(np.max(agent_scores))
        print('\rEpisode {}\tLast 100 average Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)),
              end="")

        # save score and model every print_every
        if i_episode % print_every == 0:
            f = open("experiments/{}/scores.txt".format(title), "a")
            f.write("{},{}\n".format(i_episode, np.mean(scores_deque)))
            f.close()
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
            mean_scores.append(np.mean(scores_deque))
            # save if best model
            if np.mean(scores_deque) == max(mean_scores):
                torch.save(
                    agent1.actor_local.state_dict(),
                    'experiments/{}/checkpoint_actor1.pth'.format(title))
                torch.save(
                    agent1.critic_local.state_dict(),
                    'experiments/{}/checkpoint_critic1.pth'.format(title))
                torch.save(
                    agent2.actor_local.state_dict(),
                    'experiments/{}/checkpoint_actor2.pth'.format(title))
                torch.save(
                    agent2.critic_local.state_dict(),
                    'experiments/{}/checkpoint_critic2.pth'.format(title))

            if np.mean(scores_deque) >= 1.0 and i_episode > 100:
                print("\rEnvironment solved with average score of 30")
                break
Example #27
def ddpg(n_episodes=500, max_t=200, train_mode=True):
    env = UnityEnvironment(file_name='./env/Tennis.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=train_mode)[brain_name]

    states = env_info.vector_observations

    agent = Agent(state_size=states.shape[1],
                  action_size=action_size,
                  random_seed=2)

    brain_name = env.brain_names[0]
    scores = []
    scores_deque = deque(maxlen=100)
    scores_mean = []
    max_score = -np.Inf

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        num_agents = len(env_info.agents)
        #         agent.reset()
        score = np.zeros((2, ))
        states = env_info.vector_observations
        for t in range(max_t):
            agent.reset()
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += env_info.rewards
            if np.any(dones):
                break
        scores_deque.append(np.max(score))
        scores_mean.append(np.mean(scores_deque))
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {}'.format(
            i_episode, np.mean(scores_deque), score),
              end="")

        if np.max(score) > max_score:
            torch.save(agent.actor_local.state_dict(),
                       'checkpoint_actor_best.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic_best.pth')
            print('\rSaving Weights for max score old: {} -> new: {} '.format(
                max_score, np.max(score)))
            max_score = np.max(score)

        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {}'.format(
                i_episode, np.mean(scores_deque), score),
                  end="")
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))


#         if np.mean(scores_deque)>=0.5:
#             print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
#             torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
#             torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
#             break

    env.close()
    return scores, scores_mean