class DdpgInfer():
    def __init__(self):
        self.env = gym.make('DeeplengDocking-v1')
        rospy.loginfo("Gym environment done")
        self.agent = Agent(state_size=13, action_size=3, random_seed=2)
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        self.outdir = pkg_path + '/training_results'
        self.agent.actor_local.load_state_dict(
            torch.load(self.outdir + '/checkpoint_actor.pth'))
        self.agent.critic_local.load_state_dict(
            torch.load(self.outdir + '/checkpoint_critic.pth'))

    def __call__(self, *args, **kwargs):

        state = self.env.reset()
        for t in range(500):
            action = self.agent.act(state, add_noise=False)
            # env.render()
            state, reward, done, _ = self.env.step(action)
            print("state:", state)
            print("Reward: ", reward)
            if done:
                break

        self.env.close()
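A minimal entry-point sketch for the class above (the ROS node name and the `__main__` guard are assumptions; the original listing does not show how `DdpgInfer` is launched):

if __name__ == '__main__':
    rospy.init_node('ddpg_infer', anonymous=True)  # assumed node name
    infer = DdpgInfer()
    infer()  # runs one evaluation episode with exploration noise disabled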
Example #2
def train(episodes=700, max_t=1000):
    try:
        agent = Agent(state_size=33, action_size=4, seed=0)
        scores = []
        scores_window = deque(maxlen=100)
        config = Config()
        eps = config.EPS_START

        for i_episode in range(1, episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations
            agent.reset()
            score = np.zeros(1)
            for _ in range(max_t + 1):
                action = agent.act(state, eps)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations  # get the next state
                reward = env_info.rewards  # get the reward
                done = env_info.local_done  # see if episode has finished
                score += env_info.rewards
                agent.step(state, action, reward, next_state, done)
                state = next_state

                eps = eps - config.LIN_EPS_DECAY
                eps = np.maximum(eps, config.EPS_END)

                if np.any(done):
                    break

            scores_window.append(score)
            scores.append(score)

            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 2 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))

            mean = np.mean(scores_window)
            if 30.0 < mean <= 31.5:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))
                torch.save(agent.actor_local.state_dict(),
                           'solved_actor_trained_model.pth')
                torch.save(agent.critic_local.state_dict(),
                           'solved_critic_trained_model.pth')

        torch.save(agent.actor_local.state_dict(), 'actor_trained_model.pth')
        torch.save(agent.critic_local.state_dict(), 'critic_trained_model.pth')
        return scores

    except KeyboardInterrupt:
        torch.save(agent.actor_local.state_dict(),
                   'interrupt_actor_trained_model.pth')
        torch.save(agent.critic_local.state_dict(),
                   'interrupt_critic_trained_model.pth')
        plot_score_chart(scores)
        sys.exit(0)
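A short invocation sketch for `train()` (assuming `env`, `brain_name` and `plot_score_chart` are defined at module level, as the surrounding snippets imply):

if __name__ == '__main__':
    scores = train(episodes=700, max_t=1000)
    plot_score_chart(scores)  # same helper train() calls on KeyboardInterrupt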
def play(env):
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations

    agent = Agent(state_size=states.shape[1],
                  action_size=action_size,
                  random_seed=2)

    agent.reset()

    agent.actor_local.load_state_dict(torch.load(ACTOR_WEIGHTS))
    agent.critic_local.load_state_dict(torch.load(CRITIC_WEIGHTS))
    scores = []
    score = np.zeros((2, ))
    while True:
        agent.reset()
        actions = agent.act(states)

        env_info = env.step(actions)[brain_name]
        states = env_info.vector_observations
        score += np.array(env_info.rewards)
        dones = env_info.local_done
        if np.sum(dones) > 0:
            break

    print('Scores: {}'.format(score))
Example #4
def ddpg_dual(n_episodes=5000, max_t=2000, solved_at=0.5):

    sharedActor = Agent(state_size=state_size,
                        action_size=action_size,
                        random_seed=2)

    avg_score = []
    scores_deque = deque(maxlen=100)

    best_score = 0.0
    env_solved = False

    for i_episode in range(1, n_episodes + 1):
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations  # get the current state (for each agent)

        sharedActor.reset()

        for t in range(max_t):
            actions = sharedActor.act(states)

            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished

            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                sharedActor.step(state, action, reward, next_state, done, t)

            states = next_states
            scores += rewards  # update the score (for each agent)

            if np.any(dones):  # exit loop if episode finished
                break

        score = np.max(scores)
        avg_score.append(score)
        scores_deque.append(score)

        print('\rEpisode:{} \tScore:{:.3f} \tAverage Score: {:.3f} solved:{}'.
              format(i_episode, score, np.mean(scores_deque), env_solved),
              end="")
        if i_episode % 10 == 0:
            print("\n")

        if score > best_score and np.mean(scores_deque) >= solved_at:
            if env_solved == False:
                env_solved = True
                print(
                    '\nEnv solved in {:.3f} episodes!\tAverage Score ={:.3f} over last {} Episodes'
                    .format(i_episode - 100, np.mean(scores_deque), 100))
            torch.save(sharedActor.actor_local.state_dict(), "actor.pth")
            torch.save(sharedActor.critic_local.state_dict(), "critic.pth")
            best_score = score
            break

    return avg_score
class DdpgDeepleng():
    def __init__(self):
        # Create the Gym environment
        self.env = gym.make('DeeplengDocking-v1')
        rospy.loginfo("Gym environment done")
        self.agent = Agent(state_size=13, action_size=3, random_seed=2)

        # Set the logging system
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/training_results'
        # env = wrappers.Monitor(env, outdir, force=True)
        # rospy.loginfo("Monitor Wrapper started")
        self.max_episodes = 200
        self.max_timesteps = 1000

    def __call__(self, *args, **kwargs):
        scores = []
        for episode in range(1, self.max_episodes + 1):
            state = self.env.reset()
            self.agent.reset()
            score = 0
            print(
                "=========================================================================="
            )
            print("Episode no. {}".format(episode))
            print(
                "=========================================================================="
            )
            for stp in range(1, self.max_timesteps + 1):
                # print("___________________________________________________________________________")
                print("Step no. {}".format(stp))
                # print("Current state: {}".format([round(elem, 2) for elem in state]))
                print("Current state: {}".format(state))
                action = self.agent.act(np.array(state))
                print("Action taken: {}".format(action))
                next_state, reward, done, _ = self.env.step(action)
                print("Reward for action: {}".format(reward))
                print("Next state: {}".format(next_state))
                self.agent.step(state, action, reward, next_state, done)
                state = np.array(next_state)
                score += reward
                if done:
                    # print("Done")
                    break
                print(
                    "___________________________________________________________________________"
                )
            scores.append(score)
            torch.save(
                self.agent.actor_local.state_dict(),
                '/home/dfki.uni-bremen.de/mpatil/Desktop/checkpoint_actor.pth')
            torch.save(
                self.agent.critic_local.state_dict(),
                '/home/dfki.uni-bremen.de/mpatil/Desktop/checkpoint_critic.pth'
            )
        self.env.close()
        return scores
def ddpg(n_episodes=2000, store_every=10):
    scores_deque = deque(maxlen=store_every)
    scores = []

    agents = Agent(state_size=state_size,
                   action_size=action_size,
                   num_agents=num_agents,
                   random_seed=0)

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
        state = env_info.vector_observations
        agents.reset()
        score = np.zeros(num_agents)
        while True:
            action = agents.act(state)

            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            agents.step(state, action, rewards, next_state, dones)
            state = next_state
            score += rewards

            if np.any(dones):
                break
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        avg_score = np.mean(scores_deque)

        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}\t {}'.format(
            i_episode, np.mean(scores_deque), np.mean(score),
            strftime("%H:%M:%S", gmtime())),
              end="")
        if i_episode % store_every == 0 or avg_score >= TARGET_SCORE:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, avg_score))

            if avg_score >= TARGET_SCORE:
                torch.save(agents.actor_local.state_dict(),
                           "ckpt/{}".format(ACTOR_CHECKPOINT_NAME))
                torch.save(agents.critic_local.state_dict(),
                           "ckpt/{}".format(CRITIC_CHECKPOINT_NAME))
                break

    return scores
Example #7
def ddpg_train(n_episodes, seed, buffer_size, batch_size, gamma, tau, lr_actor,
               lr_critic, weight_decay):
    scores = []
    scores_deque = deque(maxlen=100)
    agent = Agent(n_agents, state_size, action_size, seed, buffer_size,
                  batch_size, gamma, tau, lr_actor, lr_critic, weight_decay)
    load(agent)
    for i_episode in range(n_episodes):
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations
        agent.reset()  # reset the agent noise
        score = np.zeros(n_agents)
        while True:
            actions = agent.act(states)
            # send the action to the environment
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get the next state
            rewards = env_info.rewards  # get the reward
            dones = env_info.local_done  # see if episode has finished
            agent.step(states, actions, rewards, next_states, dones)
            score += rewards  # update the score
            states = next_states  # roll over the state to next time step
            if np.any(dones):  # exit loop if episode finished
                break
        scores.append(np.mean(score))
        scores_deque.append(np.mean(score))
        print('\rEpisode: \t{} \tScore: \t{:.2f} \tAverage Score: \t{:.2f}'.
              format(i_episode, np.mean(score), np.mean(scores_deque)),
              end="")
        if i_episode % 10 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
        if np.mean(scores_deque) >= 30.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_deque)))
            break
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid()
    ax.plot(np.arange(len(scores)), scores)
    ax.set(xlabel="Episode #", ylabel="Score", title="DDPG Network")
    fig.savefig("ddpg_network.pdf")
Example #8
def main():
    env = UnityEnvironment(file_name='data/Reacher_Linux/Reacher.x86_64')
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents
    num_agents = len(env_info.agents)
    # size of each action
    action_size = brain.vector_action_space_size
    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    n_agent = 20
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=2,
                  n_agent=n_agent)
    # load trained model
    agent.actor_local.load_state_dict(torch.load('model/checkpoint_actor.pth'))
    agent.critic_local.load_state_dict(
        torch.load('model/checkpoint_critic.pth'))

    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    for t in range(1000):
        action = [
            agent.act(state[agent_x], agent_x, add_noise=False)
            for agent_x in range(n_agent)
        ]
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations
        reward = env_info.rewards
        done = env_info.local_done
        state = next_state
        if all(done):
            break

    env.close()
Example #9
def trained_agent():
    agent = Agent(n_agents, state_size, action_size, 0, 0, 0, 0, 0, 0, 0, 0)
    load(agent)
    for episode in range(3):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        score = np.zeros(n_agents)
        while True:
            actions = agent.act(states, add_noise=False)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            score += rewards
            states = next_states
            if np.any(dones):
                break
        print('Episode: \t{} \tScore: \t{:.2f}'.format(episode,
                                                       np.mean(score)))
    env.close()
Example #10
class Worker(mp.Process):
    def __init__(self, gnet, opt, global_ep, global_ep_r, res_queue, name):
        super(Worker, self).__init__()
        self.name = 'w{}'.format(name)
        self.g_ep, self.g_ep_r, self.res_queue = global_ep, global_ep_r, res_queue
        self.gnet, self.opt = gnet, opt
        self.agent = Agent(state_size, action_size, gnet['actor'], gnet['critic'],
                           opt['actor_optimizer'], opt['critic_optimizer'],
                           random_seed)  # local agent

        self.env = gym.make('LunarLanderContinuous-v2')

    def run(self):
        total_step = 1
        while self.g_ep.value < MAX_EP:
            state = self.env.reset()
            ep_r = 0.
            self.agent.reset()

            for t in range(MAX_EP_STEP):
                # if self.name == 'w1':
                #     self.env.render()

                action = self.agent.act(state)
                action = np.clip(action, -1, 1)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done, t)

                if t == MAX_EP_STEP - 1:
                    done = True

                ep_r += reward

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # time to sync

                    if done:  # done and print information
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue,
                               self.name)
                        break

                state = next_state
                total_step += 1
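A hedged launcher sketch for the `Worker` processes above (it assumes the shared networks and optimizers have already been built and stored in the `gnet` and `opt` dictionaries the constructor expects; the original snippet does not show that setup):

import torch.multiprocessing as mp

if __name__ == '__main__':
    global_ep = mp.Value('i', 0)      # shared episode counter
    global_ep_r = mp.Value('d', 0.0)  # shared running reward
    res_queue = mp.Queue()            # results reported via record()
    workers = [Worker(gnet, opt, global_ep, global_ep_r, res_queue, i)
               for i in range(mp.cpu_count())]
    for w in workers:
        w.start()
    for w in workers:
        w.join()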
Example #11
def ddpg(n_episodes=1000, max_t=500, print_every=100):
    scores_deque = deque(maxlen=print_every)
    scores = []

    # Create the env and the agent
    terminating_angle = 15
    env = CubeEnv(np.deg2rad(terminating_angle))
    agent = Agent(state_size=3, action_size=1, random_seed=2)

    plotter = LivePlotter(env, max_t, terminating_angle, n_episodes)

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()
        score = 0
        done = False
        plotter.reset()
        while not done:
            # Select the next action and update system
            action = agent.act(state) * 10
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)

            # Update plots and metrics
            state = next_state
            score += reward
            plotter.add_data_from_env(env)

        scores_deque.append(score)
        scores.append(score)
        plotter.add_score(score)
        print('\rEpisode {}\tScore: {}'.format(i_episode, score), end="")

        # Display the plots
        plotter.display()

        # Save model
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

    return scores
Example #12
def test():
    agent = Agent(state_size=33, action_size=4, seed=0)
    load_model(agent.critic_local, 'solved_critic_trained_model.pth')
    load_model(agent.actor_local, 'solved_actor_trained_model.pth')
    env_info = env.reset(train_mode=False)[brain_name]

    state = env_info.vector_observations
    score = np.zeros(1)
    while True:
        action = agent.act(state, 0, False)
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations  # get the next state
        reward = env_info.rewards  # get the reward
        done = env_info.local_done  # see if episode has finished

        state = next_state
        score += reward

        if np.any(done):
            print('\r\tTest Score: {:.2f}'.format(score[0]), end="")
            break
print_every = 100
iterations = 20

scores_deque = deque(maxlen=print_every)
scores = []

for episode in range(episodes):
    # Reset the environment
    cur_state = env.reset(seed=episode)

    score = 0

    for i in range(iterations + 1):

        # Predict the best action for the current state.
        action = agent.act(cur_state, add_noise=True)

        # Action is performed and new state, reward, info are received.
        new_state, reward, done, info = env.step(action)
        print("episode: ", episode, " sample: ", i, " reward: ", reward)
        # current state, action, reward, new state are stored in the experience replay
        agent.step(cur_state, action, reward, new_state, done)

        # roll over new state
        cur_state = new_state

        score += reward
        if done:
            break

    scores_deque.append(score)
Example #14
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# training parameters
n_episodes = 4000
print_every = 100
scores_deque = deque(maxlen=print_every)
scores_final = []
agent = Agent(state_size, action_size, num_agents, random_seed=2)
# ----------------------- training the agents ----------------------- #
for i_episode in range(n_episodes):
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    while True:
        actions = agent.act(states)  # select an action (for each agent)
        env_info = env.step(actions)[
            brain_name]  # send all actions to the environment
        next_states = env_info.vector_observations  # get next state (for each agent) next_states shape:(2,24)
        rewards = env_info.rewards  # get reward (for each agent)
        dones = env_info.local_done  # see if episode finished
        agent.step(states, actions, rewards, next_states, dones)
        scores += env_info.rewards  # update the score (for each agent)
        states = next_states  # roll over states to next time step
        if np.any(dones):  # exit loop if episode finished
            break
    scores_deque.append(max(scores))
    scores_final.append(scores)
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode,
                                                       np.mean(scores)),
          end="")
def main():

    # load the Tennis environment (two agents)
    env_name = r"Tennis_Windows_x86_64\Tennis.exe"  # add a Unity-Environment name.
    no_graphics = False
    env = UnityEnvironment(file_name=env_name, no_graphics=no_graphics)

    # Environments contain "brains" which are responsible for deciding the actions of their
    # associated agents. Here we check for the first brain available, and set it as the
    # default brain we will be controlling from Python.

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print("Number of agents : ", num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print("Size of each action : ", action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print("There are {} agents. Each observes a state with length: {}".format(states.shape[0], state_size))


    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    random_seed = 12345 #10
    agent = Agent(state_size, action_size, num_agents, random_seed, device=device)
    actor_state_dict1 = torch.load("checkpoint_actor1.pth")
    agent.actor_local1.load_state_dict(actor_state_dict1)
    critic_state_dict1 = torch.load("checkpoint_critic1.pth")
    agent.critic_local1.load_state_dict(critic_state_dict1)
    actor_state_dict2 = torch.load("checkpoint_actor2.pth")
    agent.actor_local2.load_state_dict(actor_state_dict2)
    critic_state_dict2 = torch.load("checkpoint_critic2.pth")
    agent.critic_local2.load_state_dict(critic_state_dict2)



    # Take Random Actions in the Environment
    env_info = env.reset(train_mode=False)[brain_name]      # reset the environment
    states = env_info.vector_observations                  # get the current state (for each agent)

    scores = np.zeros(num_agents)                          # initialize the score (for each agent)
    while True:
        actions = agent.act(states, add_noise=False)        # select an action (for each agent)
        env_info = env.step(actions)[brain_name]            # send all actions to the environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished

        scores += env_info.rewards                         # update the score (for each agent)
        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break
    print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))


    # When finished, you can close the environment
    env.close()
def main():
    env = UnityEnvironment(file_name='Reacher.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=3)

    scores_deque = deque(maxlen=100)
    scores = []

    for i_episode in range(1, 1000):
        begin = time.time()
        curr_scores = np.zeros(
            num_agents)  # initialize the score (for each agent)
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)

        agent.reset()

        for t in range(1000):
            actions = agent.act(states)
            env_info = env.step(actions)[
                brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished

            agent.step(states, actions, rewards, next_states, dones, t)

            states = next_states
            curr_scores += rewards

            if np.any(dones):
                break

        curr_score = np.mean(curr_scores)
        scores_deque.append(curr_score)
        average_score = np.mean(scores_deque)
        scores.append(curr_score)

        print(
            '\rEpisode {}\tTime: {:.2f}\tAvg: {:.2f}\tScore: {:.2f}\tMin {:.2f}\tMax {:.2f}'
            .format(i_episode,
                    time.time() - begin, average_score, curr_score,
                    min(curr_scores), max(curr_scores)))
        if i_episode % 10 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
        if average_score >= 30.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, average_score))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
            break

    env.close()

    return
pretrained_dict = torch.load('checkpoint_actor.pth')
model_dict = agent.actor_local.state_dict()

# 1. filter out unnecessary keys
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
# 2. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
# 3. load the new state dict
agent.actor_local.load_state_dict(model_dict)

pretrained_dict = torch.load('checkpoint_critic.pth')
model_dict = agent.critic_local.state_dict()

# 1. filter out unnecessary keys
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
# 2. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
# 3. load the new state dict
agent.critic_local.load_state_dict(model_dict)

state = env.reset()
agent.reset()

while True:

    action = agent.act(state)[0]
    env.render()
    # time.sleep(0.1)
    next_state, reward, done, _ = (env.step(action))
    state = next_state

    if done:
        break

env.env.close()
Example #18
def train_ddpg(dev,
               weights_file_actor,
               weights_file_critic,
               n_episodes=1000,
               max_t=1000):
    """DDPG Learning.

    Params
    ======
        dev (string): cpu or gpu
        weights_file_actor (string): name of the file to save the weights of the actor
        weights_file_critic (string): name of the file to save the weights of the critic
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    scores = []  # list containing scores from each episode (average of all the agents)
    averages = []  # averages of the scores; position i (1-indexed) holds the average of the last min(i, 100) episodes
    scores_window = deque(maxlen=100)  # last 100 averaged scores for all the agents
    env = UnityEnvironment(file_name='./Tennis_Linux/Tennis.x86_64')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]
    action_size = brain.vector_action_space_size
    agent = Agent(state_size, action_size, random_seed=0, device=dev)

    print('Number of agents: {:d}'.format(num_agents))
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()  # reset noise for the actions
        states = env_info.vector_observations
        current_scores = np.zeros(
            num_agents)  # initialize the score for all the agents
        for t in range(max_t):
            actions = agent.act(states)  # process the states of all the agents

            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            current_scores += rewards
            if np.any(dones):
                break
        max_score = np.max(
            current_scores)  # current maximum score of all the agents
        scores.append(max_score)
        scores_window.append(max_score)
        averages.append(np.mean(scores_window))
        if (i_episode % 100 != 0):
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, averages[i_episode - 1]),
                  end="")
        else:
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, averages[i_episode - 1]))
        if (averages[i_episode - 1] >= 0.5):
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'
                .format(i_episode - 100, averages[i_episode - 1]))
            torch.save(agent.actor_local.state_dict(), weights_file_actor)
            torch.save(agent.critic_local.state_dict(), weights_file_critic)
            break

    env.close()
    return scores, averages
Example #19
def test(dev,
         weights_file_actor,
         weights_file_critic,
         n_episodes=100,
         max_t=1000):
    """Test the environment with the parameters stored in checkpoint.pth

    Params
    ======
        dev (string): cpu or gpu
        weights_file_actor (string): name of the file to load the weights of the actor
        weights_file_critic (string): name of the file to load the weights of the critic
        n_episodes (int): number of test episodes that will be performed
    """
    env = UnityEnvironment(file_name='./Tennis_Linux/Tennis.x86_64')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]
    action_size = brain.vector_action_space_size
    agent = Agent(state_size, action_size, random_seed=0, device=dev)
    scores = []

    # load the weights from file
    print('Number of agents: {:d}'.format(num_agents))
    print('Loading weights')
    try:
        checkpoint_actor = torch.load(weights_file_actor)
    except FileNotFoundError:
        print('Error: File \'{}\' not found'.format(weights_file_actor))
        sys.exit(1)
    try:
        checkpoint_critic = torch.load(weights_file_critic)
    except FileNotFoundError:
        print('Error: File \'{}\' not found'.format(weights_file_critic))
        sys.exit(1)

    agent.actor_local.load_state_dict(checkpoint_actor)
    agent.critic_local.load_state_dict(checkpoint_critic)
    print('Running {} episodes'.format(n_episodes))
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=False)[brain_name]
        current_scores = np.zeros(
            num_agents)  # initialize the score for all the agents
        states = env_info.vector_observations
        for t in range(max_t):
            actions = agent.act(states, add_noise=False)
            env_info = env.step(actions)[brain_name]
            states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            current_scores += rewards
            if np.any(dones):
                break
        max_score = np.max(
            current_scores)  # current maximum score of all the agents
        scores.append(max_score)
        if (i_episode % 100 != 0):
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, np.mean(scores)),
                  end="")
        else:
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}'.format(
                i_episode, max_score, np.mean(scores)))

    env.close()
# print(action_size,state_size)

import torch
if condition[0] == "random":
    pass
else:
    agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
    agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))
    agent.actor_target.load_state_dict(
        torch.load('checkpoint_actor_target.pth'))
    agent.critic_target.load_state_dict(
        torch.load('checkpoint_critic_target.pth'))

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
states = env_info.vector_observations  # get the current state (for each agent)
scores = np.zeros(num_agents)  # initialize the score (for each agent)
while True:
    actions = [agent.act(states[no_agent, :]) for no_agent in range(20)]
    actions = np.array(actions).reshape(20, 4)
    actions = np.clip(actions, -1, 1)  # all actions between -1 and 1
    env_info = env.step(actions)[
        brain_name]  # send all actions to the environment
    next_states = env_info.vector_observations  # get next state (for each agent)
    rewards = env_info.rewards  # get reward (for each agent)
    dones = env_info.local_done  # see if episode finished
    scores += env_info.rewards  # update the score (for each agent)
    states = next_states  # roll over states to next time step
    if np.any(dones):  # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(
    np.mean(scores)))
Example #21
def ddpg(model_number,
         UPD,
         BUFFER_SIZE,
         BATCH_SIZE,
         LR_ACTOR,
         LR_CRITIC,
         fc1_units,
         fc2_units,
         a_gradient_clipping,
         a_leaky,
         a_dropout,
         c_gradient_clipping,
         c_batch_norm,
         c_leaky,
         c_dropout,
         n_episodes=400,
         max_t=2000,
         print_every=100):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        ...
    """

    agent = Agent(state_size, action_size, UPD, BUFFER_SIZE, BATCH_SIZE,
                  LR_ACTOR, LR_CRITIC, fc1_units, fc2_units,
                  a_gradient_clipping, a_leaky, a_dropout, c_gradient_clipping,
                  c_batch_norm, c_leaky, c_dropout, 0, 12345)
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=print_every)  # last 100 scores

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, a_dropout, a_leaky)

            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward

            if done:
                break

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score

        with open('results.txt', 'a') as output:
            output.writelines(\
            '{}, {}, {:.2f}, {:.2f}, {}, {}, {}, {:.4f}, {:.4f}, {}, {}, {}, {}, {}, {}, {}, {}, {} \n'.format(
            model_number, i_episode, np.mean(scores_window), score,
            UPD, BUFFER_SIZE, BATCH_SIZE,
            LR_ACTOR, LR_CRITIC,
            fc1_units, fc2_units,
            a_gradient_clipping, a_leaky, a_dropout,
            c_gradient_clipping, c_batch_norm, c_leaky, c_dropout))
            output.flush()

        print('\rModel nr: {}, Episode {}, avg. score: {:.2f}, score: {:.2f}'.format\
              (model_number, i_episode, np.mean(scores_window), score), end="")
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
                i_episode, np.mean(scores_window), score))
        if np.mean(scores_window) >= 30.0:
            with open('./models/models_solved.txt', 'a') as solved:
                solved.writelines('{}, {} \n'.format(model_number, i_episode))
                solved.flush()
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_window)))
            torch.save(
                agent.actor_local.state_dict(),
                './models/checkpoint_actor_' + str(model_number) + '.pth')
            torch.save(
                agent.critic_local.state_dict(),
                './models/checkpoint_critic_' + str(model_number) + '.pth')
            break
    return scores
Example #22
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

##################################
########### LOAD AGENT ###########
##################################

agent = Agent(state_size=state_size, action_size=action_size, num_agents=num_agents, seed=1)
agent.load_model(path_to_actor, path_to_critic)

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
states = env_info.vector_observations  # get the current state
score = np.zeros(num_agents)  # initialize the score
while True:
    actions = agent.act(states)  # select an action
    env_info = env.step(actions)[brain_name]  # send the action to the environment
    next_states = env_info.vector_observations  # get the next state
    rewards = env_info.rewards  # get the reward
    dones = env_info.local_done  # see if episode has finished
    score += rewards  # update the score
    states = next_states  # roll over the state to next time step
    if np.any(dones):
        break

print("Score: {}".format(score))
env.close()
Example #23
def train_or_play(cfg):
    # initialize the environment and obtain state/action sizes and other parameters
    env = init_environment(cfg.app_path)

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]

    action_size = brain.vector_action_space_size
    state_size = len(env_info.vector_observations[0])

    agent = Agent(state_size, action_size, cfg)

    if cfg.train_model:

        scores = ddpg_learning(env,
                               agent,
                               brain_name,
                               n_episodes=cfg.n_episodes,
                               max_t=cfg.max_t,
                               avg_score_cutoff=cfg.avg_score_cutoff,
                               save_path_actor=cfg.save_path_actor,
                               save_path_critic=cfg.save_path_critic)

        if cfg.save_scores:
            print("Saving scores to file {:s}".format(cfg.save_scores))
            scores.to_hdf(cfg.save_scores, "scores")

        plot_scores(scores, cfg)

    else:  # visualize trained model and scores

        assert os.path.exists(
            cfg.save_path_actor
        ), "Saved model weights need to exist before you can watch a trained agent!"
        assert os.path.exists(
            cfg.save_path_critic
        ), "Saved model weights need to exist before you can watch a trained agent!"

        print("Visualizing the trained agent!")

        env_info = env.reset(train_mode=False)[brain_name]
        agent.actor_local.load_state_dict(torch.load(cfg.save_path_actor))
        agent.critic_local.load_state_dict(torch.load(cfg.save_path_critic))

        score = 0  # initialize the score
        state = env_info.vector_observations[0]
        while True:
            action = agent.act(state,
                               add_noise=False)  # take step without noise
            env_info = env.step(action)[brain_name]
            state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            score += reward
            if done:
                break

        if os.path.exists(cfg.save_scores):
            plot_scores(pd.read_hdf(cfg.save_scores, "scores"), cfg)

    env.close()
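For reference, a minimal sketch of a configuration object that satisfies the attributes `train_or_play` reads. The field values here are illustrative assumptions, and the `Agent` in this snippet also receives `cfg` and may read further hyperparameter fields not shown in the listing:

from types import SimpleNamespace

cfg = SimpleNamespace(
    app_path='Reacher.app',                  # assumed path to the Unity build
    train_model=True,                        # False to watch a trained agent instead
    n_episodes=500,
    max_t=1000,
    avg_score_cutoff=30.0,
    save_path_actor='checkpoint_actor.pth',
    save_path_critic='checkpoint_critic.pth',
    save_scores='scores.h5',
)
train_or_play(cfg)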
Example #24
agent_2.critic_target = agent_1.critic_target
t_max = 1000
print_every = 100
maxlen = 100

score = []
ev_score = []
scores_deque = deque(maxlen=maxlen)
for i_episode in range(1, env.n_episodes + 1):  # play for the configured number of episodes
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    agent_1.reset()
    agent_2.reset()
    for t in range(t_max):
        actions_1 = agent_1.act(np.expand_dims(states[0], 0), True)
        actions_2 = agent_2.act(np.expand_dims(states[1], 0), True)
        # actions_1 = np.clip(actions_1, -1, 1)             # all actions between -1 and 1
        actions = np.concatenate((actions_1, actions_2))
        env_info = env.step(actions)[
            brain_name]  # send all actions to the environment

        next_states, rewards, dones = env_info.vector_observations, env_info.rewards, env_info.local_done
        # for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
        agent_1.step(np.expand_dims(states[0], 0), actions_1, rewards[0],
                     np.expand_dims(next_states[0], 0), dones[0], t)
        agent_2.step(np.expand_dims(states[1], 0), actions_2, rewards[1],
                     np.expand_dims(next_states[1], 0), dones[1], t)

        scores += rewards  # update the score (for each agent)
        states = next_states  # roll over states to next time step
Example #25
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agent = Agent(num_agents=num_agents,
              state_size=state_size,
              action_size=action_size,
              random_seed=0)
# load the weights from file
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
states = env_info.vector_observations  # get the current state (for each agent)
scores = np.zeros(num_agents)
for t in range(1000):
    actions = agent.act(states, add_noise=False)
    env_info = env.step(actions)[brain_name]  # send the actions to the environment
    dones = env_info.local_done  # see if episode has finished
    rewards = env_info.rewards  # get the reward
    states = env_info.vector_observations  # get the next state
    scores += rewards
    if np.any(dones):
        break
print("average score for the episode is", np.max(scores))
env.close()
Example #26
def train(
    n_episodes,
    max_t,
    env_fp,
    no_graphics,
    seed,
    save_every_nth,
    buffer_size,
    batch_size,
    gamma,
    tau,
    lr_actor,
    lr_critic,
    weight_decay,
    log,
):
    log.info("#### Initializing environment...")
    # init environment
    env = UnityEnvironment(file_name=env_fp, no_graphics=no_graphics)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    log.info(f"Number of agents: {num_agents}")

    # size of each action
    action_size = brain.vector_action_space_size
    log.info(f"Size of each action: {action_size}")

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    log.info(
        f"There are {states.shape[0]} agents. Each observes a state with length: {state_size}"
    )
    log.info(f"The state for the first agent looks like: {states[0]}")

    agent = Agent(
        num_agents=len(env_info.agents),
        state_size=state_size,
        action_size=action_size,
        buffer_size=buffer_size,
        batch_size=batch_size,
        gamma=gamma,
        tau=tau,
        lr_actor=lr_actor,
        lr_critic=lr_critic,
        weight_decay=weight_decay,
        random_seed=seed,
    )

    log.info("#### Training...")

    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = np.zeros((len(env_info.agents), 1))
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            rewards = np.array(rewards).reshape((next_states.shape[0], 1))
            dones = env_info.local_done
            dones = np.array(dones).reshape((next_states.shape[0], 1))
            agent.step(states, actions, rewards, next_states, dones)
            score += rewards
            states = next_states
            if np.any(dones):
                break
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        print(
            "Episode {}\tAverage Score: {:.2f}\tScore: {:.2f}".format(
                i_episode, np.mean(scores_deque), scores[-1]),
            end="\r",
        )

        if i_episode % 100 == 0:
            print("\rEpisode {}\tAverage Score: {:.2f}".format(
                i_episode, np.mean(scores_deque)))
        if i_episode % save_every_nth == 0:
            save_checkpoint(
                state={
                    "episode": i_episode,
                    "actor_state_dict": agent.actor_local.state_dict(),
                    "critic_state_dict": agent.critic_local.state_dict(),
                    "scores_deque": scores_deque,
                    "scores": scores,
                },
                filename="checkpoint.pth",
            )
            plot_scores(
                scores=scores,
                title=f"Avg score over {len(env_info.agents)} agents",
                fname="avg_scores.png",
                savefig=True,
            )

        if np.mean(scores_deque) >= 30:
            torch.save(agent.actor_local.state_dict(), "checkpoint_actor.pth")
            torch.save(agent.critic_local.state_dict(),
                       "checkpoint_critic.pth")
            print(
                "\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}"
                .format(i_episode - 100, np.mean(scores_deque)))
            break
Example #27
    action = 0
    state_size = len(state)
    #

    b_agent = Agent(args.model_name, state_size, action_size)
    try:
        b_agent.load()  # try to load to continue training
    except Exception:
        pass  # no saved weights yet; start training from scratch

    for epx in range(1, args.episodes + 1):
        at_step = 0
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        b_agent.reset_episode()
        while True:
            action = b_agent.act(state)
            env_info = env.step(action)[brain_name]
            at_step += 1
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            if at_step % 100 == 0:
                log.info("ep:{} step:{} r:{} l:{}".format(
                    epx, at_step, b_agent.cum_rewards(), b_agent.ave_loss()))
            if done:
                break
            b_agent.sense(state, action, reward, next_state, done)
            state = next_state
        print("{},{}".format(epx, b_agent.cum_rewards()))
        b_agent.save()
Example #28
def ddpg(n_episodes=500,
         max_t=1000,
         start_steps=10,
         learn_frequency=20,
         learn_count=10,
         random_seed=1):
    """Deep Deterministic Policy Gradient (DDPG)

    Params
    ======
        n_episodes (int)      : maximum number of training episodes
        max_t (int)           : maximum number of timesteps per episode
        start_steps (int)     : number of starting steps actions are chosen randomly
        learn_frequency (int) : frequency of learning per timestep
        learn_count (int)     : number of learning steps to do at learning timestep
        random_seed (int)     : random seed for agent's weights
    """

    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=random_seed)  #Initialize the Agent

    avg_scores_episode = []  # list containing scores from each episode
    avg_scores_moving = []  # list containing avg scores from window at each episode
    scores_window = deque(maxlen=100)  # last 100 scores

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset environment
        states = env_info.vector_observations  # get current state for each agent
        scores = np.zeros(num_agents)  # initialize score for each agent
        agent.reset()  # reset noise of the agent

        for t in range(max_t):
            #Randomly sample actions during the starting steps
            if i_episode <= start_steps:
                actions = np.random.randn(
                    num_agents, action_size)  # select an action randomly
                actions = np.clip(actions, -1,
                                  1)  # all actions between -1 and 1
            else:
                actions = agent.act(
                    states, add_noise=True
                )  # select an action according to policy (for each agent)
            env_info = env.step(actions)[
                brain_name]  # send actions to environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode has finished (for each agent)

            # for each agent's experience, save it and learn
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                if t % learn_frequency == 0:  # Learn with frequency
                    agent.step(state,
                               action,
                               reward,
                               next_state,
                               done,
                               learn=True,
                               learn_count=learn_count)
                else:
                    agent.step(state,
                               action,
                               reward,
                               next_state,
                               done,
                               learn=False)  #just add, don't learn

            states = next_states

            scores += rewards  # add the rewards from the timestep to the scores
            if np.any(
                    dones
            ):  # finish episode if any agent has reached a terminal state
                break

        scores_window.append(
            np.mean(scores))  # save the most recent score to scores window

        avg_scores_episode.append(
            np.mean(scores))  # save the most recent score to avg_scores
        avg_scores_moving.append(
            np.mean(scores_window)
        )  # save the most recent score window average to moving averages

        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 1 == 0:  # Print every episode
            print(
                '\rEpisode {}\tAverage Score: {:.2f} \t Current Score: {:.2f}'.
                format(i_episode, np.mean(scores_window), np.mean(scores)))

        #environment is solved
        if np.mean(scores_window) >= 30.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(),
                       "checkpoint_actor.pth")  #Save actors' weights
            torch.save(agent.critic_local.state_dict(),
                       "checkpoint_critic.pth")  #Save critics' weights
            break

    return avg_scores_episode, avg_scores_moving  # Return average score of each episode and moving average at that time
def multi_agent_ddpg(env, brain_name, title, n_episodes, action_size,
                     state_size, num_agents, print_every, n_updates,
                     update_intervals, device):

    # create save dir for this experiment
    if title is None:
        title = "experiment"
    current_time = strftime("%Y-%m-%d_%H:%M:%S", gmtime())
    title = title + "_" + current_time

    # write a new file
    os.makedirs("experiments/{}".format(title), exist_ok=True)
    f = open("experiments/{}/scores.txt".format(title), "w")
    f.close()

    all_agents_statesize = state_size * num_agents

    agent1 = Agent(state_size=all_agents_statesize,
                   action_size=action_size,
                   num_agents=1,
                   random_seed=123,
                   device=device)
    agent2 = Agent(state_size=all_agents_statesize,
                   action_size=action_size,
                   num_agents=1,
                   random_seed=123,
                   device=device)

    scores_deque = deque(maxlen=100)
    mean_scores = []

    for i_episode in range(1, n_episodes + 1):

        env_info = env.reset(train_mode=True)[brain_name]

        states = env_info.vector_observations
        states = np.reshape(
            states,
            (1, all_agents_statesize
             ))  # reshape so we can feed both agents states to each agent

        # reset
        agent1.reset()
        agent2.reset()

        # place to store scores
        agent_scores = np.zeros(num_agents)
        t = 0
        while True:
            # two agents actions
            actions_1 = agent1.act(states, add_noise=True)
            actions_2 = agent2.act(states, add_noise=True)

            # step environment for two agents and get next states
            actions = np.concatenate((actions_1, actions_2), axis=0)
            actions = np.reshape(actions, (1, 4))
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            next_states = np.reshape(next_states, (1, all_agents_statesize))
            rewards = env_info.rewards
            dones = env_info.local_done

            # update the agents accordingly (ddpg)
            agent1.step(states, actions_1, rewards[0], next_states, dones[0],
                        n_updates, update_intervals, t)
            agent2.step(states, actions_2, rewards[1], next_states, dones[1],
                        n_updates, update_intervals, t)

            states = next_states
            agent_scores += rewards

            if np.any(dones):
                break
            t += 1

        scores_deque.append(np.max(agent_scores))
        print('\rEpisode {}\tLast 100 average Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)),
              end="")

        # save score and model every print_every
        if i_episode % print_every == 0:
            f = open("experiments/{}/scores.txt".format(title), "a")
            f.write("{},{}\n".format(i_episode, np.mean(scores_deque)))
            f.close()
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
            mean_scores.append(np.mean(scores_deque))
            # save if best model
            if np.mean(scores_deque) == max(mean_scores):
                torch.save(
                    agent1.actor_local.state_dict(),
                    'experiments/{}/checkpoint_actor1.pth'.format(title))
                torch.save(
                    agent1.critic_local.state_dict(),
                    'experiments/{}/checkpoint_critic1.pth'.format(title))
                torch.save(
                    agent2.actor_local.state_dict(),
                    'experiments/{}/checkpoint_actor2.pth'.format(title))
                torch.save(
                    agent2.critic_local.state_dict(),
                    'experiments/{}/checkpoint_critic2.pth'.format(title))

            if np.mean(scores_deque) >= 1.0 and i_episode > 100:
                print("\rEnvironment solved with average score of 30")
                break
Example #30
#            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
#
#  return scores

#scores = ddpg()
n_episodes = 15
max_t = 300
print_every = 100
scores_deque = deque(maxlen=print_every)
scores = []
for i_episode in range(1, n_episodes + 1):
    state = env.reset()
    agent.reset()
    score = 0
    for t in range(max_t):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward

        if done:
            break
    scores_deque.append(score)
    scores.append(score)

    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode,
                                                       np.mean(scores_deque)),
          end="")
    torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
    torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
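The final snippet collects `scores` but never visualizes them; a short sketch of how they could be plotted (matplotlib usage here is an assumption, mirroring the plotting done in Example #7):

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(range(1, len(scores) + 1), scores)
ax.set(xlabel='Episode #', ylabel='Score', title='DDPG training scores')
fig.savefig('ddpg_scores.png')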