# Train a single DDPG agent and plot per-episode score and tau history.
# Relies on module-level `Agent`, `env`, `np`, and `plot_curve` defined elsewhere.
agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[8], tau=0.001, env=env,
              batch_size=64, layer1_size=400, layer2_size=300, n_actions=2)
n_episodes = 1000
np.random.seed(42)

tau_hist = []
score_hist = []

for i in range(n_episodes):
    done = False
    score = 0
    state = env.reset()
    while not done:
        act = agent.choose_action(state)
        next_state, reward, done, _ = env.step(act)
        # Store the transition (done flag as int) and do one learning step.
        agent.store(state, act, reward, next_state, int(done))
        agent.learn()
        score += reward
        state = next_state
    # NOTE(review): checkpoints every episode — confirm this isn't meant to be
    # conditional on a new best average score.
    agent.save_models()
    score_hist.append(score)
    tau_hist.append(agent.tau)
    # Running average over the last (up to) 100 episodes.
    avg_score = np.mean(score_hist[-100:])
    # Fixed: original concatenation had no separators and printed
    # e.g. "episode 1score 1.23average score 1.23".
    print(f'episode {i + 1} score {score:.2f} average score {avg_score:.2f}')

# Use a distinct name for the x-axis values instead of rebinding `episodes`
# (originally an int) to an array.
episode_axis = np.arange(1, n_episodes + 1)
plot_curve(episode_axis, score_hist, tau_hist)
class MADDPG():
    """Multi-agent DDPG: coordinates two DDPG Agents with a shared replay buffer."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize 2 Agent objects.

        Params
        ======
            state_size (int): dimension of one agent's observation
            action_size (int): dimension of each action
            random_seed (int): seed passed to the shared replay buffer
        """
        self.state_size = state_size
        self.action_size = action_size
        # Initialize the agents with fixed per-agent seeds.
        self.ddpg_agent0 = Agent(state_size, action_size, random_seed=0)
        self.ddpg_agent1 = Agent(state_size, action_size, random_seed=1)
        # Replay memory shared by both agents.
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, states, rand=False):
        """Return a pair of actions, one per agent.

        If ``rand`` is falsy, each agent acts with its local actor on its own
        observation; otherwise random actions clipped to [-1, 1] are returned.
        Fixed: the original compared ``rand == False`` / ``rand == True`` and
        silently returned None for any other truthy value (e.g. ``rand=2``).
        """
        if not rand:
            action0 = self.ddpg_agent0.act(states[0])
            action1 = self.ddpg_agent1.act(states[1])
            return [action0, action1]
        # Random exploration: 2 agents x 2 action dims, clipped to valid range.
        return np.clip(np.random.randn(2, 2), -1, 1)

    def step(self, states, actions, rewards, next_states, dones, learn=True):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Flatten the per-agent tuples into one joint experience record.
        self.memory.add(states[0], states[1],
                        actions[0], actions[1],
                        rewards[0], rewards[1],
                        next_states[0], next_states[1],
                        dones[0], dones[1])
        # Learn only once enough samples are buffered (idiomatic truthiness
        # instead of the original `learn == True`).
        if learn and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, GAMMA):
        """Update both agents from one sampled batch of joint experiences.

        NOTE(review): ``GAMMA`` is accepted but unused in this body —
        presumably each Agent applies its own discount internally; confirm.
        The signature is kept unchanged for caller compatibility.
        """
        s0, s1, a0, a1, r0, r1, next_s0, next_s1, d0, d1 = experiences
        # Next actions from the target actors (for each CRITIC's TD target).
        a_next0 = self.ddpg_agent0.actor_target(next_s0)
        a_next1 = self.ddpg_agent1.actor_target(next_s1)
        # Current-policy action predictions (for each ACTOR's loss).
        a_pred0 = self.ddpg_agent0.actor_local(s0)
        a_pred1 = self.ddpg_agent1.actor_local(s1)
        # The agents learn separately, each from its own perspective — agent1
        # receives the batch with the two agents' roles swapped.
        self.ddpg_agent0.learn(s0, s1, a0, a1, r0, r1,
                               next_s0, next_s1, d0, d1,
                               a_next0, a_next1, a_pred0, a_pred1)
        self.ddpg_agent1.learn(s1, s0, a1, a0, r1, r0,
                               next_s1, next_s0, d1, d0,
                               a_next1, a_next0, a_pred1, a_pred0)