# --- interaction step: assemble the joint action, step the env, store per-agent transitions ---
p.append(out[i][0])  # collects agent i's action; completes the action-assembly loop begun above
next_obs, reward, terminated, info = env.step(np.hstack(p))
setps += 1
ep_len += 1
for i in range(n_ant):
    buff[i].add(obs, p[i], reward, next_obs, terminated)
obs = next_obs
if terminated or (ep_len == max_ep_len):
    obs = env.reset()
    terminated = False
    ep_len = 0
if setps % 10000 == 0:
    print(test_agent())

# --- update phase: skip during warm-up, then train every 50 environment steps ---
if (setps < 1000) or (setps % 50 != 0):
    continue
for e in range(50):
    for i in range(n_ant):
        X[i], A[i], R[i], next_X[i], D[i] = buff[i].getBatch(batch_size)
    # q_e holds n_ant target Q-values followed by n_ant entropy terms,
    # giving a soft (entropy-regularised) bootstrap target: Q - alpha * log pi
    q_e = agents.compute_target([next_X[i] for i in range(n_ant)])
    for i in range(n_ant):
        Q_target[i] = R[i] + (q_e[i] - alpha * q_e[i + n_ant]) * gamma * (1 - D[i])
    agents.train_critics(X, A, Q_target)
    agents.train_actors(X)
    agents.update()
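# ------------------------------------------------------------------
# The loop above calls buff[i].add(...) and buff[i].getBatch(batch_size)
# on a per-agent replay buffer whose implementation is not shown here.
# The class below is only a minimal stand-in sketch matching that usage;
# the name ReplayBuffer and all of its internals are assumptions, not
# the original code.
# ------------------------------------------------------------------
import random
import numpy as np

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []          # list of (obs, action, reward, next_obs, done) tuples
        self.pointer = 0        # total number of add() calls so far

    def add(self, obs, action, reward, next_obs, done):
        transition = (obs, action, reward, next_obs, done)
        if len(self.data) < self.capacity:
            self.data.append(transition)
        else:
            self.data[self.pointer % self.capacity] = transition  # overwrite oldest slot
        self.pointer += 1

    def getBatch(self, batch_size):
        batch = random.sample(self.data, min(batch_size, len(self.data)))
        obs, action, reward, next_obs, done = map(np.array, zip(*batch))
        # rewards and done flags as column vectors so they broadcast against Q-values
        return obs, action, reward.reshape(-1, 1), next_obs, done.reshape(-1, 1)

    def reset(self):
        self.data, self.pointer = [], 0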
# --- single shared-buffer variant: fill the buffer, then train critic and actors ---
sum_reward += reward
setps += 1
buff.add(obs, p, reward, next_obs, terminated)
obs = next_obs
if terminated:
    obs = env.reset()
    terminated = False
    reward_list.append(sum_reward)
    sum_reward = 0

if buff.pointer > buffer_size:
    print(np.mean(reward_list))
    reward_list = []
    # critic: num_ite mini-batch updates on multi-step bootstrap targets
    for k in range(num_ite):
        states, actions, returns, next_states, dones, gammas = buff.getBatch(mini_batch)
        Q_target = agents.compute_target([next_states])[0]
        Q_target = returns + Q_target * gammas * (1 - dones)
        agents.train_critic(states, actions, Q_target)
        agents.update()
    # actors: one large batch, with optional per-agent advantage normalisation
    states, actions, returns, next_states, dones, gammas = buff.getBatch(2000)
    advantages = agents.compute_advantage([states] + [actions[i] for i in range(n_ant)])
    if advantage_norm:
        for i in range(n_ant):
            advantages[i] = (advantages[i] - advantages[i].mean()) / (advantages[i].std() + 1e-8)
    agents.train_actors(states, actions, advantages)
    buff.reset()
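# ------------------------------------------------------------------
# In the loop above the critic target is
#     Q_target = returns + Q(next_state) * gammas * (1 - dones),
# i.e. the buffer appears to store a discounted multi-step return per
# sample together with the matching bootstrap discount (gamma**n).
# The helper below is a hypothetical sketch of how such (returns,
# gammas) arrays could be built from a rollout of rewards; it is not
# the original buffer code, and it assumes next_states/dones refer to
# the state and terminal flag at the end of each n-step window.
# ------------------------------------------------------------------
import numpy as np

def n_step_returns(rewards, dones, gamma, n):
    """For each step t, return the discounted sum of up to n rewards,
    returns[t] = r_t + gamma*r_{t+1} + ... (truncated at episode ends),
    and gammas[t] = gamma**m, where m rewards were actually accumulated."""
    T = len(rewards)
    returns = np.zeros(T)
    gammas = np.zeros(T)
    for t in range(T):
        ret, discount = 0.0, 1.0
        for k in range(n):
            if t + k >= T:
                break
            ret += discount * rewards[t + k]
            discount *= gamma
            if dones[t + k]:
                break
        returns[t] = ret
        gammas[t] = discount
    return returns, gammas

# Example use: returns, gammas = n_step_returns(rollout_rewards, rollout_dones, gamma=0.99, n=5)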