import time
import numpy as np

# MADDPG, DDPG, the environment `avs`, `imitation`, `plot`, and the constants
# (ALGORITHM, RETRAIN, VAR, MIN_VAR, DECAY, MAX_EPISODES, MAX_EP_STEPS,
# IMITATION_EPISODE, MEMORY_CAPACITY, RENDER, DEBUG) are defined elsewhere
# in the module.

def update():
    if ALGORITHM == 'maddpg':
        ddpg = MADDPG(avs.n_actions, avs.n_features, 1, 'maddpg model', RETRAIN)
    else:  # default to single-agent DDPG
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)

    t1 = time.time()
    rewards1, rewards2 = 0, 0
    var = VAR                      # exploration noise scale
    collision = 0
    avgreward1, avgreward2 = [], []
    collision_percentage = []

    for i in range(MAX_EPISODES):
        s1, s2 = avs.reset()
        ep_reward1, ep_reward2 = 0, 0

        # Periodically plot training curves once the imitation phase has ended.
        if i % 100000 == 0 and i > IMITATION_EPISODE:
            plot(avgreward1, avgreward2, collision_percentage, i)

        for j in range(MAX_EP_STEPS):
            if RENDER:
                avs.render()

            # During the imitation phase (and every fourth episode after it),
            # actions come from the rule-based controller; otherwise the policy
            # acts with Gaussian exploration noise, clipped to [-1, 1].
            if i < IMITATION_EPISODE or i % 4 == 0:
                a1 = imitation(avs.agent1, avs.agent2, avs.target1)
                a2 = imitation(avs.agent2, avs.agent1, avs.target2)
            else:
                a1 = ddpg.choose_action(s1)
                a1 = [np.clip(np.random.normal(a1[0], var), -1, 1),
                      np.clip(np.random.normal(a1[1], var), -1, 1)]
                a2 = ddpg.choose_action(s2)
                a2 = [np.clip(np.random.normal(a2[0], var), -1, 1),
                      np.clip(np.random.normal(a2[1], var), -1, 1)]

            if DEBUG:
                time.sleep(0.1)

            s_1, r1, s_2, r2, done, info = avs.step(a1, a2)

            # DDPG stores each agent's transition independently; MADDPG also
            # stores the other agent's observation and action for the
            # centralized critic.
            if ALGORITHM == 'ddpg':
                ddpg.store_transition(s1, a1, r1, s_1)
                ddpg.store_transition(s2, a2, r2, s_2)
            else:
                ddpg.store_transition(s1, s2, a1, a2, r1, s_1, s_2)
                ddpg.store_transition(s2, s1, a2, a1, r2, s_2, s_1)

            s1, s2 = s_1, s_2
            ep_reward1 += r1
            ep_reward2 += r2

            if j == MAX_EP_STEPS - 1 or done:
                print('pt:', ddpg.pointer)
                print('Episode:', i, 'Step:', j,
                      'Reward: %i %i' % (int(ep_reward1), int(ep_reward2)),
                      'Explore: %.2f' % var)
                if i >= IMITATION_EPISODE:
                    rewards1 += ep_reward1
                    rewards2 += ep_reward2
                    if r1 < -100:  # large negative reward marks a collision
                        collision += 1
                    if (i + 1) % 100 == 0:
                        # Record 100-episode reward averages and collision count.
                        avgreward1.append(rewards1 / 100)
                        avgreward2.append(rewards2 / 100)
                        collision_percentage.append(collision)
                        rewards1, rewards2, collision = 0, 0, 0
                break

            # Start learning once the replay buffer is full: two gradient
            # updates per environment step, with decaying exploration noise.
            if ddpg.pointer > MEMORY_CAPACITY:
                ddpg.learn()
                ddpg.learn()
                if var > MIN_VAR and i > IMITATION_EPISODE:
                    var *= DECAY  # decay the action randomness

        # Save the model after non-imitation episodes where both agents did well.
        if i % 4 != 0 and ep_reward1 > 100 and ep_reward2 > 100 and i > IMITATION_EPISODE:
            ddpg.save(i)

    print('Running time: ', time.time() - t1)
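The imitation() helper referenced above is not part of this listing. As a rough illustration only, a rule-based controller of this shape could drive an agent toward its target while veering away from the other agent; the field names (x, y), the avoidance radius, and the whole control law below are assumptions, not the project's actual implementation:

import numpy as np

def imitation(agent, other, target):
    # Hypothetical demonstration policy: head toward the target, pushed
    # away from the other agent when it is close. All field names and
    # constants here are assumed for illustration only.
    to_target = np.array([target.x - agent.x, target.y - agent.y], dtype=float)
    direction = to_target / (np.linalg.norm(to_target) + 1e-8)
    to_other = np.array([other.x - agent.x, other.y - agent.y], dtype=float)
    dist = np.linalg.norm(to_other)
    if dist < 1.0:  # assumed avoidance radius
        direction -= to_other / (dist + 1e-8)
    return [float(np.clip(direction[0], -1, 1)),
            float(np.clip(direction[1], -1, 1))]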
# `env`, `maddpg_agents`, `memory`, `obs_list_to_state_vector`, and the
# constants N_GAMES, MAX_STEPS, n_agents are defined elsewhere.

evaluate = False
best_score = 0     # best average score so far, used when saving checkpoints
total_steps = 0    # global step counter across all episodes

if evaluate:
    maddpg_agents.load_checkpoint()

for i in range(N_GAMES):
    obs = env.reset()
    score = 0
    done = [False] * n_agents
    episode_step = 0
    while not any(done):
        if evaluate:
            env.render()
            # time.sleep(0.1)  # slow down the action for the video

        actions = maddpg_agents.choose_action(obs)
        obs_, reward, done, info = env.step(actions)

        # Flatten the per-agent observations into the global state vectors
        # consumed by the centralized critics.
        state = obs_list_to_state_vector(obs)
        state_ = obs_list_to_state_vector(obs_)

        # Truncate episodes that run past the step limit.
        if episode_step >= MAX_STEPS:
            done = [True] * n_agents

        memory.store_transition(obs, state, actions, reward, obs_, state_, done)

        # Learn every 100 environment steps, during training only.
        if total_steps % 100 == 0 and not evaluate:
            maddpg_agents.learn(memory)

        obs = obs_
        score += sum(reward)   # accumulate the team reward for this episode
        total_steps += 1
        episode_step += 1
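obs_list_to_state_vector() is not defined in this listing. A minimal sketch, assuming each agent's observation is a 1-D NumPy array, is to concatenate the local observations into the single global state vector that the centralized critics take as input:

import numpy as np

def obs_list_to_state_vector(observation):
    # Concatenate every agent's local observation into one flat
    # global state vector for the centralized critics.
    state = np.array([])
    for obs in observation:
        state = np.concatenate([state, obs])
    return state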