Example #1
0
def update():
    """Train two agents with DDPG or MADDPG on the ``avs`` environment.

    Reads module-level configuration (ALGORITHM, RETRAIN, VAR, MIN_VAR,
    DECAY, MAX_EPISODES, MAX_EP_STEPS, IMITATION_EPISODE, MEMORY_CAPACITY,
    RENDER, DEBUG) and the environment object ``avs``.  Early episodes (and
    every 4th episode afterwards) use the ``imitation`` policy to seed the
    replay buffer; otherwise actions come from the learned policy plus
    Gaussian exploration noise that decays over time.
    """
    if ALGORITHM == 'maddpg':
        ddpg = MADDPG(avs.n_actions, avs.n_features, 1, 'maddpg model',
                      RETRAIN)
    else:
        # Fall back to plain DDPG for any other value.  (The original code
        # had an explicit 'ddpg' branch and a default branch that were
        # byte-identical; they are merged here.)
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)
    t1 = time.time()
    rewards1 = 0          # running reward sums, reset every 100 episodes
    rewards2 = 0
    var = VAR             # stddev of exploration noise; decayed below
    collision = 0         # collisions in the current 100-episode window
    avgreward1 = []       # per-100-episode average rewards (agent 1)
    avgreward2 = []       # per-100-episode average rewards (agent 2)
    collision_percentage = []
    for i in range(MAX_EPISODES):
        s1, s2 = avs.reset()
        ep_reward1 = 0
        ep_reward2 = 0
        # Periodically plot training curves once past the imitation phase.
        if i % 100000 == 0 and i > IMITATION_EPISODE:
            plot(avgreward1, avgreward2, collision_percentage, i)
        for j in range(MAX_EP_STEPS):
            if RENDER:
                avs.render()

            # Action selection: imitation policy during warm-up and on every
            # 4th episode afterwards; otherwise learned policy + exploration
            # noise clipped to the valid action range [-1, 1].
            if i < IMITATION_EPISODE or i % 4 == 0:
                a1 = imitation(avs.agent1, avs.agent2, avs.target1)
                a2 = imitation(avs.agent2, avs.agent1, avs.target2)
            else:
                a1 = ddpg.choose_action(s1)
                a1 = [
                    np.clip(np.random.normal(a1[0], var), -1, 1),
                    np.clip(np.random.normal(a1[1], var), -1, 1)
                ]
                a2 = ddpg.choose_action(s2)
                a2 = [
                    np.clip(np.random.normal(a2[0], var), -1, 1),
                    np.clip(np.random.normal(a2[1], var), -1, 1)
                ]

            if DEBUG:
                time.sleep(0.1)
            s_1, r1, s_2, r2, done, info = avs.step(a1, a2)
            # DDPG stores per-agent transitions; MADDPG stores joint
            # transitions (both observations/actions) from each agent's view.
            if ALGORITHM == 'ddpg':
                ddpg.store_transition(s1, a1, r1, s_1)
                ddpg.store_transition(s2, a2, r2, s_2)
            else:
                ddpg.store_transition(s1, s2, a1, a2, r1, s_1, s_2)
                ddpg.store_transition(s2, s1, a2, a1, r2, s_2, s_1)

            s1 = s_1
            s2 = s_2
            ep_reward1 += r1
            ep_reward2 += r2

            if j == MAX_EP_STEPS - 1 or done:
                print("pt:", ddpg.pointer)
                print('Episode:', i,
                      'Step:', j, ' Reward: %i' % int(ep_reward1),
                      int(ep_reward2), 'Explore: %.2f' % var)

                if i >= IMITATION_EPISODE:
                    rewards1 += ep_reward1
                    rewards2 += ep_reward2
                    # NOTE(review): collision is detected from agent 1's
                    # final-step reward only — confirm agent 2 collisions
                    # always produce r1 < -100 as well.
                    if r1 < -100:
                        collision += 1
                    if (i + 1) % 100 == 0:
                        avgreward1.append(rewards1 / 100)
                        avgreward2.append(rewards2 / 100)
                        collision_percentage.append(collision)
                        rewards1 = 0
                        rewards2 = 0
                        collision = 0
                break
        # Two gradient steps per episode once the replay buffer is full;
        # exploration noise decays only after the imitation phase.
        if ddpg.pointer > MEMORY_CAPACITY:
            ddpg.learn()
            ddpg.learn()
            if var > MIN_VAR and i > IMITATION_EPISODE:
                var *= DECAY  # decay the action randomness
        # Checkpoint only on non-imitation episodes where both agents
        # scored well.
        if i % 4 != 0 and ep_reward1 > 100 and ep_reward2 > 100 and i > IMITATION_EPISODE:
            ddpg.save(i)
    print('Running time: ', time.time() - t1)
Example #2
0
File: main.py  Project: fb1n15/maddpg
    # Fragment of a MADDPG training/evaluation loop (enclosing function's
    # header is outside this chunk).  Set evaluate=True to load a saved
    # checkpoint and render instead of training.
    evaluate = False
    best_score = 0

    if evaluate:
        maddpg_agents.load_checkpoint()

    for i in range(N_GAMES):
        obs = env.reset()  # list of per-agent observations
        score = 0
        done = [False] * n_agents
        episode_step = 0
        # Episode ends as soon as ANY agent reports done (or step cap below).
        while not any(done):
            if evaluate:
                env.render()
                #time.sleep(0.1) # to slow down the action for the video
            actions = maddpg_agents.choose_action(obs)
            obs_, reward, done, info = env.step(actions)

            # Flatten the per-agent observations into a single global state
            # vector for the centralized critics.
            state = obs_list_to_state_vector(obs)
            state_ = obs_list_to_state_vector(obs_)

            # Force-terminate the episode at the step cap.
            if episode_step >= MAX_STEPS:
                done = [True] * n_agents

            # NOTE(review): the transition is stored even when forced done —
            # confirm the learner treats time-limit termination correctly.
            memory.store_transition(obs, state, actions, reward, obs_, state_,
                                    done)

            # Learn every 100 environment steps when training.
            if total_steps % 100 == 0 and not evaluate:
                maddpg_agents.learn(memory)

            obs = obs_