Example #1
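The snippet below assumes a handful of imports plus two project-local classes; a minimal sketch of the likely setup (module paths are assumptions inferred from the names used in the code):

import time
from time import gmtime, strftime
from torch.utils.tensorboard import SummaryWriter  # or: from tensorboardX import SummaryWriter
# EnvWrapper and MADDPG are project-local classes; their import paths are not shown in the snippet.
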
def main(arglist):
    ACTORS = 1
    env = EnvWrapper(arglist.scenario, ACTORS, arglist.saved_episode)
    if arglist.eval:
        current_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        writer = SummaryWriter(log_dir='./logs/' + current_time + '-' +
                               arglist.scenario)
    maddpg_wrapper = MADDPG(ACTORS)

    maddpg_wrapper.create_agents(env, arglist)

    j = 0
    for episode in range(arglist.max_episode):
        obs = env.reset()
        terminal = False
        maddpg_wrapper.reset()
        total_reward = [0 for i in maddpg_wrapper.workers]
        step = 0

        while not terminal and step < 25:
            if not arglist.eval:
                env.render(0)
                time.sleep(0.03)

            actions = maddpg_wrapper.take_actions(obs)
            obs2, reward, done = env.step(actions)

            for actor in range(ACTORS):
                for i, rew in enumerate(reward[actor]):
                    total_reward[i] += rew

            j += ACTORS
            # terminal = all(done)  # left disabled in the source; episodes are capped at 25 steps by the while-loop condition instead
            if arglist.eval:
                maddpg_wrapper.update(j, ACTORS, actions, reward, obs, obs2,
                                      done)

            obs = obs2
            step += 1

        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            maddpg_wrapper.save(episode)

        if arglist.eval:
            for worker, ep_ave_max in zip(maddpg_wrapper.workers,
                                          maddpg_wrapper.ep_ave_max_q_value):
                print(worker.pos, ' => average_max_q: ',
                      ep_ave_max / float(step), ' Reward: ',
                      total_reward[worker.pos], ' Episode: ', episode)
                writer.add_scalar(
                    str(worker.pos) + '/Average_max_q',
                    ep_ave_max / float(step), episode)
                writer.add_scalar(
                    str(worker.pos) + '/Reward Agent',
                    total_reward[worker.pos], episode)

    env.close()
Example #2
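The snippet below relies on names set up earlier in its script; a minimal sketch of the assumed setup (the Unity ML-Agents handles and the MADDPG import path are assumptions inferred from usage):

import numpy as np
from collections import deque
# `env`, `brain_name` and `MADDPG` are expected to be defined earlier in the script,
# e.g. a Unity ML-Agents environment (hence env_info.vector_observations / env_info.local_done)
# and the project-local multi-agent DDPG wrapper.
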
def trainFunction(state_size, action_size, n_episodes=4000, num_agents=2):
    magent = MADDPG(action_size=action_size,
                    noise_start=1.0,
                    seed=2,
                    gamma=0.99,
                    t_stop_noise=30000)
    scores = []
    scores_deque = deque(maxlen=100)
    scores_avg = []

    for i_episode in range(1, n_episodes + 1):
        rewards = []
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        # pass update=True only on the first step of odd-numbered episodes
        # (it is reset to False right after the first call to act() below)
        if i_episode % 2:
            update = True
        # loop over steps
        while True:
            # select an action
            joint_actions = magent.act(states, update)
            update = False
            # take action in environment and set parameters to new values
            env_info = env.step(joint_actions)[brain_name]
            next_states = env_info.vector_observations
            rewards_v = env_info.rewards
            done_v = env_info.local_done
            # update and train agent with returned information
            magent.step(states, joint_actions, rewards_v, next_states, done_v)
            states = next_states
            rewards.append(rewards_v)
            if any(done_v):
                break

        # calculate episode reward as maximum of individually collected rewards of agents
        episode_reward = np.max(np.sum(np.array(rewards), axis=0))

        scores.append(
            episode_reward)  # save most recent score to overall score array
        scores_deque.append(
            episode_reward
        )  # save most recent score to running window of 100 last scores
        current_avg_score = np.mean(scores_deque)
        scores_avg.append(
            current_avg_score
        )  # save average of last 100 scores to average score array

        print('\rEpisode {}\tAverage Score: {:.3f}'.format(
            i_episode, current_avg_score),
              end="")

        # log average score every 200 episodes
        if i_episode % 200 == 0:
            print('\rEpisode {}\tAverage Score: {:.3f}'.format(
                i_episode, current_avg_score))

        # break and report success if environment is solved
        if np.mean(scores_deque) >= .5 and i_episode % 200 == 0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'
                .format(i_episode, np.mean(scores_deque)))
            magent.save()
            break
Example #3
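The snippet below reads a number of module-level constants and helpers defined elsewhere in its script; the summary in the comments is inferred from how each name is used in the code, not taken from the source:

import time
import numpy as np
# Assumed module-level setup (not shown in the snippet):
#   ALGORITHM                   -- 'maddpg' or 'ddpg', selects the learner
#   RETRAIN, RENDER, DEBUG      -- boolean flags
#   VAR, MIN_VAR, DECAY         -- exploration-noise scale, its floor, and its decay factor
#   MAX_EPISODES, MAX_EP_STEPS  -- episode and per-episode step limits
#   MEMORY_CAPACITY             -- replay-buffer size that must fill before learning starts
#   IMITATION_EPISODE           -- number of warm-up episodes driven by the imitation policy
#   avs                         -- the two-agent environment exposing reset()/step()/render()
#   imitation(), plot()         -- helper functions defined elsewhere in the project
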
def update():
    if ALGORITHM == 'maddpg':
        ddpg = MADDPG(avs.n_actions, avs.n_features, 1, 'maddpg model',
                      RETRAIN)
    else:  # 'ddpg' and any other value fall back to single-agent DDPG
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)
    t1 = time.time()
    rewards1 = 0
    rewards2 = 0
    var = VAR
    collision = 0
    avgreward1 = []
    avgreward2 = []
    collision_percentage = []
    for i in range(MAX_EPISODES):
        s1, s2 = avs.reset()
        ep_reward1 = 0
        ep_reward2 = 0
        if i % 100000 == 0 and i > IMITATION_EPISODE:
            plot(avgreward1, avgreward2, collision_percentage, i)
        for j in range(MAX_EP_STEPS):
            if RENDER:
                avs.render()

            # Add exploration noise
            if i < IMITATION_EPISODE or i % 4 == 0:
                a1 = imitation(avs.agent1, avs.agent2, avs.target1)
                a2 = imitation(avs.agent2, avs.agent1, avs.target2)
            else:
                # add randomness to action selection for exploration
                a1 = ddpg.choose_action(s1)
                a1 = [
                    np.clip(np.random.normal(a1[0], var), -1, 1),
                    np.clip(np.random.normal(a1[1], var), -1, 1)
                ]
                a2 = ddpg.choose_action(s2)
                a2 = [
                    np.clip(np.random.normal(a2[0], var), -1, 1),
                    np.clip(np.random.normal(a2[1], var), -1, 1)
                ]
                # a2 = imitation(avs.agent2, avs.agent1, avs.target2)

            if DEBUG:
                time.sleep(0.1)
            s_1, r1, s_2, r2, done, info = avs.step(a1, a2)
            if ALGORITHM == 'ddpg':
                ddpg.store_transition(s1, a1, r1, s_1)
                ddpg.store_transition(s2, a2, r2, s_2)
            else:
                ddpg.store_transition(s1, s2, a1, a2, r1, s_1, s_2)
                ddpg.store_transition(s2, s1, a2, a1, r2, s_2, s_1)

            s1 = s_1
            s2 = s_2
            ep_reward1 += r1
            ep_reward2 += r2

            if j == MAX_EP_STEPS - 1 or done:
                print("pt:", ddpg.pointer)
                print('Episode:', i,
                      'Step:', j, ' Reward: %i' % int(ep_reward1),
                      int(ep_reward2), 'Explore: %.2f' % var)

                if i >= IMITATION_EPISODE:
                    rewards1 += ep_reward1
                    rewards2 += ep_reward2
                    if r1 < -100:
                        collision += 1
                    if (i + 1) % 100 == 0:
                        avgreward1.append(rewards1 / 100)
                        avgreward2.append(rewards2 / 100)
                        collision_percentage.append(collision)
                        rewards1 = 0
                        rewards2 = 0
                        collision = 0
                break
        if ddpg.pointer > MEMORY_CAPACITY:
            ddpg.learn()
            ddpg.learn()
            if var > MIN_VAR and i > IMITATION_EPISODE:
                var *= DECAY  # decay the action randomness
        if i % 4 != 0 and ep_reward1 > 100 and ep_reward2 > 100 and i > IMITATION_EPISODE:
            ddpg.save(i)
    print('Running time: ', time.time() - t1)
Example #4
]
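# This snippet starts mid-script: the stray `]` above closes a truncated list.
# The comments below sketch what the surrounding script is assumed to provide,
# inferred from how the names are used:
#   import os; from tqdm import trange
#   root         -- output directory under which all model/run folders are created
#   args         -- list of keyword-argument dicts, one per model configuration
#   model_names  -- names derived from those dicts (cleaned up below for use as directory names)
#   control_args -- run settings: 'repeat', 'train', 'save_interval', 'n_test', optional 'load'
#   env          -- the multi-agent environment passed to MADDPG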
# handle invalid dir char
for i in range(len(model_names)):
    model_names[i] = model_names[i].replace('[', '').replace(']', '').replace(
        ' ', '').replace(',', '_')
# handle standard arg, i.e., {}
model_names = ['standard' if name == '' else name for name in model_names]

# model loop
for i in trange(len(args), desc='model', leave=True):
    model_dir = '{}/{}'.format(root, model_names[i])
    os.mkdir(model_dir)
    # log cmd
    with open('{}/cmd_config.txt'.format(model_dir), 'w') as f:
        for k, v in control_args.items():
            f.write(str(k) + ': ' + str(v) + '\n')
    arg = args[i]
    # repeat loop
    for n in trange(control_args['repeat'], desc='repeat', leave=True):
        dir = '{}/{}'.format(model_dir, n)
        os.mkdir(dir)
        maddpg = MADDPG(env, **arg)
        if 'load' in control_args:  # dict.has_key() was removed in Python 3
            model_path = control_args['load']
            maddpg.load_actor(model_path)
            maddpg.load_critic(model_path)
        if control_args['train']:
            maddpg.train(dir, control_args['save_interval'])
            maddpg.save(dir)
        maddpg.test(dir, n=control_args['n_test'])