Example #1
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_theta_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_mu_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_sigma_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_dt_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_x0_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(args.episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0
        jsonFile = []
        coords = []
        fullyBreak = False

        while steps < 300:
            steps += 1

            # render
            if args.render:
                env.render()
                time.sleep(0.05)

            if args.dump_file:
                # NOT OURS...
                frame = env.dump_file()
                coords.append(frame)
                print("Working...\n")

            # act
            actions = []
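            # each agent acts on its own observation; Ornstein-Uhlenbeck noise is
            # added for exploration and the result is clipped to [-2, 2]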
            for i in range(env.n):
                action = np.clip(
                    actors[i].choose_action(states[i]) + actors_noise[i](), -2,
                    2)
                actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions)

            # learn
            if not args.testing:
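                # sample replay-buffer indices: every stored transition if fewer
                # than batch_size are available, otherwise batch_size distinct indices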
                size = memories[0].pointer
                batch = random.sample(
                    range(size), size) if size < batch_size else random.sample(
                        range(size), batch_size)

                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 500

                    # note: this variant stores the full joint state and action
                    # lists of all agents, with agent i's reward and done flag
                    memories[i].remember(states, actions, rewards[i],
                                         states_next, done[i])

                    if memories[i].pointer > batch_size * 10:
                        s, a, r, sn, _ = memories[i].sample(batch, env.n)
                        r = np.reshape(r, (batch_size, 1))
                        loss = critics[i].learn(s, a, r, sn)
                        actors[i].learn(actors, s)
                        episode_losses[i] += loss
                    else:
                        episode_losses[i] = -1

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistic.extend([actors_noise[i].theta for i in range(env.n)])
                statistic.extend([actors_noise[i].mu for i in range(env.n)])
                statistic.extend([actors_noise[i].sigma for i in range(env.n)])
                statistic.extend([actors_noise[i].dt for i in range(env.n)])
                statistic.extend([actors_noise[i].x0 for i in range(env.n)])
                statistics.add_statistics(statistic)

                # NOT OURS...

                if args.dump_file:
                    fileNum = 1
                    print("Creating...\n")
                    with open("results/coords.txt", "w+") as f:
                        f.write(str(len(coords[0]) / 2))
                        f.write("\n")
                        for fr in coords:
                            print("Writing...\n")
                            f.write(" ".join(str(i) for i in fr))
                            f.write("\n")
                    fullyBreak = True
                    break

                #fileNum += 1
                coords = []

                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        # checkpoint: dump statistics to CSV, save the model weights, and delete
        # the previous CSV so only the most recent dump is kept
        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            if not os.path.exists(weights_filename_prefix):
                os.makedirs(weights_filename_prefix)
            save_path = saver.save(session,
                                   os.path.join(weights_filename_prefix,
                                                "models"),
                                   global_step=episode)
            print("saving model to {}".format(save_path))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))
        if fullyBreak:
            break

    return statistics
Example #2
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_theta_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_mu_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_sigma_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_dt_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_x0_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(args.episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1

            # render
            if args.render:
                env.render()

            # act
            actions = []
            for i in range(env.n):
                action = np.clip(
                    actors[i].choose_action(states[i]) + actors_noise[i](), -2,
                    2)
                actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions)

            # learn
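            # independent learners: each agent stores only its own observation,
            # action and next observation, and updates its own actor and critic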
            if not args.testing:
                size = memories[0].pointer
                batch = random.sample(
                    range(size), size) if size < batch_size else random.sample(
                        range(size), batch_size)

                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 50

                    memories[i].remember(states[i], actions[i], rewards[i],
                                         states_next[i], done[i])

                    if memories[i].pointer > batch_size * 10:
                        s, a, r, sn, _ = memories[i].sample(batch)
                        r = np.reshape(r, (batch_size, 1))
                        loss = critics[i].learn(s, a, r, sn)
                        actors[i].learn(s)
                        episode_losses[i] += loss
                    else:
                        episode_losses[i] = -1

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistic.extend([actors_noise[i].theta for i in range(env.n)])
                statistic.extend([actors_noise[i].mu for i in range(env.n)])
                statistic.extend([actors_noise[i].sigma for i in range(env.n)])
                statistic.extend([actors_noise[i].dt for i in range(env.n)])
                statistic.extend([actors_noise[i].x0 for i in range(env.n)])
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            if not os.path.exists(weights_filename_prefix):
                os.makedirs(weights_filename_prefix)
            save_path = saver.save(session,
                                   os.path.join(weights_filename_prefix,
                                                "models"),
                                   global_step=episode)
            print("saving model to {}".format(save_path))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics
Example #3
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["eps_greedy_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(args.episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1

            # render
            if args.render:
                env.render()
                time.sleep(0.1)

            # act
            actions = []
            actions_onehot = []
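            # discrete action index -> one-hot action vector scaled by agent speed
            # (adversaries use 0.9, other agents 1.0 in this example)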
            for i in range(env.n):
                action = dqns[i].choose_action(states[i])
                speed = 0.9 if env.agents[i].adversary else 1

                onehot_action = np.zeros(n_actions[i])
                onehot_action[action] = speed
                actions_onehot.append(onehot_action)
                actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions_onehot)

            # learn
            if not args.testing:
                size = memories[0].pointer
                batch = random.sample(
                    range(size), size) if size < batch_size else random.sample(
                        range(size), batch_size)

                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 50

                    memories[i].remember(states[i], actions[i], rewards[i],
                                         states_next[i], done[i])

                    if memories[i].pointer > batch_size * 10:
                        history = dqns[i].learn(*memories[i].sample(batch))
                        episode_losses[i] += history.history["loss"][0]
                    else:
                        episode_losses[i] = -1

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend([dqns[i].eps_greedy for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            general_utilities.save_dqn_weights(
                dqns, "{}_{}_".format(weights_filename_prefix, episode))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics
Example #4
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.append("done")
    statistics_header.append("reward")
    statistics_header.extend(
        ["loss_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["eps_greedy_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["Agent Energy Left_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["Task Energy Left_{}".format(i) for i in range(env.num_agents)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(args.episodes):
        states = env.reset()
        # episode_losses = np.zeros(env.n)
        # episode_rewards = np.zeros(env.n)
        # collision_count = np.zeros(env.n)
        episode_losses = np.zeros(env.num_agents)
        episode_rewards = 0

        steps = 0

        all_states = [states]
        while steps <= 600:
            steps += 1

            # render
            # if args.render:
            #     env._render()

            # act
            actions = []
            # choose one action per agent (env.num_agents agents in total)
            for i in range(env.num_agents):
                action = dqns[i].choose_action(states)
                actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions)
            all_states.append(states_next)
            # learn
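            # all agents observe the full environment state and share one scalar
            # reward; each agent keeps its own replay memory and DQN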
            if not args.testing:
                size = memories[0].pointer
                batch = random.sample(
                    range(size), size) if size < batch_size else random.sample(
                        range(size), batch_size)

                for i in range(env.num_agents):
                    memories[i].remember(states, actions[i], rewards,
                                         states_next, done)

                    if memories[i].pointer > batch_size * 10:
                        history = dqns[i].learn(*memories[i].sample(batch))
                        episode_losses[i] += history.history["loss"][0]
                    else:
                        episode_losses[i] = -1

            states = states_next
            episode_rewards += rewards
            # reset states if done
            if done or steps >= 600:
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.append(done)
                statistic.append(episode_rewards)
                statistic.extend(
                    [episode_losses[i] for i in range(env.num_agents)])
                statistic.extend(
                    [dqns[i].eps_greedy for i in range(env.num_agents)])
                statistic.extend([env.B_k[i] for i in range(env.num_agents)])
                statistic.extend([env.T_i[i] for i in range(env.num_agents)])
                statistics.add_statistics(statistic)
                if episode % 1 == 0:
                    print(statistics.summarize_last())

                if done:
                    with open('/save/states/episode{}_states.txt'.format(
                            episode),
                              mode='w') as myfile:
                        for each in all_states:
                            myfile.write(str(each))
                            myfile.write('\n')
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            general_utilities.save_dqn_weights(
                dqns, "{}_{}_".format(weights_filename_prefix, episode))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics
Example #5
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["eps_greedy_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(args.episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1

            # render
            if args.render:
                env.render()

            # act
            actions = np.zeros(env.n)
            actions_onehot = []
            action = dqn.choose_action(states[0])
            speed = 0.9

            # distribute the joint action to the two players: joint action a in
            # [0, 24] decodes to actions[0] = a // 5 and actions[1] = a % 5
            actions[0] = action // 5
            actions[1] = action % 5

            actions = actions.astype(int)
            for i in range(env.n):
                onehot_action = np.zeros(n_actions)
                onehot_action[actions[i]] = speed
                actions_onehot.append(onehot_action)

            # step
            states_next, rewards, done, info = env.step(actions_onehot)

            reward_cal = rewards[0] + rewards[1]

            # learn
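            # a single shared DQN controls both players: the stored transition uses
            # player 0's observation, the summed reward, and the AND of both done flags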
            if not args.testing:
                size = memories.pointer
                batch = random.sample(
                    range(size), size) if size < batch_size else random.sample(
                        range(size), batch_size)

                done_cal = np.logical_and(done[0], done[1])
                memories.remember(states[0], action, reward_cal,
                                  states_next[0], done_cal)

                if memories.pointer > batch_size * 10:
                    history = dqn.learn(*memories.sample(batch))
                    episode_losses[0] += history.history["loss"][0]
                else:
                    episode_losses[0] = -1

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                new_alg_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend([dqn.eps_greedy for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            # general_utilities.save_dqn_weights(
            #     dqn, "{}_{}_".format(weights_filename_prefix, episode))
            # if episode >= checkpoint_interval:
            #     os.remove("{}_{}.csv".format(csv_filename_prefix,
            #                                  episode - checkpoint_interval))

    return statistics
Example #6
        DQN(n_actions[i], state_sizes[i], eps_greedy=epsilon_greedy[i])
        for i in range(env.n)
    ]

    general_utilities.load_dqn_weights_if_exist(
        dqns, args.experiment_prefix + args.weights_filename_prefix)

    start_time = time.time()

    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["eps_greedy_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(args.episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1

            # render
            if args.render:
                env.render()

            # act