def main():
    # ============= Initialize variables and objects ===========#
    max_mean_reward = MAIN_PARAMS["MAX_MEAN_REWARD"]
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]

    all_rewards = []
    all_mean_rewards = []
    t_mean = []

    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            states, episode_reward = environment.reset()  # Reset level in tank
            for t in range(MAIN_PARAMS["MAX_TIME"]):
                actions = agent.act(states[-1])  # get action choice from state
                z = agent.get_z(actions)

                terminated, next_state = environment.get_next_state(
                    z, states[-1], t
                )  # Calculate next state with action
                rewards = sum_rewards(
                    next_state, terminated, get_reward
                )  # get reward from transition to next state

                # Store data
                rewards = sum_rewards(next_state, terminated, get_reward)
                rewards.append(np.sum(rewards))
                episode_reward.append(rewards)

                states.append(next_state)
                agent.remember(states, rewards, terminated, t)

                if environment.show_rendering:
                    environment.render(z)
                if True in terminated:
                    break

            episode_reward = np.array(episode_reward)
            episode_total_reward = []
            t_mean.append(t)
            for i in range(environment.n_tanks + 1):
                episode_total_reward.append(sum(episode_reward[:, i]))
            all_rewards.append(episode_total_reward)

            # Print mean reward and save better models
            if e % mean_episode == 0 and e != 0:
                mean_reward = np.array(all_rewards[-mean_episode:])
                mean_r = []
                t_mean = int(np.mean(t_mean))
                for i in range(environment.n_tanks + 1):
                    mean_r.append(np.mean(mean_reward[:, i]))
                all_mean_rewards.append(mean_r)
                print(
                    f"Mean {mean_episode} of {e}/{episodes} episodes ### timestep {t_mean+1} ### tot reward: {mean_r[-1]}  \
                    r1: {mean_r[0]} ex1: {round(agent.epsilon[0],2)} r2: {mean_r[1]} ex2: {round(agent.epsilon[1],2)}"
                )
                t_mean = []
                if mean_r[-1] >= max_mean_reward:
                    agent.save_trained_model()
                    max_mean_reward = mean_r[-1]
                # Train model
            if agent.is_ready():
                agent.Qreplay(e)

            if not environment.running:
                break
            # if agent.epsilon <= agent.epsilon_min:
            #     break
    except KeyboardInterrupt:
        pass
    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(e + 1))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))

    all_mean_rewards = np.array(all_mean_rewards)
    labels = ["Tank 1", "Tank 2"]

    for i in range(environment.n_tanks):
        plt.plot(all_mean_rewards[:, i], label=labels[i])
    plt.legend()
    plt.show()

    plt.plot(all_mean_rewards[:, -1], label="Total rewards")
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.legend()
    plt.show()
示例#2
0
def main():
    # ============= Initialize variables and objects ===========#
    max_mean_reward = 50 * len(TANK_PARAMS)
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]
    all_rewards = []
    all_mean_rewards = []

    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            states, episode_reward = environment.reset()  # Reset level in tank
            for t in range(MAIN_PARAMS["MAX_TIME"]):
                actions = agent.act(states[-1])  # get action choice from state
                z = agent.get_z(actions)

                terminated, next_state = environment.get_next_state(
                    z, states[-1], t)  # Calculate next state with action
                rewards = sum_rewards(
                    next_state, terminated,
                    get_reward)  # get reward from transition to next state

                # Store data
                episode_reward.append(np.sum(rewards))

                states.append(next_state)
                agent.remember(states, rewards, terminated, t)

                if environment.show_rendering:
                    environment.render(z)
                if True in terminated:
                    break

            all_rewards.append(np.sum(np.array(episode_reward)))

            # Print mean reward and save better models
            if e % mean_episode == 0 and e != 0:
                mean_reward = np.mean(all_rewards[-mean_episode:])
                all_mean_rewards.append(mean_reward)
                print("{} of {}/{} episodes\
                     reward: {} exp_1: {} exp_2: {}".format(
                    mean_episode,
                    e,
                    episodes,
                    round(mean_reward, 2),
                    round(agent.epsilon[0], 2),
                    round(agent.epsilon[1], 2),
                ))
                if agent.save_model_bool:
                    max_mean_reward = agent.save_model(mean_reward,
                                                       max_mean_reward)
                # Train model
            if agent.is_ready():
                agent.Qreplay(e)

            if keyboard.is_pressed("ctrl+x"):
                break

            if environment.live_plot:
                environment.plot(all_rewards, agent.epsilon)
            if not environment.running:
                break
            # if agent.epsilon <= agent.epsilon_min:
            #     break
    except KeyboardInterrupt:
        pass
    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(e + 1))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))
    plt.ioff()
    plt.clf()
    x_range = np.arange(0, e - e % mean_episode, mean_episode)
    plt.plot(x_range, all_mean_rewards)
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.show()
示例#3
0
def main():
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    z = []
    h = []
    d = []
    # ================= Running episodes =================#

    state, episode_reward = environment.reset()
    h_ = np.array([state[0][i][0] for i in range(6)])
    h.append(h_)
    for t in range(MAIN_PARAMS["MAX_TIME"]):
        action = agent.act(state[-1])  # get action choice from state
        z_ = agent.action_choices[
            action]  # convert action choice into valve position
        z.append(np.array(z_))
        terminated, next_state = environment.get_next_state(
            z[-1], state[-1], t)  # Calculate next state with action
        reward = sum_rewards(
            next_state, terminated,
            get_reward)  # get reward from transition to next state
        # Store data
        episode_reward.append(reward)

        state.append(next_state)
        h_ = []
        d_ = []
        for i in range(agent.n_tanks):
            d_.append(environment.tanks[i].dist.flow[t - 1] +
                      environment.q_inn[i])
            h_.append(np.array(next_state[i][0]))
        d.append(d_)
        h.append(h_)
        if environment.show_rendering:
            environment.render(z[-1])
        if True in terminated:
            break

        if not environment.running:
            break
    colors = [
        "peru",
        "firebrick",
        "darkslategray",
        "darkviolet",
        "mediumseagreen",
        "darkcyan",
    ]
    print(f"reward: {np.sum(episode_reward)}")
    h = np.array(h) * 10
    d = np.array(d)
    z = np.array(z)
    for i in range(2):
        _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
        ax1.plot(
            h[1:-1, 0 + i * 3],
            color=colors[0 + i * 3],
            label="Tank {}".format(str(1 + i * 3)),
        )
        ax1.plot(
            h[1:-1, 1 + i * 3],
            color=colors[1 + i * 3],
            label="Tank {}".format(str(2 + i * 3)),
        )
        ax1.plot(
            h[1:-1, 2 + i * 3],
            color=colors[2 + i * 3],
            label="Tank {}".format(str(3 + i * 3)),
        )
        ax1.set_ylabel("Level")
        ax1.legend(loc="upper left")
        ax1.set_ylim(2.5, 7.5)

        ax2.plot(
            z[1:, 0 + i * 3],
            color=colors[0 + i * 3],
            label="Tank {}".format(str(1 + i * 3)),
        )
        ax2.plot(
            z[1:, 1 + i * 3],
            color=colors[1 + i * 3],
            label="Tank {}".format(str(2 + i * 3)),
        )
        ax2.plot(
            z[1:, 2 + i * 3],
            color=colors[2 + i * 3],
            label="Tank {}".format(str(3 + i * 3)),
        )
        ax2.set_ylabel("Valve")
        ax2.legend(loc="upper left")
        ax2.set_ylim(-0.01, 1.01)

        ax3.plot(
            d[1:-1, 0 + i * 3],
            color=colors[0 + i * 3],
            label="Tank {}".format(str(1 + i * 3)),
        )
        ax3.plot(
            d[1:-1, 1 + i * 3],
            color=colors[1 + i * 3],
            label="Tank {}".format(str(2 + i * 3)),
        )
        ax3.plot(
            d[1:-1, 2 + i * 3],
            color=colors[2 + i * 3],
            label="Tank {}".format(str(3 + i * 3)),
        )
        ax3.set_ylabel("Disturbance")
        ax3.legend(loc="upper left")
        ax3.set_ylim(-0.01, 4)

        plt.tight_layout()
        plt.xlabel("Time")
        plt.show()
def main(tau_c_tuning=30, tuning_number=None, plot=True):
    environment = Environment(TANK_PARAMS_LIST, TANK_DIST_LIST, MAIN_PARAMS)

    controllers = []
    for i, AGENT_PARAMS in enumerate(AGENT_PARAMS_LIST):
        controller = P_controller(environment, AGENT_PARAMS, i)
        controllers.append(controller)
    if tuning_number is not None:
        controllers[tuning_number].evalv_kc(tau_c_tuning)

    init_h = []
    for tank in environment.tanks:
        init_h.append(tank.level)

    init_z = []
    for AGENT_PARAMS in AGENT_PARAMS_LIST:
        init_z.append(AGENT_PARAMS["INIT_POSITION"])

    init_d = []
    for TANK_DIST in TANK_DIST_LIST:
        init_d.append(TANK_DIST["nom_flow"])

    h = [init_h]
    z = [init_z]
    d = [init_d]

    max_time = MAIN_PARAMS["MAX_TIME"]
    episode_reward = []
    for t in range(max_time):
        new_z = []
        new_h = []
        q_out = [0]
        for i, controller in enumerate(controllers):
            new_z_ = controller.get_z(h[-1][i])
            new_z.append(new_z_)
            new_h_, q_out_ = environment.get_next_state(
                z[-1][i], i, t, q_out[i]
            )
            new_h.append(new_h_)
            q_out.append(q_out_)
        z.append(new_z)
        h.append(new_h)

        new_d = []
        for i, TANK_DIST in enumerate(TANK_DIST_LIST):
            if TANK_DIST["add"]:
                new_d_ = environment.tanks[i].dist.flow[t]
                new_d.append(new_d_ + q_out[i])
            else:
                new_d.append(q_out[i])
        d.append(new_d)

        reward = sum_rewards(
            new_h, [False], get_reward
        )  # get reward from transition to next state
        episode_reward.append(reward)
        if environment.show_rendering:
            environment.render(z[-1])
    if plot:
        print(f"Reward: {np.sum(episode_reward)}")
        _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
        h = np.array(h)
        d = np.array(d)
        z = np.array(z)
        ax1.plot(h[:-1, 0], color="peru", label="Tank 1")
        ax1.set_ylabel("Level")
        ax1.legend(loc="upper left")
        ax1.set_ylim(2.5, 7.5)

        ax2.plot(z[1:, 0], color="peru", label="Tank 1")
        ax2.legend(loc="upper left")
        ax2.set_ylabel("Valve")
        ax2.set_ylim(-0.01, 1.01)

        ax3.plot(d[2:, 0], color="peru", label="Tank 1")
        ax3.set_ylabel("Disturbance")
        ax3.legend(loc="upper left")
        ax3.set_ylim(0, 4)

        plt.tight_layout()
        plt.xlabel("Time")
        plt.show()
    episode_reward = np.array(episode_reward)
    return np.sum(episode_reward[:, tuning_number])
def main():
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    z = []
    h = []
    d = []
    # ================= Running episodes =================#

    state, episode_reward = environment.reset()
    h_ = np.array([state[0][0][0]])
    h.append(h_)
    for t in range(MAIN_PARAMS["MAX_TIME"]):
        z_ = agent.act(state[-1])  # get action choice from state
        z.append(np.array(z_))
        terminated, next_state = environment.get_next_state(
            z[-1], state[-1], t
        )  # Calculate next state with action
        reward = sum_rewards(next_state, terminated, get_reward)  # get reward from transition to next state
        # Store data
        episode_reward.append(reward)

        state.append(next_state)
        h_ = []
        d_ = []
        for i in range(agent.n_tanks):
            d_.append(environment.tanks[i].dist.flow[t] + environment.q_inn[i])
            h_.append(np.array(next_state[i][0]))
        d.append(d_)
        h.append(h_)
        if environment.show_rendering:
            environment.render(z[-1])
        if True in terminated:
            break

        if not environment.running:
            break
    print(np.sum(episode_reward))

    _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
    d = np.array(d)
    h = np.array(h[:-1])
    z = np.array(z)
    h *= 10

    ax1.plot(h[:-1, 0], color="peru", label="Tank 1")
    ax1.set_ylabel("Level")
    ax1.legend(loc="upper right")
    ax1.set_ylim(2.5, 7.5)

    ax2.plot(z[1:, 0], color="peru", label="Tank 1")
    ax2.legend(loc="upper right")
    ax2.set_ylabel("Valve")
    ax2.set_ylim(0, 1.01)

    ax3.plot(d[:, 0], color="peru", label="Tank 1")
    ax3.set_ylim(0, 4)
    ax3.set_ylabel("Disturbance")
    ax3.legend(loc="upper right")

    # plt.legend([l1, l2, l3], ["Tank height", "Valve position", "Disturbance"])
    plt.tight_layout()
    plt.xlabel("Time")
    plt.show()
def main():
    # ============= Initialize variables and objects ===========#
    max_mean_reward = MAIN_PARAMS["MAX_MEAN_REWARD"]
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]

    all_rewards = []
    all_mean_rewards = []
    t_mean = []

    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            states, episode_reward = environment.reset()  # Reset level in tank

            for t in range(MAIN_PARAMS["MAX_TIME"]):
                z = agent.act(states[-1])  # get action choice from state

                terminated, next_state = environment.get_next_state(
                    z, states[-1], t)
                states.append(next_state)

                rewards = sum_rewards(next_state, terminated, get_reward)
                rewards.append(np.sum(rewards))
                episode_reward.append(rewards)

                agent.remember(states, rewards, terminated, t)

                if environment.show_rendering:
                    environment.render(z)
                if True in terminated:
                    break

            # Collect summary of episode
            episode_reward = np.array(episode_reward)
            episode_total_reward = []
            t_mean.append(t)
            for i in range(environment.n_tanks + 1):
                episode_total_reward.append(sum(episode_reward[:, i]))
            all_rewards.append(episode_total_reward)

            # Print mean reward and save better models
            if e % mean_episode == 0 and e != 0:
                mean_reward = np.array(all_rewards[-mean_episode:])
                mean_r = []
                t_mean = int(np.mean(t_mean))
                for i in range(environment.n_tanks + 1):
                    mean_r.append(np.mean(mean_reward[:, i]))
                all_mean_rewards.append(mean_r)
                print(
                    f"Mean {mean_episode} of {e}/{episodes} episodes ### timestep {t_mean+1} ### tot reward: {mean_r[-1]}"
                )
                t_mean = []
                if mean_r[-1] >= max_mean_reward:
                    agent.save_trained_model()
                    max_mean_reward = mean_r[-1]
            agent.PolicyGradientReplay(e)

            if not environment.running:
                break

    except KeyboardInterrupt:
        pass
    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(e + 1))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))

    all_mean_rewards = np.array(all_mean_rewards)

    plt.plot(all_mean_rewards[:, -1], label="Total rewards")
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.legend()
    plt.show()