def main(): # ============= Initialize variables and objects ===========# max_mean_reward = MAIN_PARAMS["MAX_MEAN_REWARD"] environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS) agent = Agent(AGENT_PARAMS) mean_episode = MAIN_PARAMS["MEAN_EPISODE"] episodes = MAIN_PARAMS["EPISODES"] all_rewards = [] all_mean_rewards = [] t_mean = [] # ================= Running episodes =================# try: for e in range(episodes): states, episode_reward = environment.reset() # Reset level in tank for t in range(MAIN_PARAMS["MAX_TIME"]): actions = agent.act(states[-1]) # get action choice from state z = agent.get_z(actions) terminated, next_state = environment.get_next_state( z, states[-1], t ) # Calculate next state with action rewards = sum_rewards( next_state, terminated, get_reward ) # get reward from transition to next state # Store data rewards = sum_rewards(next_state, terminated, get_reward) rewards.append(np.sum(rewards)) episode_reward.append(rewards) states.append(next_state) agent.remember(states, rewards, terminated, t) if environment.show_rendering: environment.render(z) if True in terminated: break episode_reward = np.array(episode_reward) episode_total_reward = [] t_mean.append(t) for i in range(environment.n_tanks + 1): episode_total_reward.append(sum(episode_reward[:, i])) all_rewards.append(episode_total_reward) # Print mean reward and save better models if e % mean_episode == 0 and e != 0: mean_reward = np.array(all_rewards[-mean_episode:]) mean_r = [] t_mean = int(np.mean(t_mean)) for i in range(environment.n_tanks + 1): mean_r.append(np.mean(mean_reward[:, i])) all_mean_rewards.append(mean_r) print( f"Mean {mean_episode} of {e}/{episodes} episodes ### timestep {t_mean+1} ### tot reward: {mean_r[-1]} \ r1: {mean_r[0]} ex1: {round(agent.epsilon[0],2)} r2: {mean_r[1]} ex2: {round(agent.epsilon[1],2)}" ) t_mean = [] if mean_r[-1] >= max_mean_reward: agent.save_trained_model() max_mean_reward = mean_r[-1] # Train model if agent.is_ready(): agent.Qreplay(e) if not environment.running: break # if agent.epsilon <= agent.epsilon_min: # break except KeyboardInterrupt: pass print("Memory length: {}".format(len(agent.memory))) print("##### {} EPISODES DONE #####".format(e + 1)) print("Max rewards for all episodes: {}".format(np.max(all_rewards))) all_mean_rewards = np.array(all_mean_rewards) labels = ["Tank 1", "Tank 2"] for i in range(environment.n_tanks): plt.plot(all_mean_rewards[:, i], label=labels[i]) plt.legend() plt.show() plt.plot(all_mean_rewards[:, -1], label="Total rewards") plt.ylabel("Mean rewards of last {} episodes".format(mean_episode)) plt.legend() plt.show()
def main():
    # ============= Initialize variables and objects ===========#
    max_mean_reward = 50 * len(TANK_PARAMS)
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]
    all_rewards = []
    all_mean_rewards = []
    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            states, episode_reward = environment.reset()  # Reset level in tank
            for t in range(MAIN_PARAMS["MAX_TIME"]):
                actions = agent.act(states[-1])  # get action choice from state
                z = agent.get_z(actions)
                terminated, next_state = environment.get_next_state(
                    z, states[-1], t
                )  # Calculate next state with action
                rewards = sum_rewards(
                    next_state, terminated, get_reward
                )  # get reward from transition to next state

                # Store data
                episode_reward.append(np.sum(rewards))
                states.append(next_state)
                agent.remember(states, rewards, terminated, t)

                if environment.show_rendering:
                    environment.render(z)
                if True in terminated:
                    break

            all_rewards.append(np.sum(np.array(episode_reward)))

            # Print mean reward and save better models
            if e % mean_episode == 0 and e != 0:
                mean_reward = np.mean(all_rewards[-mean_episode:])
                all_mean_rewards.append(mean_reward)
                print(
                    "{} of {}/{} episodes reward: {} exp_1: {} exp_2: {}".format(
                        mean_episode,
                        e,
                        episodes,
                        round(mean_reward, 2),
                        round(agent.epsilon[0], 2),
                        round(agent.epsilon[1], 2),
                    )
                )
                if agent.save_model_bool:
                    max_mean_reward = agent.save_model(mean_reward, max_mean_reward)

            # Train model
            if agent.is_ready():
                agent.Qreplay(e)

            if keyboard.is_pressed("ctrl+x"):
                break
            if environment.live_plot:
                environment.plot(all_rewards, agent.epsilon)
            if not environment.running:
                break
            # if agent.epsilon <= agent.epsilon_min:
            #     break
    except KeyboardInterrupt:
        pass

    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(e + 1))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))

    plt.ioff()
    plt.clf()
    x_range = np.arange(0, e - e % mean_episode, mean_episode)
    plt.plot(x_range, all_mean_rewards)
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.show()
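# ---------------------------------------------------------------------------
# Editor's note: MAIN_PARAMS is imported from the project's parameter module
# and is not shown in this section. The training loops above only read the
# keys listed below; the values are illustrative placeholders, not the
# project's actual settings (MAX_MEAN_REWARD is only read by the variants
# that save models against a fixed threshold).
# ---------------------------------------------------------------------------
MAIN_PARAMS_EXAMPLE = {
    "EPISODES": 10000,       # number of training episodes
    "MEAN_EPISODE": 50,      # window for the running-mean reward printout
    "MAX_TIME": 200,         # maximum number of timesteps per episode
    "MAX_MEAN_REWARD": 200,  # mean reward required before saving a model
}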
def main():
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    z = []
    h = []
    d = []
    # ================= Running episodes =================#
    state, episode_reward = environment.reset()
    h_ = np.array([state[0][i][0] for i in range(6)])
    h.append(h_)
    for t in range(MAIN_PARAMS["MAX_TIME"]):
        action = agent.act(state[-1])  # get action choice from state
        z_ = agent.action_choices[
            action
        ]  # convert action choice into valve position
        z.append(np.array(z_))
        terminated, next_state = environment.get_next_state(
            z[-1], state[-1], t
        )  # Calculate next state with action
        reward = sum_rewards(
            next_state, terminated, get_reward
        )  # get reward from transition to next state

        # Store data
        episode_reward.append(reward)
        state.append(next_state)
        h_ = []
        d_ = []
        for i in range(agent.n_tanks):
            d_.append(
                environment.tanks[i].dist.flow[t - 1] + environment.q_inn[i]
            )
            h_.append(np.array(next_state[i][0]))
        d.append(d_)
        h.append(h_)

        if environment.show_rendering:
            environment.render(z[-1])
        if True in terminated:
            break
        if not environment.running:
            break

    colors = [
        "peru",
        "firebrick",
        "darkslategray",
        "darkviolet",
        "mediumseagreen",
        "darkcyan",
    ]
    print(f"reward: {np.sum(episode_reward)}")
    h = np.array(h) * 10
    d = np.array(d)
    z = np.array(z)
    # One figure per group of three tanks: levels, valve positions, disturbances
    for i in range(2):
        _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
        for j in range(3):
            tank = i * 3 + j
            label = "Tank {}".format(tank + 1)
            ax1.plot(h[1:-1, tank], color=colors[tank], label=label)
            ax2.plot(z[1:, tank], color=colors[tank], label=label)
            ax3.plot(d[1:-1, tank], color=colors[tank], label=label)
        ax1.set_ylabel("Level")
        ax1.legend(loc="upper left")
        ax1.set_ylim(2.5, 7.5)
        ax2.set_ylabel("Valve")
        ax2.legend(loc="upper left")
        ax2.set_ylim(-0.01, 1.01)
        ax3.set_ylabel("Disturbance")
        ax3.legend(loc="upper left")
        ax3.set_ylim(-0.01, 4)
        plt.tight_layout()
        plt.xlabel("Time")
        plt.show()
def main(tau_c_tuning=30, tuning_number=None, plot=True):
    environment = Environment(TANK_PARAMS_LIST, TANK_DIST_LIST, MAIN_PARAMS)
    controllers = []
    for i, AGENT_PARAMS in enumerate(AGENT_PARAMS_LIST):
        controller = P_controller(environment, AGENT_PARAMS, i)
        controllers.append(controller)
    if tuning_number is not None:
        controllers[tuning_number].evalv_kc(tau_c_tuning)

    init_h = []
    for tank in environment.tanks:
        init_h.append(tank.level)
    init_z = []
    for AGENT_PARAMS in AGENT_PARAMS_LIST:
        init_z.append(AGENT_PARAMS["INIT_POSITION"])
    init_d = []
    for TANK_DIST in TANK_DIST_LIST:
        init_d.append(TANK_DIST["nom_flow"])

    h = [init_h]
    z = [init_z]
    d = [init_d]
    max_time = MAIN_PARAMS["MAX_TIME"]
    episode_reward = []
    for t in range(max_time):
        new_z = []
        new_h = []
        q_out = [0]
        for i, controller in enumerate(controllers):
            new_z_ = controller.get_z(h[-1][i])
            new_z.append(new_z_)
            new_h_, q_out_ = environment.get_next_state(
                z[-1][i], i, t, q_out[i]
            )
            new_h.append(new_h_)
            q_out.append(q_out_)
        z.append(new_z)
        h.append(new_h)

        new_d = []
        for i, TANK_DIST in enumerate(TANK_DIST_LIST):
            if TANK_DIST["add"]:
                new_d_ = environment.tanks[i].dist.flow[t]
                new_d.append(new_d_ + q_out[i])
            else:
                new_d.append(q_out[i])
        d.append(new_d)

        reward = sum_rewards(
            new_h, [False], get_reward
        )  # get reward from transition to next state
        episode_reward.append(reward)

        if environment.show_rendering:
            environment.render(z[-1])

    if plot:
        print(f"Reward: {np.sum(episode_reward)}")
        _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
        h = np.array(h)
        d = np.array(d)
        z = np.array(z)
        ax1.plot(h[:-1, 0], color="peru", label="Tank 1")
        ax1.set_ylabel("Level")
        ax1.legend(loc="upper left")
        ax1.set_ylim(2.5, 7.5)
        ax2.plot(z[1:, 0], color="peru", label="Tank 1")
        ax2.legend(loc="upper left")
        ax2.set_ylabel("Valve")
        ax2.set_ylim(-0.01, 1.01)
        ax3.plot(d[2:, 0], color="peru", label="Tank 1")
        ax3.set_ylabel("Disturbance")
        ax3.legend(loc="upper left")
        ax3.set_ylim(0, 4)
        plt.tight_layout()
        plt.xlabel("Time")
        plt.show()

    episode_reward = np.array(episode_reward)
    return np.sum(episode_reward[:, tuning_number])
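# ---------------------------------------------------------------------------
# Editor's note: `main` returns the summed reward of the tank selected by
# `tuning_number`, so it can be driven by a simple grid search over the
# closed-loop time constant. The sweep below is an illustrative usage example,
# not part of the original script; the candidate tau_c values are arbitrary.
# ---------------------------------------------------------------------------
def tune_controller(tuning_number=0):
    best_tau_c, best_reward = None, -np.inf
    for tau_c in range(10, 101, 10):  # arbitrary candidate values
        reward = main(tau_c_tuning=tau_c, tuning_number=tuning_number, plot=False)
        if reward > best_reward:
            best_tau_c, best_reward = tau_c, reward
    return best_tau_c, best_reward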
def main():
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    z = []
    h = []
    d = []
    # ================= Running episodes =================#
    state, episode_reward = environment.reset()
    h_ = np.array([state[0][0][0]])
    h.append(h_)
    for t in range(MAIN_PARAMS["MAX_TIME"]):
        z_ = agent.act(state[-1])  # get action choice from state
        z.append(np.array(z_))
        terminated, next_state = environment.get_next_state(
            z[-1], state[-1], t
        )  # Calculate next state with action
        reward = sum_rewards(
            next_state, terminated, get_reward
        )  # get reward from transition to next state

        # Store data
        episode_reward.append(reward)
        state.append(next_state)
        h_ = []
        d_ = []
        for i in range(agent.n_tanks):
            d_.append(environment.tanks[i].dist.flow[t] + environment.q_inn[i])
            h_.append(np.array(next_state[i][0]))
        d.append(d_)
        h.append(h_)

        if environment.show_rendering:
            environment.render(z[-1])
        if True in terminated:
            break
        if not environment.running:
            break

    print(np.sum(episode_reward))
    _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
    d = np.array(d)
    h = np.array(h[:-1])
    z = np.array(z)
    h *= 10
    ax1.plot(h[:-1, 0], color="peru", label="Tank 1")
    ax1.set_ylabel("Level")
    ax1.legend(loc="upper right")
    ax1.set_ylim(2.5, 7.5)
    ax2.plot(z[1:, 0], color="peru", label="Tank 1")
    ax2.legend(loc="upper right")
    ax2.set_ylabel("Valve")
    ax2.set_ylim(0, 1.01)
    ax3.plot(d[:, 0], color="peru", label="Tank 1")
    ax3.set_ylim(0, 4)
    ax3.set_ylabel("Disturbance")
    ax3.legend(loc="upper right")
    # plt.legend([l1, l2, l3], ["Tank height", "Valve position", "Disturbance"])
    plt.tight_layout()
    plt.xlabel("Time")
    plt.show()
def main(): # ============= Initialize variables and objects ===========# max_mean_reward = MAIN_PARAMS["MAX_MEAN_REWARD"] environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS) agent = Agent(AGENT_PARAMS) mean_episode = MAIN_PARAMS["MEAN_EPISODE"] episodes = MAIN_PARAMS["EPISODES"] all_rewards = [] all_mean_rewards = [] t_mean = [] # ================= Running episodes =================# try: for e in range(episodes): states, episode_reward = environment.reset() # Reset level in tank for t in range(MAIN_PARAMS["MAX_TIME"]): z = agent.act(states[-1]) # get action choice from state terminated, next_state = environment.get_next_state( z, states[-1], t) states.append(next_state) rewards = sum_rewards(next_state, terminated, get_reward) rewards.append(np.sum(rewards)) episode_reward.append(rewards) agent.remember(states, rewards, terminated, t) if environment.show_rendering: environment.render(z) if True in terminated: break # Collect summary of episode episode_reward = np.array(episode_reward) episode_total_reward = [] t_mean.append(t) for i in range(environment.n_tanks + 1): episode_total_reward.append(sum(episode_reward[:, i])) all_rewards.append(episode_total_reward) # Print mean reward and save better models if e % mean_episode == 0 and e != 0: mean_reward = np.array(all_rewards[-mean_episode:]) mean_r = [] t_mean = int(np.mean(t_mean)) for i in range(environment.n_tanks + 1): mean_r.append(np.mean(mean_reward[:, i])) all_mean_rewards.append(mean_r) print( f"Mean {mean_episode} of {e}/{episodes} episodes ### timestep {t_mean+1} ### tot reward: {mean_r[-1]}" ) t_mean = [] if mean_r[-1] >= max_mean_reward: agent.save_trained_model() max_mean_reward = mean_r[-1] agent.PolicyGradientReplay(e) if not environment.running: break except KeyboardInterrupt: pass print("Memory length: {}".format(len(agent.memory))) print("##### {} EPISODES DONE #####".format(e + 1)) print("Max rewards for all episodes: {}".format(np.max(all_rewards))) all_mean_rewards = np.array(all_mean_rewards) plt.plot(all_mean_rewards[:, -1], label="Total rewards") plt.ylabel("Mean rewards of last {} episodes".format(mean_episode)) plt.legend() plt.show()