Example #1
import csv

import numpy as np

# Project-specific helpers (Regression, para, init_function, action_function,
# q_max_function, reward_function, find_receiver) are assumed to be imported
# from the surrounding package.


class Q_learning:
    def __init__(self, index, writer_w=None, information_log_w=None, init_func=init_function, nb_action=81, action_func=action_function, network=None):
        self.action_list = action_func(nb_action=nb_action)  # the list of candidate actions (charging positions; the last one is the depot)
        self.q_table = init_func(nb_action=nb_action)  # the Q-table, one row per state
        self.state = nb_action  # the current state of the actor (starts at the last action, i.e. the depot)
        self.charging_time = [0.0 for _ in self.action_list]  # the charging time for each action
        self.reward = np.asarray([0.0 for _ in self.action_list])  # the reward for each action
        self.reward_max = [0.0 for _ in self.action_list]  # the raw reward components (first, second) for each action
        self.reg = Regression(startAt=0)  # regression model used to re-fit the reward weights
        self.W = np.array([0.5, 0.5])  # weights combining the two reward components
        self.reg_number = 0  # number of data points saved in the log file
        self.index = index
        self.writer_w = writer_w  # CSV writer for logging the weights
        self.information_log_w = information_log_w  # log file handle, flushed after each weight update
        print(self.writer_w)


    def update(self, network, mc_current_location=None, alpha=0.4, gamma=0.5, q_max_func=q_max_function, reward_func=reward_function):
        if not network.mc.list_request:
            # nothing to schedule while the mobile charger has no pending requests
            return self.action_list[self.state], 0.0, 0.0, 0.0
        first, second = self.set_reward(reward_func=reward_func, network=network, location=mc_current_location)
        # Bellman-style update of the whole Q-table row for the current state
        self.q_table[self.state] = (1 - alpha) * self.q_table[self.state] + alpha * (
                self.reward + gamma * self.q_max(q_max_func))
        self.choose_next_state(network)
        if self.state == len(self.action_list) - 1:
            # last action is the depot: charge the MC itself back to full capacity
            charging_time = (network.mc.capacity - network.mc.energy) / network.mc.e_self_charge
        else:
            charging_time = self.charging_time[self.state]
        print("next state =", self.action_list[self.state], self.state, charging_time)
        print(self.charging_time)
        return self.action_list[self.state], charging_time, first[self.state], second[self.state]

    def q_max(self, q_max_func=q_max_function):
        return q_max_func(q_table=self.q_table, state=self.state)

    def set_reward(self, reward_func=reward_function, network=None, location=None):
        # the two reward components, one value per candidate action
        first = np.asarray([0.0 for _ in self.action_list], dtype=float)
        second = np.asarray([0.0 for _ in self.action_list], dtype=float)

        # third = np.asarray([0.0 for _ in self.action_list], dtype=float)
        for index, row in enumerate(self.q_table):
            # evaluate the reward components and the charging time for every candidate action
            temp = reward_func(network=network, q_learning=self, state=index, receive_func=find_receiver)
            first[index] = temp[0]
            second[index] = temp[1]
            # third[index] = temp[2]
            self.charging_time[index] = temp[2]
        # normalise each component so it sums to 1 (epsilon avoids division by zero)
        first = first / (np.sum(first) + 1e-8)
        second = second / (np.sum(second) + 1e-8)
        print("[INFO] First, Second", first, second)
        # third = third / np.sum(third)


        reg_data_path = para.log_dir + str(para.ID_run) + "/regression_data" + str(self.index) + ".csv"
        reg_target_path = para.log_dir + str(para.ID_run) + "/regression_target_data" + str(self.index) + ".csv"
        with open(reg_data_path, 'r') as csvfile:
            csv_dict = [row for row in csv.DictReader(csvfile)]
            print("[INFO] Length file", len(csv_dict))
            if len(csv_dict) != 0:
                self.reg.read_data(train_filename=reg_data_path, target_filename=reg_target_path)
                print("[INFO] Length truth: ", len(self.reg.delta))
                print("[INFO] Para X", para.X)
        print("[INFO] location", location)
        if para.X != 0:
            # re-fit the reward weights every para.X data points (skipped at the depot)
            if ((len(self.reg.delta) - 1) % para.X == 0) and len(self.reg.delta) != 1 and location != para.depot:
                print("[INFO] Update")
                print("[INFO] StartAT: ", self.reg.startAt)
                self.W = self.reg.update()
                print("[INFO] StartAT: ", self.reg.startAt)
                self.W = self.W / (np.sum(self.W) + 1e-8)
                print("Parameters: ", self.W)
                self.writer_w.writerow({"Weights": self.W})
                self.information_log_w.flush()
        self.reward = self.W[0] * first + self.W[1] * second  # weighted combination of the two components
        self.reward_max = list(zip(first, second))  # keep the raw component pairs per action
        return first, second

    def choose_next_state(self, network):
        # next_state = np.argmax(self.q_table[self.state])
        if network.mc.energy < 10:
            # low-energy MC: force a return to the depot (last state) to recharge
            self.state = len(self.q_table) - 1
        else:
            # otherwise pick the greedy action for the current state
            self.state = np.argmax(self.q_table[self.state])
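
A rough, self-contained sketch of the row-wise Bellman update that update() applies is shown below. The toy sizes, the uniform reward vector, and the stand-in for q_max_function are illustrative assumptions only; the real class builds its reward from the network state and the regression-fitted weights W.

# Toy illustration (assumed values) of the row-wise Q update in Q_learning.update().
import numpy as np

nb_action = 4                        # e.g. 3 charging positions + 1 depot action
q_table = np.zeros((nb_action, nb_action))
state = nb_action - 1                # start at the depot, as __init__ does
alpha, gamma = 0.4, 0.5

reward = np.array([0.1, 0.3, 0.4, 0.2])   # stands in for W[0] * first + W[1] * second
q_max = np.max(q_table, axis=1)           # stands in for q_max_function(q_table, state)

# update the entire row for the current state, as in update()
q_table[state] = (1 - alpha) * q_table[state] + alpha * (reward + gamma * q_max)

# greedy choice of the next state, mirroring choose_next_state() when the MC has enough energy
state = int(np.argmax(q_table[state]))
print("next state:", state)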