def update_Q_table(self, observation):
    # Indices of the previous and current (L, O) state pairs kept in self.state.
    index_1 = state2index_3(L=self.state[-3], O=self.state[-4])
    index_2 = state2index_3(L=self.state[-1], O=self.state[-2])
    # Binary reward: 1 for a successful slot ('B' or 'S'), 0 otherwise.
    if observation == 'B' or observation == 'S':
        reward = 1
    else:
        reward = 0
    # Average-reward (R-learning style) TD step: the running estimate rho is
    # subtracted from the TD target and nudged by the same step size.
    temp = self.learning_rate * (reward + np.max(self.Q_table[index_2])
                                 - self.Q_table[index_1][self.action] - self.rho)
    self.Q_table[index_1][self.action] += temp
    self.rho += temp
def update_Q_table(self, observation):
    index_1 = state2index_3(L=self.state[-3], O=self.state[-4])
    index_2 = state2index_3(L=self.state[-1], O=self.state[-2])
    # Shaped reward based on the channel observation and the agent's own action.
    if observation == 'B' or observation == 'S':
        reward = 10
    elif observation == 'F' and self.action == 1:    # collision or channel
        reward = -5
    elif observation == 'I' and self.queue[0] == 1:  # need to transmit
        reward = -3
    else:                                            # 'F' and action = 0, 'I' and no packet
        reward = 2
    # Same average-reward TD update as above, with the shaped reward.
    temp = self.learning_rate * (reward + np.max(self.Q_table[index_2])
                                 - self.Q_table[index_1][self.action] - self.rho)
    self.Q_table[index_1][self.action] += temp
    self.rho += temp
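# Both update_Q_table variants above apply the same R-learning (average-reward
# Q-learning) rule: the TD error subtracts the running average-reward estimate rho,
# and rho itself is moved by the same learning step. A minimal standalone sketch of
# that rule follows; q_table, rho and alpha here are illustrative names, not the
# class attributes used above.
import numpy as np

def r_learning_update(q_table, s, a, reward, s_next, rho, alpha=0.1):
    """One average-reward TD step on a 2-D Q-table; returns the updated rho."""
    td_error = reward + np.max(q_table[s_next]) - q_table[s, a] - rho
    q_table[s, a] += alpha * td_error
    return rho + alpha * td_error

# Toy example: 4 states, 2 actions, one successful transmission with reward 10.
q = np.zeros((4, 2))
rho = r_learning_update(q, s=0, a=1, reward=10, s_next=2, rho=0.0, alpha=0.01)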
def select_action(self, observation):
    # 0 is wait, 1 is transmit
    index = state2index_3(L=self.queue, O=observation)
    # Decay the exploration rate, bounded below by epsilon_min.
    self.epsilon *= self.epsilon_decay
    self.epsilon = max(self.epsilon_min, self.epsilon)
    # Epsilon-greedy: explore with a random action, otherwise exploit the Q-table.
    if self.epsilon > np.random.uniform():
        self.action = np.random.randint(2)
    else:
        self.action = np.argmax(self.Q_table[index])
    # Never transmit when the queue is empty.
    if sum(self.queue) == 0:
        self.action = 0
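# The selection rule above is a standard epsilon-greedy policy with multiplicative
# decay and a floor, plus a hard override that forbids transmitting on an empty
# queue. A self-contained sketch of the same idea, with illustrative names and
# numpy imported as np as in the code above:
def epsilon_greedy(q_row, epsilon, epsilon_decay=0.995, epsilon_min=0.01, queue_empty=False):
    """Return (action, new_epsilon) for a single two-action Q-row."""
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    if np.random.uniform() < epsilon:
        action = np.random.randint(2)   # explore: wait (0) or transmit (1)
    else:
        action = int(np.argmax(q_row))  # exploit the learned values
    if queue_empty:
        action = 0                      # never transmit without a packet
    return action, epsilon

# Example: action, eps = epsilon_greedy(np.array([0.2, 0.5]), epsilon=1.0)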