def update_Q_table(self, observation):
    """Apply one Q-learning update to ``self.Q_table`` in place.

    Two row indices are obtained from ``state2index_`` using the last four
    entries of ``self.state``. The entry for the first row and
    ``self.action`` is moved toward ``reward + gamma * max(Q[second row])``
    by a step of size ``self.learning_rate``.

    Args:
        observation: Feedback value; the reward is 1 when it equals ``'B'``
            or ``'S'`` and 0 otherwise.
    """
    # NOTE(review): presumably self.state ends with
    # [..., O_prev, L2_prev, O_curr, L2_curr] -- confirm against the caller.
    prev_row = state2index_(L2=self.state[-3], O=self.state[-4])
    next_row = state2index_(L2=self.state[-1], O=self.state[-2])

    reward = 1 if observation in ('B', 'S') else 0

    # Temporal-difference step: target is r + gamma * max_a' Q(s', a').
    td_target = reward + self.gamma * np.max(self.Q_table[next_row])
    td_error = td_target - self.Q_table[prev_row][self.action]
    self.Q_table[prev_row][self.action] += self.learning_rate * td_error
def select_action(self, observation):
    """Pick the next action epsilon-greedily and store it in ``self.action``.

    Action encoding: 0 is wait, 1 is transmit.

    On every call ``self.epsilon`` is multiplied by ``self.epsilon_decay``
    and kept at or above ``self.epsilon_min``. If the resulting epsilon
    exceeds a uniform random draw, a random 0/1 action is taken; otherwise
    the arg-max of the Q-table row for the current state is used. The
    choice is finally overridden with 0 when the entries of ``self.queue``
    sum to zero. Nothing is returned.

    Args:
        observation: Passed to ``state2index_`` together with ``self.queue``
            to look up the Q-table row.
    """
    row = state2index_(L2=self.queue, O=observation)

    # Decay the exploration rate, clamped from below at epsilon_min.
    self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    explore = self.epsilon > np.random.uniform()
    if explore:
        # A second uniform draw rounded to the nearest integer gives 0 or 1.
        self.action = round(np.random.uniform())
    else:
        self.action = np.argmax(self.Q_table[row])

    # Queue entries sum to zero: force action 0 (wait).
    if sum(self.queue) == 0:
        self.action = 0