def __init__(self, mdp_world, critical_threshold, precision=0.0001, debug=False):
    """Identify critical states of an MDP via the entropy of its optimal policy.

    A state is "critical" when the (stochastic) optimal policy at that state
    has entropy below ``critical_threshold`` — i.e. the policy is close to
    deterministic there. Results are stored in ``self.critical_state_actions``
    as ``(state, best_action)`` pairs.

    :param mdp_world: MDP model consumed by ``mdp.compute_q_values`` /
        ``mdp.find_optimal_policy``.
    :param critical_threshold: entropy cutoff below which a state is critical.
    :param precision: numerical tolerance retained for downstream comparisons.
    :param debug: when True, print progress information.
    """
    self.mdp_world = mdp_world
    self.entropy_threshold = critical_threshold
    self.precision = precision
    self.debug = debug

    self.q_values = mdp.compute_q_values(mdp_world)
    self.optimal_policy = mdp.find_optimal_policy(mdp_world, Q=self.q_values)

    # Scan every state and keep those where the optimal policy is
    # (near-)deterministic, i.e. its entropy falls under the threshold.
    if debug:
        print("finding critical states")
    self.critical_state_actions = []
    for s in self.mdp_world.states:
        if debug:
            print(s)
        # Entropy of the optimal policy at s, assuming it is uniform over
        # all optimal actions (stochastic-optimal). Remaining entries stay
        # zero and contribute nothing to the entropy.
        num_optimal_actions = len(self.optimal_policy[s])
        action_probs = np.zeros(len(self.mdp_world.actions(s)))
        action_probs[:num_optimal_actions] = 1.0 / num_optimal_actions
        entropy = utils.entropy(action_probs)
        if debug:
            print(s, entropy)
        best_action = utils.argmax(self.mdp_world.actions(s),
                                   lambda a: self.q_values[s, a])
        if entropy < self.entropy_threshold:
            self.critical_state_actions.append((s, best_action))
def show_qtable(self):
    """Print a 12x12 grid of greedy-action labels, then the current epsilon.

    Cells whose row of ``self.qtable`` is all zeros (untrained / terminal)
    are marked "N"; otherwise the cell shows the direction letter of the
    greedy action for that state.
    """
    table = np.chararray((12, 12))
    for state in range(self.n_states):
        row, col = divmod(state, 12)
        q_row = self.qtable[state]
        if max(q_row) != 0:
            table[row, col] = BST_DIRECTIONS[argmax(q_row)]
        else:
            table[row, col] = "N"
    print(table)
    print(self.epsilon)
def categorize(self):
    """Classify ``self.text_ru`` and return an instance of the best-matching
    question type.

    Each candidate type is scored by matching its patterns against the text;
    the type with the highest score wins.
    """
    scores = [
        self._score_pattern_matching(self.text_ru, qtype.get_pattern())
        for qtype in self.question_types
    ]
    best_qtype = self.question_types[utils.argmax(scores)]
    return best_qtype(self.text_ru)
def epsilonGreedy(self, q_values):
    """Epsilon-greedy action selection.

    With probability ``self.epsilon`` return a uniformly random action,
    otherwise return the greedy (argmax-Q) action.
    """
    # Evaluate the greedy choice first and draw `uniform` unconditionally,
    # mirroring the original call order so RNG consumption is unchanged.
    greedy_choice = argmax(q_values)
    explore = self.random_state.uniform(0, 1) < self.epsilon
    return self.random_state.randint(self.n_actions) if explore else greedy_choice
def predict(self, state):
    """Greedily select the action for ``state``.

    Returns ``(action, '')``; the empty string is kept only for interface
    compatibility with callers expecting a second element.
    """
    return argmax(self.qtable[state]), ''
def is_agent_value_aligned(self, policy, agent_q_values, reward_weights):
    """Check whether the agent's Q-values answer every test question correctly.

    Each question in ``self.test`` is either a 2-tuple
    ``((s, worse), (s, better))`` — "better" must be strictly preferred —
    or a 3-tuple ``((s, worse), (s, better), equivalent)`` where
    ``equivalent`` flags that the two actions should have (numerically)
    equal Q-values. In all cases ``(s, better)`` must be optimal to within
    ``self.precision``.

    :param policy: unused here; kept for interface compatibility.
    :param agent_q_values: mapping ``(state, action) -> Q`` for the agent.
    :param reward_weights: unused here; kept for interface compatibility.
    :return: True iff every question is answered correctly.
    """
    for question in self.test:
        if self.debug:
            print("Testing question:")
            utils.print_question(question, self.mdp_world)
        if len(question) == 2:
            (s, worse), (s, better) = question
            self._debug_print_qvalues(agent_q_values, s, worse, better)
            # (s, better) must be optimal to numerical precision.
            if not self._better_is_optimal(agent_q_values, s, better):
                return False
            # ...and strictly (significantly) better than (s, worse).
            if not agent_q_values[(s, better)] - self.precision > agent_q_values[(s, worse)]:
                if self.debug:
                    print("wrong answer", (s, better), "should be better")
                return False
        else:
            (s, worse), (s, better), equivalent = question
            # BUGFIX: this diagnostic print was unconditional; it is now
            # gated on self.debug like every other diagnostic in this method.
            self._debug_print_qvalues(agent_q_values, s, worse, better)
            # Either way (s, better) should be optimal, so check that first.
            if not self._better_is_optimal(agent_q_values, s, better):
                return False
            if equivalent:
                # Q-values must agree to within numerical precision.
                if not abs(agent_q_values[(s, better)] - agent_q_values[(s, worse)]) < self.precision:
                    if self.debug:
                        # BUGFIX: literal previously contained a stray newline.
                        print("wrong answer. Should be equal")
                    return False
            else:
                # (s, better) must be significantly better than (s, worse).
                if not agent_q_values[(s, better)] - self.precision > agent_q_values[(s, worse)]:
                    if self.debug:
                        print("wrong answer.", (s, better), "should be better")
                    return False
        if self.debug:
            print("correct answer")
    return True

def _debug_print_qvalues(self, agent_q_values, s, worse, better):
    # Show the agent's Q-values for the two alternatives (debug only).
    if self.debug:
        print("Qw({},{}) = {}, \nQb({},{}) = {}".format(
            s, worse, agent_q_values[(s, worse)],
            s, better, agent_q_values[(s, better)]))

def _better_is_optimal(self, agent_q_values, s, better):
    # True iff (s, better)'s Q-value matches the best Q-value at s to
    # within self.precision; prints a diagnostic and returns False otherwise.
    optimal_action = utils.argmax(self.mdp_world.actions(s),
                                  lambda a: agent_q_values[s, a])
    optimal_qvalue = agent_q_values[s, optimal_action]
    if abs(agent_q_values[s, better] - optimal_qvalue) > self.precision:
        if self.debug:
            print("wrong answer", (s, better), "should be optimal to numerical precision")
        return False
    return True