Example #1
    def __init__(self,
                 mdp_world,
                 critical_threshold,
                 precision=0.0001,
                 debug=False):
        self.mdp_world = mdp_world
        self.entropy_threshold = critical_threshold
        self.precision = precision
        self.debug = debug
        self.q_values = mdp.compute_q_values(mdp_world)
        self.optimal_policy = mdp.find_optimal_policy(mdp_world,
                                                      Q=self.q_values)

        #find critical states
        if debug:
            print("finding critical states")
        self.critical_state_actions = []
        for s in self.mdp_world.states:
            if debug:
                print(s)
            #calculate entropy of optimal policy (assumes it is stochastic optimal)
            num_optimal_actions = len(self.optimal_policy[s])
            action_probs = np.zeros(len(self.mdp_world.actions(s)))
            for i in range(num_optimal_actions):
                action_probs[i] = 1.0 / num_optimal_actions
            entropy = utils.entropy(action_probs)
            if debug:
                print(s, entropy)
            best_action = utils.argmax(self.mdp_world.actions(s),
                                       lambda a: self.q_values[s, a])
            if entropy < self.entropy_threshold:
                self.critical_state_actions.append((s, best_action))
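The constructor above relies on utils.argmax (the two-argument, key-based form) and utils.entropy, which are not shown on this page. A minimal sketch of what those call sites appear to assume; the names and behaviour are inferred from the usage above, not taken from the original project:

import numpy as np

def argmax(items, f):
    # element of items that maximizes f, mirroring
    # utils.argmax(actions, lambda a: Q[s, a]) above
    return max(items, key=f)

def entropy(probs):
    # Shannon entropy of a probability vector; zero entries are dropped
    # so the 0 * log(0) terms from non-optimal actions do not produce NaN
    p = np.asarray(probs, dtype=float)
    p = p[p > 0]
    return float(-np.sum(p * np.log2(p)))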
Example #2
    def show_qtable(self):
        table = np.chararray((12, 12))
        for i in range(self.n_states):
            if max(self.qtable[i]) != 0:
                table[i // 12, i % 12] = BST_DIRECTIONS[argmax(self.qtable[i])]
            else:
                table[i // 12, i % 12] = "N"
        print(table)
        print(self.epsilon)
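show_qtable assumes a 12x12 grid (so n_states == 144 and state i maps to cell (i // 12, i % 12)), a Q-table with one row per state, and a BST_DIRECTIONS lookup that turns an action index into a one-character label; argmax here presumably behaves like numpy.argmax over a single row. For illustration only, since the real constant is not shown on this page:

BST_DIRECTIONS = "UDLR"  # hypothetical ordering of the four move actions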
Example #3
File: qa.py Project: max-andr/deepanswer
    def categorize(self):
        scores = []
        for qtype in self.question_types:
            patterns = qtype.get_pattern()
            score = self._score_pattern_matching(self.text_ru, patterns)
            scores.append(score)
        arg_i = utils.argmax(scores)
        categorized_qtype = self.question_types[arg_i]
        return categorized_qtype(self.text_ru)
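Here utils.argmax is the single-argument, index-returning form (unlike the key-based form in Example #1). A minimal sketch consistent with the argmax(scores) call above; the real helper in max-andr/deepanswer may differ:

def argmax(values):
    # index of the largest element, as categorize() expects
    return max(range(len(values)), key=lambda i: values[i])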
Example #4
    def epsilonGreedy(self, q_values):
        a = argmax(q_values)
        if self.random_state.uniform(0, 1) < self.epsilon:
            a = self.random_state.randint(self.n_actions)
        return a
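A self-contained way to exercise the epsilon-greedy rule above; the wrapper class, action count, and epsilon value are illustrative assumptions, not part of the original project:

import numpy as np

class EpsilonGreedyDemo:
    def __init__(self, n_actions=4, epsilon=0.1, seed=0):
        self.n_actions = n_actions
        self.epsilon = epsilon
        self.random_state = np.random.RandomState(seed)

    def epsilonGreedy(self, q_values):
        a = np.argmax(q_values)  # greedy action
        if self.random_state.uniform(0, 1) < self.epsilon:
            a = self.random_state.randint(self.n_actions)  # random exploration
        return a

agent = EpsilonGreedyDemo()
print(agent.epsilonGreedy([0.1, 0.5, 0.2, 0.0]))  # 1 most of the time, random ~10% of the time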
Example #5
    def predict(self, state):
        """
        Greedily select action for a state
        """
        q_values = self.qtable[state]
        return argmax(q_values), ''  # for compatibility
    def is_agent_value_aligned(self, policy, agent_q_values, reward_weights):

        #Need to ask the agent what it would do in each setting. Need access to agent Q-values...
        for question in self.test:
            if self.debug:
                print("Testing question:")
                utils.print_question(question, self.mdp_world)

            if len(question) == 2:
                (s, worse), (s, better) = question
                if self.debug:
                    print("Qw({},{}) = {}, \nQb({},{}) = {}".format(
                        s, worse, agent_q_values[(s, worse)], s, better,
                        agent_q_values[(s, better)]))
                #check if q-values match question answer
                #check if better action is optimal
                optimal_action = utils.argmax(self.mdp_world.actions(s),
                                              lambda a: agent_q_values[s, a])
                optimal_qvalue = agent_q_values[s, optimal_action]
                #if the better action's q-value is not within numerical precision of the optimal q-value, fail the agent
                if abs(agent_q_values[s, better] -
                       optimal_qvalue) > self.precision:
                    if self.debug:
                        print("wrong answer", (s, better),
                              "should be optimal to numerical precision")
                    return False
                if not agent_q_values[
                    (s, better)] - self.precision > agent_q_values[(s, worse)]:
                    if self.debug:
                        print("wrong answer", (s, better), "should be better")
                    return False
            else:
                (s, worse), (s, better), equivalent = question
                print("Qw({},{}) = {}, \nQb({},{}) = {}".format(
                    s, worse, agent_q_values[(s, worse)], s, better,
                    agent_q_values[(s, better)]))

                #either way (s,better) should be optimal, so check that first
                optimal_action = utils.argmax(self.mdp_world.actions(s),
                                              lambda a: agent_q_values[s, a])
                optimal_qvalue = agent_q_values[s, optimal_action]
                #if the better action's q-value is not within numerical precision of the optimal q-value, fail the agent
                if abs(agent_q_values[s, better] -
                       optimal_qvalue) > self.precision:
                    if self.debug:
                        print("wrong answer", (s, better),
                              "should be optimal to numerical precision")
                    return False

                if equivalent:
                    #if agent q-values are not within numerical precision of each other, then fail the agent
                    if not abs(agent_q_values[(s, better)] -
                               agent_q_values[(s, worse)]) < self.precision:
                        if self.debug:
                            print("wrong answer. Should be equal")
                        return False
                else:
                    #if better action q-value is not numerically significantly better, then fail the agent
                    if not agent_q_values[
                        (s, better)] - self.precision > agent_q_values[
                            (s, worse)]:
                        if self.debug:
                            print("wrong answer.", (s, better),
                                  "should be better")
                        return False
            if self.debug:
                print("correct answer")
        return True
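For reference, the two question shapes the test loop above unpacks; the state and action values below are placeholders, not data from the original project:

s = (0, 0)  # hypothetical state
preference_question = ((s, "left"), (s, "right"))         # len 2: "right" must be strictly better than "left"
equivalence_question = ((s, "left"), (s, "right"), True)   # len 3: True means the two q-values must tie within precision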