def __init__(self, mdp_world, Q, opt_policy, precision, debug=False, use_suboptimal_rankings=False, epsilon_gap=0.0, teacher=None, tests=None, halfspaces=None):
    """Build a value-alignment verification test from ranking halfspaces.

    Args:
        mdp_world: the MDP the tests are generated for.
        Q: Q-values for `mdp_world` (passed through to the teacher).
        opt_policy: optimal policy for `mdp_world` (passed through to the teacher).
        precision: epsilon used by the teacher for value comparisons.
        debug: enable verbose output in the teacher.
        use_suboptimal_rankings: forwarded to the teacher's test generation.
        epsilon_gap: slack forwarded to `get_optimal_value_alignment_tests`.
        teacher: optional precomputed teacher; one is constructed if falsy.
        tests, halfspaces: optional precomputed results; both must be given
            (non-empty) to skip regeneration.
    """
    self.mdp_world = mdp_world
    self.precision = precision
    self.debug = debug
    self.epsilon_gap = epsilon_gap
    if not teacher:
        teacher = machine_teaching.StateActionRankingTeacher(
            mdp_world, Q, opt_policy, debug=self.debug, epsilon=precision)
    # TODO: we don't need the tests, just the halfspaces, but we do need to
    # know which are equality constraints.
    if tests and halfspaces:
        tests, self.halfspaces = tests, halfspaces
    else:
        # Bug fix: `use_suboptimal_rankings` was previously accepted but
        # ignored (hardcoded to False). Forwarding it is backward-compatible
        # since its default is False.
        tests, self.halfspaces = teacher.get_optimal_value_alignment_tests(
            use_suboptimal_rankings=use_suboptimal_rankings,
            compare_optimal=False,
            epsilon_gap=self.epsilon_gap)
    # For now just select the first question for each halfspace.
    self.test = [questions[0] for questions in tests]
def __init__(self, mdp_world, Q, opt_policy, precision, debug=False, remove_redundancy_lp=True, teacher=None, tests=None, halfspaces=None):
    """Build a value-alignment verification test from ranking halfspaces.

    Args:
        mdp_world: the MDP the tests are generated for.
        Q: precomputed Q-values for `mdp_world`.
        opt_policy: precomputed optimal policy for `mdp_world`.
        precision: epsilon used by the teacher for value comparisons.
        debug: enable verbose output in the teacher.
        remove_redundancy_lp: whether the teacher should prune redundant
            halfspaces via an LP (forwarded to the teacher).
        teacher: optional precomputed teacher; one is constructed if falsy.
        tests, halfspaces: optional precomputed results; both must be given
            (non-empty) to skip regeneration.
    """
    self.mdp_world = mdp_world
    self.precision = precision
    self.debug = debug
    self.q_values = Q  # Q-values are supplied, not recomputed here.
    self.optimal_policy = opt_policy
    if not teacher:
        # Bug fix: `remove_redundancy_lp` was previously accepted but never
        # forwarded to the teacher (the intended call survived only in
        # commented-out code). Its default matches the parameter default.
        teacher = machine_teaching.StateActionRankingTeacher(
            mdp_world, Q, opt_policy, debug=self.debug,
            remove_redundancy_lp=remove_redundancy_lp, epsilon=precision)
    # TODO: we don't need the tests, just the halfspaces, but we do need to
    # know which are equality constraints.
    if tests and halfspaces:
        self.tests, self.halfspaces = tests, halfspaces
    else:
        self.tests, self.halfspaces = teacher.get_optimal_value_alignment_tests(
            use_suboptimal_rankings=False, compare_optimal=False)
    # For now just select the first question for each halfspace.
    self.test = [questions[0] for questions in self.tests]
def __init__(self, mdp_world, precision, debug=False, remove_redundancy_lp = True):
    """Solve the MDP and build a value-alignment verification test.

    Computes Q-values and the optimal policy for `mdp_world`, then asks a
    ranking teacher for the optimal alignment tests and keeps the first
    question per halfspace as `self.test`.
    """
    self.mdp_world = mdp_world
    self.precision = precision
    self.debug = debug
    # Solve the MDP up front; both the Q-values and the policy are kept.
    self.q_values = mdp.compute_q_values(mdp_world, eps=precision)
    self.optimal_policy = mdp.find_optimal_policy(
        mdp_world, Q=self.q_values, epsilon=precision)
    ranking_teacher = machine_teaching.StateActionRankingTeacher(
        mdp_world,
        debug=self.debug,
        remove_redundancy_lp=remove_redundancy_lp,
        epsilon=precision)
    alignment_tests, _ = ranking_teacher.get_optimal_value_alignment_tests(
        use_suboptimal_rankings=False)
    # For now just select the first question for each halfspace.
    self.test = [question_set[0] for question_set in alignment_tests]
def __init__(self, mdp_world, precision, debug=False):
    """Build a value-alignment verification test for `mdp_world`.

    A ranking teacher generates the optimal alignment tests; the first
    question of each halfspace's question set becomes `self.test`.
    """
    self.mdp_world = mdp_world
    self.precision = precision
    self.debug = debug
    ranking_teacher = machine_teaching.StateActionRankingTeacher(
        mdp_world, debug=self.debug, epsilon=precision)
    alignment_tests, _ = ranking_teacher.get_optimal_value_alignment_tests(
        use_suboptimal_rankings=False)
    # For now just select the first question for each halfspace.
    self.test = [question_set[0] for question_set in alignment_tests]
eval_weights.append(eval_weight_vector) num_eval_policies += 1 print("There are {} distinct optimal policies".format( len(eval_policies))) if len(eval_policies) == 0: print( "The only possible policy is the optimal policy. There must be a problem with the features. Can't do verification if only on policy possible!" ) sys.exit() print() print("Generating verification tests") #TODO: save computation by solving for halfspaces once for ARP-w and ARP-bb teacher = machine_teaching.StateActionRankingTeacher( true_world, Qopt, opt_policy, debug=debug, epsilon=precision) #TODO: we don't need the tests, just the halfspaces, but we do need to know which are equality tests, halfspaces = teacher.get_optimal_value_alignment_tests( use_suboptimal_rankings=False, compare_optimal=False) for vindx, verifier_name in enumerate(verifier_list): tester = None size_verification_test = None if "state-value-critical-" in verifier_name: critical_value_thresh = float( verifier_name[len("state-value-critical-"):]) #print("critical value", critical_value_thresh) tester = ah.CriticalStateActionValueVerifier( true_world,
eval_policy = mdp.find_optimal_policy(rand_world, Q=Qval, epsilon=precision) #only save if not equal to optimal policy if eval_policy not in eval_policies: if debug: print("found distinct eval policy") print("weights", eval_weight_vector) rand_world.print_map(rand_world.to_arrows(eval_policy)) eval_policies.append(eval_policy) eval_Qvalues.append(Qval) eval_weights.append(eval_weight_vector) teacher = machine_teaching.StateActionRankingTeacher(rand_world, debug=False, epsilon=precision) tests, halfspaces = teacher.get_optimal_value_alignment_tests( use_suboptimal_rankings=False) eval_halfspaces.append(halfspaces) print(halfspaces) #add all the normal vectors to a big list for getting edges later for h in halfspaces: all_halfspaces.append(h) num_eval_policies += 1 print("There are {} distinct optimal policies when sampling randomly".format( len(eval_policies)))