def __init__(self,
             mdp,
             name="value_iter",
             delta=0.0001,
             max_iterations=500,
             sample_rate=3):
        '''
        Set up the planner's hyper-parameters and empty bookkeeping tables.

        Args:
            mdp (MDP)
            name (str): Planner name, forwarded to Planner.__init__.
            delta (float): After an iteration of VI, if no change more than @delta has occurred, terminates.
            max_iterations (int): Hard limit for number of iterations.
            sample_rate (int): Determines how many samples from @mdp to take to estimate T(s' | s, a).
        '''
        Planner.__init__(self, mdp, name=name)

        # Hyper-parameters are stored unchanged for later use.
        self.delta = delta
        self.max_iterations = max_iterations
        self.sample_rate = sample_rate

        # Value table; any key not yet written reads as 0.0.
        self.value_func = defaultdict(float)

        # Progress flags and counter, all starting in the "nothing done yet"
        # state. The code that flips/increments them is outside this method.
        self.reachability_done = False
        self.has_computed_matrix = False
        self.bellman_backups = 0

        # Three-level nested table whose leaves default to 0.0.
        # NOTE(review): presumably indexed as trans_dict[s][a][s'] to hold the
        # estimated T(s' | s, a) -- confirm against the code that fills it.
        self.trans_dict = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float)))
    def __init__(self,
                 mdp,
                 name="mcts",
                 explore_param=math.sqrt(2),
                 rollout_depth=20,
                 num_rollouts_per_step=10):
        '''
        Store the search settings and create empty two-level statistics tables.

        Args:
            mdp (MDP)
            name (str): Planner name, forwarded to Planner.__init__.
            explore_param (float): Stored as self.explore_param (defaults to sqrt(2)).
            rollout_depth (int): Stored as self.rollout_depth.
            num_rollouts_per_step (int): Stored as self.num_rollouts_per_step.
        '''
        Planner.__init__(self, mdp, name=name)

        # Search settings, kept exactly as given.
        self.explore_param = explore_param
        self.num_rollouts_per_step = num_rollouts_per_step
        self.rollout_depth = rollout_depth

        # Two-level tables: a missing outer key yields a fresh inner table,
        # a missing inner key yields 0 (counts) or 0.0 (totals).
        # NOTE(review): presumably keyed as table[state][action] -- confirm.
        self.visitation_counts = defaultdict(lambda: defaultdict(int))
        self.value_total = defaultdict(lambda: defaultdict(float))
 def __init__(self, mdp, name="dyna", max_iterations=500,
              epsilon=0.5, alpha=0.1, default_q=0, n=10):
     '''
     Store the planner's hyper-parameters and create its empty tables.

     The last four arguments used to be constants hard-coded in this
     method; their defaults reproduce the old values exactly.

     Args:
         mdp (MDP)
         name (str): Planner name, forwarded to Planner.__init__.
         max_iterations (int): Stored as self.max_iterations.
         epsilon (float): Stored as self.epsilon. Epsilon-decay is not implemented here.
         alpha (float): Step-size, stored as self.alpha.
         default_q (float): Value returned by self.q_func for any Q(s,a) not yet written.
         n (int): Stored as self.N. Presumably the number of planning
             updates per step -- TODO confirm against the code reading self.N.
     '''
     Planner.__init__(self, mdp, name=name)
     self.max_iterations = max_iterations
     self.value_func = defaultdict(float)
     # Missing keys read as the empty string.
     self.max_q_act_histories = defaultdict(str)
     self.reachability_done = False
     self.has_computed_matrix = False
     self.bellman_backups = 0
     self.epsilon = epsilon  # epsilon-decay should be implemented
     # Nested tables: trans_* have three key levels, reward_* have four.
     # The *_dict tables default to integer 0, the *_prob tables to 0.0.
     # NOTE(review): presumably *_dict hold observation counts and *_prob the
     # probabilities derived from them -- confirm against the update code.
     self.trans_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0)))
     self.reward_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0))))
     self.trans_prob = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
     self.reward_prob = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))))
     self.default_q = default_q
     self.alpha = alpha  # step-size
     # Initialize all Q(s,a) lazily. The inner factory reads self.default_q
     # at lookup time, so reassigning that attribute later changes the value
     # handed out for pairs not yet stored.
     self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))
     self.N = n
     self.previous_record = {}  # keep track of previously visited s-a pairs