def get_trivial_policy(
    mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A]
) -> mdp.FinitePolicy[S, A]:
    """Generate a policy which selects an action uniformly at random in each state.

    :param mdp_obj: Markov decision process for which to get the uniform policy.
    :returns: Policy which assigns a uniform distribution over the available
        actions to each state.
    """
    state_action_dict: Dict[S, Optional[dist.Categorical[A]]] = {}
    for state in mdp_obj.states():
        actions = list(mdp_obj.actions(state))
        if len(actions) > 0:
            num_actions = len(actions)
            uniform_prob = 1 / num_actions
            uniform_actions = dist.Categorical(
                {action: uniform_prob for action in actions}
            )
            state_action_dict[state] = uniform_actions
        else:
            # Terminal states have no available actions
            state_action_dict[state] = None
    return mdp.FinitePolicy(state_action_dict)
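# Illustrative usage (a minimal sketch, not part of the assignment code): the
# uniform policy can be paired with apply_finite_policy to turn the MDP into an
# MRP whose traces are driven by random action selection. VampireMDP and
# apply_finite_policy are assumed to behave as in the main() functions below.
def _demo_trivial_policy() -> None:
    vampire_mdp = vampire.VampireMDP(10)
    uniform_policy = get_trivial_policy(vampire_mdp)
    # Evaluating the uniform policy yields a Markov reward process whose value
    # function can serve as a baseline for the control algorithms.
    uniform_mrp = vampire_mdp.apply_finite_policy(uniform_policy)
    print(uniform_mrp)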
def main():
    """Run the prediction algorithms on the vampire problem."""
    from pprint import pprint

    # Specify a starting state distribution for the number of villagers
    num_villagers: int = 10
    start_state_dist: dist.Categorical[S] = dist.Categorical({
        vampire.State(i, True): 1 / num_villagers
        for i in range(1, num_villagers + 1)
    })

    # Represent the problem as an MDP
    vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] = \
        vampire.VampireMDP(num_villagers)

    # Use dynamic programming to obtain the optimal value function and policy
    true_val, pi = dp.policy_iteration_result(vampire_mdp, 1)
    print("True optimal value function: ")
    pprint(true_val)

    # Apply tabular MC prediction to approximate the optimal value function
    vampire_mrp: mp.FiniteMarkovRewardProcess[S] = \
        vampire_mdp.apply_finite_policy(pi)
    num_traces = 1000000
    traces = get_traces(vampire_mrp, start_state_dist, num_traces)
    pred_val_mc = tabular_mc_prediction(traces, 1)
    print("Predicted value function by MC prediction: ")
    pprint(pred_val_mc)

    # Apply tabular TD prediction to approximate the optimal value function
    atomic_experiences = [step for trace in traces for step in trace]
    pred_val_td = tabular_td_prediction(atomic_experiences, 0.0001, 1)
    print("Predicted value function by TD prediction: ")
    pprint(pred_val_td)
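# For reference, a minimal sketch of the tabular TD(0) update that
# tabular_td_prediction above is assumed to implement: each atomic experience
# (s, r, s') moves V(s) toward the bootstrapped target r + gamma * V(s') by a
# step of size learning_rate. The (state, reward, next_state) tuple layout is
# an assumption for illustration only.
from typing import Dict, Hashable, List, Tuple


def _td0_update_sketch(
    experiences: List[Tuple[Hashable, float, Hashable]],
    learning_rate: float,
    gamma: float
) -> Dict[Hashable, float]:
    v: Dict[Hashable, float] = {}
    for state, reward, next_state in experiences:
        v.setdefault(state, 0.0)
        v.setdefault(next_state, 0.0)
        td_target = reward + gamma * v[next_state]
        # Move the current estimate toward the TD target
        v[state] += learning_rate * (td_target - v[state])
    return v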
def get_random_policy(
    mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A]
) -> mdp.FinitePolicy[S, A]:
    """Generate a random policy for an MDP by uniform sampling of the action space.

    This function is used to initialize the policy during MC Control.

    :param mdp_obj: MDP object for which the random policy is being generated.
    :returns: Random deterministic policy for the MDP.
    """
    state_action_dict: Dict[S, Optional[dist.Constant[A]]] = {}
    for state in mdp_obj.states():
        actions = list(mdp_obj.actions(state))
        if len(actions) > 0:
            num_actions = len(actions)
            uniform_prob = 1 / num_actions
            uniform_actions = dist.Categorical(
                {action: uniform_prob for action in actions}
            )
            # Fix one uniformly sampled action per state (deterministic policy)
            random_action = uniform_actions.sample()
            state_action_dict[state] = dist.Constant(random_action)
        else:
            # Terminal states have no available actions
            state_action_dict[state] = None
    return mdp.FinitePolicy(state_action_dict)
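# A minimal sketch (hypothetical helper, not in the source) of the greedy
# policy-improvement step that tabular MC control applies on top of the random
# initial policy returned above: for each state, pick the action with the
# highest estimated action value and wrap it in a dist.Constant, mirroring the
# structure used by get_random_policy. The nested-dict layout of q is an
# assumption for illustration only.
def _greedy_policy_from_q_sketch(
    mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A],
    q: Dict[S, Dict[A, float]]
) -> mdp.FinitePolicy[S, A]:
    state_action_dict: Dict[S, Optional[dist.Constant[A]]] = {}
    for state in mdp_obj.states():
        actions = list(mdp_obj.actions(state))
        if len(actions) > 0:
            # Greedy action with respect to the current action-value estimates
            best_action = max(
                actions, key=lambda a: q.get(state, {}).get(a, 0.0)
            )
            state_action_dict[state] = dist.Constant(best_action)
        else:
            state_action_dict[state] = None
    return mdp.FinitePolicy(state_action_dict)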
def main():
    """Run the control algorithms.

    Test the control algorithms using the `Vampire Problem` MDP.
    """
    from pprint import pprint

    # Specify a starting state distribution for the number of villagers
    num_villagers: int = 10
    start_state_dist: dist.Categorical[S] = dist.Categorical({
        vampire.State(i, True): 1 / num_villagers
        for i in range(1, num_villagers + 1)
    })

    # Represent the problem as an MDP
    vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] = \
        vampire.VampireMDP(num_villagers)

    # Use dynamic programming to obtain the optimal value function and policy
    true_val, pi = dp.policy_iteration_result(vampire_mdp, 1)
    print("True optimal policy: ")
    print(pi)
    print()
    print("True optimal value function: ")
    pprint(true_val)

    # Apply tabular MC control to obtain the optimal policy and value function
    pred_action_val, pred_pi = tabular_mc_control(
        vampire_mdp, 1, start_state_dist, 10000
    )
    print("Predicted optimal policy: ")
    for i in range(1, num_villagers + 1):
        print("Num Villagers: " + str(i) + "; Vampire Alive: True")
        print(pred_pi.act(vampire.State(i, True)))
    print()
    print("Predicted optimal action-value function: ")
    print_if_optimal(pred_action_val, pred_pi)

    # Apply tabular SARSA to obtain the optimal policy and value function
    pred2_action_val, pred2_pi = tabular_sarsa(
        vampire_mdp, 1, start_state_dist, 10000
    )
    print("Predicted optimal policy: ")
    for i in range(1, num_villagers + 1):
        print("Num Villagers: " + str(i) + "; Vampire Alive: True")
        print(pred2_pi.act(vampire.State(i, True)))
    print()
    print("Predicted optimal action-value function: ")
    print_if_optimal(pred2_action_val, pred2_pi)

    # Apply tabular Q-learning to obtain the optimal policy and value function
    pred3_action_val, pred3_pi = tabular_qlearning(
        vampire_mdp, 1, start_state_dist, 100000
    )
    print("Predicted optimal policy: ")
    for i in range(1, num_villagers + 1):
        print("Num Villagers: " + str(i) + "; Vampire Alive: True")
        print(pred3_pi.act(vampire.State(i, True)))
    print()
    print("Predicted optimal action-value function: ")
    print_if_optimal(pred3_action_val, pred3_pi)
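# For reference, a minimal sketch of the off-policy tabular Q-learning update
# that tabular_qlearning above is assumed to perform for each transition
# (s, a, r, s'): Q(s, a) is moved toward r + gamma * max_a' Q(s', a'). The
# transition tuple layout and constant learning rate are assumptions for
# illustration only.
from typing import Dict, Hashable, List, Tuple


def _q_learning_update_sketch(
    transitions: List[
        Tuple[Hashable, Hashable, float, Hashable, List[Hashable]]
    ],
    learning_rate: float,
    gamma: float
) -> Dict[Tuple[Hashable, Hashable], float]:
    q: Dict[Tuple[Hashable, Hashable], float] = {}
    for state, action, reward, next_state, next_actions in transitions:
        q.setdefault((state, action), 0.0)
        # Bootstrap from the greedy action in the next state (0 if terminal)
        next_q = max(
            (q.get((next_state, a), 0.0) for a in next_actions),
            default=0.0
        )
        td_target = reward + gamma * next_q
        q[(state, action)] += learning_rate * (td_target - q[(state, action)])
    return q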
def main():
    """Run the prediction algorithms on the vampire problem."""
    from pprint import pprint

    # Specify a starting state distribution for the number of villagers
    num_villagers: int = 10
    start_state_dist: dist.Categorical[S] = dist.Categorical({
        vampire.State(i, True): 1 / num_villagers
        for i in range(1, num_villagers + 1)
    })

    # Represent the problem as an MDP
    vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] = \
        vampire.VampireMDP(num_villagers)

    # Use dynamic programming to obtain the optimal value function and policy
    true_val, pi = dp.policy_iteration_result(vampire_mdp, 1)
    print("True optimal value function: ")
    pprint(true_val)

    # Express the vampire problem as an MRP and sample traces
    vampire_mrp: mp.FiniteMarkovRewardProcess[S] = \
        vampire_mdp.apply_finite_policy(pi)
    num_traces = 100000
    traces = get_traces(vampire_mrp, start_state_dist, num_traces)

    # Apply tabular TD-lambda to approximate the optimal value function
    pred_val_td_lambda, _ = tabular_TD_lambda(
        traces=traces,
        learning_rate=get_learning_rate,
        lambda_param=0.5,
        gamma=1
    )
    print("Predicted value function by TD-lambda prediction: ")
    print_non_terminal_vampire_states(pred_val_td_lambda)

    # Apply tabular n-step bootstrap to predict the optimal value function
    pred_val_n_step, _ = tabular_n_step_bootstrap(
        traces=traces,
        learning_rate=get_learning_rate,
        n_step=3,
        gamma=1
    )
    print("Predicted value function by tabular n-step prediction: ")
    print_non_terminal_vampire_states(pred_val_n_step)

    # Plot convergence of VF prediction by TD-lambda at various lambdas
    run_tabular_td_lambda(
        traces=traces,
        learning_rate=get_learning_rate,
        lambda_param=[0, 0.25, 0.5, 0.75, 0.99],
        gamma=1
    )
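# For reference, a minimal sketch of the backward-view TD(lambda) update that
# tabular_TD_lambda above is assumed to implement: an eligibility trace per
# state decays by gamma * lambda each step and scales how much of each TD error
# a previously visited state absorbs. The (state, reward, next_state) step
# layout and the constant step size are assumptions for illustration only.
from typing import Dict, Hashable, List, Tuple


def _td_lambda_sketch(
    traces: List[List[Tuple[Hashable, float, Hashable]]],
    step_size: float,
    lambda_param: float,
    gamma: float
) -> Dict[Hashable, float]:
    v: Dict[Hashable, float] = {}
    for trace in traces:
        eligibility: Dict[Hashable, float] = {}
        for state, reward, next_state in trace:
            v.setdefault(state, 0.0)
            v.setdefault(next_state, 0.0)
            td_error = reward + gamma * v[next_state] - v[state]
            # Bump the trace for the visited state, then update and decay all
            eligibility[state] = eligibility.get(state, 0.0) + 1.0
            for s in eligibility:
                v[s] += step_size * td_error * eligibility[s]
                eligibility[s] *= gamma * lambda_param
    return v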
def main():
    """Test the LSTD algorithm on the Vampire problem MDP."""
    from pprint import pprint

    # Specify a starting state distribution for the number of villagers
    num_villagers: int = 10
    start_state_dist: dist.Categorical[S] = dist.Categorical({
        vampire.State(i, True): 1 / num_villagers
        for i in range(1, num_villagers + 1)
    })

    # Represent the problem as an MDP
    vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] = \
        vampire.VampireMDP(num_villagers)

    # Use dynamic programming to obtain the optimal value function and policy
    true_val, pi = dp.policy_iteration_result(vampire_mdp, 1)
    print("True optimal value function: ")
    pprint(true_val)

    # Express the vampire problem as an MRP and sample atomic experiences
    vampire_mrp: mp.FiniteMarkovRewardProcess[S] = \
        vampire_mdp.apply_finite_policy(pi)
    num_traces = 10000
    traces = get_traces(vampire_mrp, start_state_dist, num_traces)
    experiences = [step for trace in traces for step in trace]

    # Generate feature vector, weights, and approx VF for non-terminal states
    vf = {}
    weights = LSTD(feature_functions, experiences, 1)
    for i in range(1, num_villagers + 1):
        vampire_state = vampire.State(n=i, v=True)
        vf[vampire_state] = np.matmul(
            get_feature_vec(feature_functions, vampire_state), weights
        )[0]
    print("Predicted optimal value function: ")
    pprint(vf)

    # Generate a random set of atomic experiences from random policies
    random_experiences = get_traces_over_random_actions(
        vampire_mdp, start_state_dist, 10000
    )
    lstdq_weights = LSTDQ(
        action_feature_funcs, random_experiences, 1
    )
    print(lstdq_weights)
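# For reference, a minimal sketch of the batch LSTD solve that the LSTD call
# above is assumed to perform: accumulate A = sum phi(s) (phi(s) - gamma phi(s'))^T
# and b = sum phi(s) * r over the atomic experiences, then solve A w = b. The
# feature-extraction callable and (state, reward, next_state) layout are
# assumptions for illustration only.
from typing import Callable, Hashable, List, Tuple

import numpy as np


def _lstd_sketch(
    phi: Callable[[Hashable], np.ndarray],
    experiences: List[Tuple[Hashable, float, Hashable]],
    gamma: float
) -> np.ndarray:
    num_features = phi(experiences[0][0]).shape[0]
    a_mat = np.zeros((num_features, num_features))
    b_vec = np.zeros(num_features)
    for state, reward, next_state in experiences:
        phi_s = phi(state)
        phi_next = phi(next_state)
        a_mat += np.outer(phi_s, phi_s - gamma * phi_next)
        b_vec += phi_s * reward
    # Pseudo-inverse guards against a singular A for small sample sizes
    return np.linalg.pinv(a_mat) @ b_vec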
def softmax(self, state: S) -> Optional[dist.Distribution[A]]:
    """Generate an action distribution for a state using the softmax function.

    The action space, feature functions, and linear function-approximation
    weights are taken from the corresponding attributes on ``self``.

    :param state: State from which to generate the action distribution.
    :returns: Distribution of action probabilities for the state, or None if
        the state has no available actions.
    """
    actions = self.action_func(state)
    if actions is None:
        return None
    tot_prob: float = 0
    act_prob_dict: Dict[A, float] = {}
    for a in actions:
        # Unnormalized softmax weight: exp(w . phi(s, a))
        prob: float = np.exp(
            np.dot(get_feature_vec(self.feature_funcs, state, a),
                   self.weights))[0]
        act_prob_dict[a] = prob
        tot_prob += prob
    return dist.Categorical(
        {a: act_prob_dict[a] / tot_prob for a in actions})
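# Illustrative usage (a sketch; the surrounding class and its constructor
# arguments are assumed from the attributes referenced above): the returned
# Categorical can be sampled directly to drive an episode, which is how a
# softmax policy is typically used inside a policy-gradient loop.
#
#     action_dist = policy.softmax(current_state)
#     if action_dist is not None:
#         action = action_dist.sample()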