Example #1
def get_trivial_policy(
    mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A]
) -> mdp.FinitePolicy[S, A]:
    """Generate a policy which randomly selects actions for each state.

    :param mdp_obj: Markov decision process for which to get uniform policy.
    :returns: Policy which assigns a uniform distribution to each action for
        each state.
    """
    state_action_dict: Dict[S, Optional[dist.Categorical[A]]] = {}

    for state in mdp_obj.states():
        actions = list(mdp_obj.actions(state))

        if len(actions) > 0:
            num_actions = len(actions)
            uniform_prob = 1 / num_actions
            uniform_actions = dist.Categorical(
                {action: uniform_prob for action in actions}
            )
            state_action_dict[state] = uniform_actions
        else:
            state_action_dict[state] = None

    return mdp.FinitePolicy(state_action_dict)
Example #2
def main():
    """Run the prediction algorithms on the vampire problem.
    """
    from pprint import pprint

    # Specify a starting state distribution for the number of villagers
    num_villagers: int = 10
    start_state_dist: dist.Categorical[S] = dist.Categorical({
        vampire.State(i, True): 1 / num_villagers
        for i in range(1, num_villagers + 1)
    })

    # Represent the problem as an MDP
    vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] =\
        vampire.VampireMDP(num_villagers)

    # Use dynamic programming to obtain the optimal value function and policy
    true_val, pi = dp.policy_iteration_result(vampire_mdp, 1)
    print("True optimal value function: ")
    pprint(true_val)

    # Apply Tabular MC prediction to approximate optimal value function
    vampire_mrp: mp.FiniteMarkovRewardProcess[S] =\
        vampire_mdp.apply_finite_policy(pi)
    num_traces = 1000000
    traces = get_traces(vampire_mrp, start_state_dist, num_traces)
    pred_val_mc = tabular_mc_prediction(traces, 1)
    print("Predicted value function by MC prediction: ")
    pprint(pred_val_mc)

    # Apply Tabular TD prediction to approximate optimal value function
    atomic_experiences = [step for trace in traces for step in trace]
    pred_val_td = tabular_td_prediction(atomic_experiences, 0.0001, 1)
    print("Predicted value function by TD prediction: ")
    pprint(pred_val_td)
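
The helpers tabular_mc_prediction and tabular_td_prediction are called above but not shown. Below is a minimal sketch of the underlying tabular updates, assuming each trace step can be read as a (state, reward, next_state) triple; the names and signatures are illustrative, not the library's actual interface.

from collections import defaultdict
from typing import Dict, Hashable, List, Tuple

Transition = Tuple[Hashable, float, Hashable]  # (state, reward, next_state)


def mc_prediction_sketch(
    traces: List[List[Transition]], gamma: float
) -> Dict[Hashable, float]:
    """Every-visit Monte Carlo: average the full returns observed from each state."""
    returns_sum: Dict[Hashable, float] = defaultdict(float)
    returns_count: Dict[Hashable, int] = defaultdict(int)
    for trace in traces:
        g = 0.0
        # Walk the trace backwards so the return can be accumulated incrementally
        for state, reward, _ in reversed(trace):
            g = reward + gamma * g
            returns_sum[state] += g
            returns_count[state] += 1
    return {s: returns_sum[s] / returns_count[s] for s in returns_sum}


def td_prediction_sketch(
    experiences: List[Transition], alpha: float, gamma: float
) -> Dict[Hashable, float]:
    """TD(0): move V(s) toward the one-step bootstrapped target r + gamma * V(s')."""
    v: Dict[Hashable, float] = defaultdict(float)
    for state, reward, next_state in experiences:
        target = reward + gamma * v[next_state]
        v[state] += alpha * (target - v[state])
    return dict(v)

MC averages complete returns per state, while TD(0) updates toward a one-step bootstrapped target, which is why the second helper consumes atomic experiences rather than whole traces.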
def get_random_policy(
    mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A]
) -> mdp.FinitePolicy[S, A]:
    """Generate a random policy for an MDP by uniform sampling of action space.

    This function is used to initialize the policy during MC Control.

    :param mdp_obj: MDP object for which random policy is being generated
    :returns: Random deterministic policy for MDP
    """
    state_action_dict: Dict[S, Optional[dist.Constant[A]]] = {}

    for state in mdp_obj.states():
        actions = list(mdp_obj.actions(state))

        if len(actions) > 0:
            num_actions = len(actions)
            uniform_prob = 1 / num_actions
            uniform_actions = dist.Categorical(
                {action: uniform_prob for action in actions}
            )
            random_action = uniform_actions.sample()
            state_action_dict[state] = dist.Constant(random_action)
        else:
            state_action_dict[state] = None

    return mdp.FinitePolicy(state_action_dict)
def main():
    """Run the control algorithms.

    Test the control algorithms using the `Vampire Problem` MDP.
    """

    # Specify a starting state distribution for the number of villagers
    num_villagers: int = 10
    start_state_dist: dist.Categorical[S] = dist.Categorical({
        vampire.State(i, True): 1 / num_villagers
        for i in range(1, num_villagers + 1)
    })

    # Represent the problem as an MDP
    vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] =\
        vampire.VampireMDP(num_villagers)

    # Use dynamic programming to obtain the optimal value function and policy
    true_val, pi = dp.policy_iteration_result(vampire_mdp, 1)
    print("True optimal policy: ")
    print(pi)
    print()
    print("True optimal value function: ")
    pprint(true_val)

    # Apply tabular MC control to obtain the optimal policy and value function
    pred_action_val, pred_pi = tabular_mc_control(vampire_mdp, 1,
                                                  start_state_dist, 10000)
    print("Predicted optimal policy: ")
    for i in range(1, num_villagers + 1):
        print(f"Num Villagers: {i}; Vampire Alive: True")
        print(pred_pi.act(vampire.State(i, True)))
    print()
    print("Predicted optimal action-value function: ")
    print_if_optimal(pred_action_val, pred_pi)

    # Apply tabular SARSA to obtain the optimal policy and value function
    pred2_action_val, pred2_pi = tabular_sarsa(vampire_mdp, 1,
                                               start_state_dist, 10000)
    print("Predicted optimal policy: ")
    for i in range(1, num_villagers + 1):
        print(f"Num Villagers: {i}; Vampire Alive: True")
        print(pred2_pi.act(vampire.State(i, True)))
    print()
    print("Predicted optimal action-value function: ")
    print_if_optimal(pred2_action_val, pred2_pi)

    # Apply tabular Q-learning to obtain the optimal policy and value function
    pred3_action_val, pred3_pi = tabular_qlearning(vampire_mdp, 1,
                                                   start_state_dist, 100000)
    print("Predicted optimal policy: ")
    for i in range(1, num_villagers + 1):
        print(f"Num Villagers: {i}; Vampire Alive: True")
        print(pred3_pi.act(vampire.State(i, True)))
    print()
    print("Predicted optimal action-value function: ")
    print_if_optimal(pred3_action_val, pred3_pi)
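
tabular_mc_control, tabular_sarsa, and tabular_qlearning are defined elsewhere. The sketch below shows the core tabular Q-learning loop against a generic episode simulator; the actions and step callables and this signature are assumptions for illustration, not the helpers' actual interface.

import random
from collections import defaultdict
from typing import Callable, Dict, Hashable, List, Tuple

State = Hashable
Action = Hashable


def q_learning_sketch(
    start_states: List[State],
    actions: Callable[[State], List[Action]],
    step: Callable[[State, Action], Tuple[State, float, bool]],
    gamma: float,
    num_episodes: int,
    alpha: float = 0.1,
    epsilon: float = 0.1,
) -> Dict[Tuple[State, Action], float]:
    """Off-policy Q-learning: act epsilon-greedily, bootstrap from max_a Q(s', a)."""
    q: Dict[Tuple[State, Action], float] = defaultdict(float)
    for _ in range(num_episodes):
        state = random.choice(start_states)
        done = False
        while not done:
            available = actions(state)
            if random.random() < epsilon:
                action = random.choice(available)                     # explore
            else:
                action = max(available, key=lambda a: q[(state, a)])  # exploit
            next_state, reward, done = step(state, action)
            next_best = 0.0 if done else max(
                q[(next_state, a)] for a in actions(next_state)
            )
            td_error = reward + gamma * next_best - q[(state, action)]
            q[(state, action)] += alpha * td_error
            state = next_state
    return dict(q)

SARSA differs only in bootstrapping from the action actually chosen (epsilon-greedily) in the next state rather than from the greedy maximum.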
def main():
    """Run the prediction algorithms on the vampire problem.
    """
    from pprint import pprint

    # Specify a starting state distribution for the number of villagers
    num_villagers: int = 10
    start_state_dist: dist.Categorical[S] = dist.Categorical({
        vampire.State(i, True): 1 / num_villagers
        for i in range(1, num_villagers + 1)
    })

    # Represent the problem as an MDP
    vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] =\
        vampire.VampireMDP(num_villagers)

    # Use dynamic programming to obtain the optimal value function and policy
    true_val, pi = dp.policy_iteration_result(vampire_mdp, 1)
    print("True optimal value function: ")
    pprint(true_val)

    # Express the vampire problem as an MRP and sample traces
    vampire_mrp: mp.FiniteMarkovRewardProcess[S] =\
        vampire_mdp.apply_finite_policy(pi)
    num_traces = 100000
    traces = get_traces(vampire_mrp, start_state_dist, num_traces)

    # Apply tabular TD-lambda to approximate optimal value function
    pred_val_td_lambda, _ = tabular_TD_lambda(
        traces=traces,
        learning_rate=get_learning_rate,
        lambda_param=0.5,
        gamma=1
    )
    print("Predicted value function by TD-lambda prediction: ")
    print_non_terminal_vampire_states(pred_val_td_lambda)

    # Apply tabular n-step bootstrap to predict the optimal value function
    pred_val_n_step, _ = tabular_n_step_bootstrap(
        traces=traces,
        learning_rate=get_learning_rate,
        n_step=3,
        gamma=1
    )
    print("Predicted value function by tabular n-step prediction: ")
    print_non_terminal_vampire_states(pred_val_n_step)

    # Plot convergence of the VF prediction by TD-lambda at various lambdas
    run_tabular_td_lambda(
        traces=traces,
        learning_rate=get_learning_rate,
        lambda_param=[0, 0.25, 0.5, 0.75, 0.99],
        gamma=1
    )
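
tabular_TD_lambda and tabular_n_step_bootstrap are not reproduced here. A minimal sketch of the backward-view TD(lambda) update with accumulating eligibility traces follows; the (state, reward, next_state) trace layout and the learning_rate schedule indexed by update count are both assumptions.

from collections import defaultdict
from typing import Callable, Dict, Hashable, List, Tuple

Transition = Tuple[Hashable, float, Hashable]  # (state, reward, next_state)


def td_lambda_sketch(
    traces: List[List[Transition]],
    learning_rate: Callable[[int], float],
    lambda_param: float,
    gamma: float,
) -> Dict[Hashable, float]:
    """Backward-view TD(lambda): spread each TD error over recently visited states."""
    v: Dict[Hashable, float] = defaultdict(float)
    num_updates = 0
    for trace in traces:
        eligibility: Dict[Hashable, float] = defaultdict(float)
        for state, reward, next_state in trace:
            num_updates += 1
            alpha = learning_rate(num_updates)
            td_error = reward + gamma * v[next_state] - v[state]
            eligibility[state] += 1.0                    # accumulating trace
            for s in list(eligibility):
                v[s] += alpha * td_error * eligibility[s]
                eligibility[s] *= gamma * lambda_param   # decay after the update
    return dict(v)

Setting lambda_param to 0 recovers TD(0), and values approaching 1 approach the Monte Carlo update, which is what the convergence plot at the end compares.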
Example #6
def main():
    """Test the LSTD algorithm on the Vampire problem MDP.
    """
    from pprint import pprint

    # Specify a starting state distribution for the number of villagers
    num_villagers: int = 10
    start_state_dist: dist.Categorical[S] = dist.Categorical({
        vampire.State(i, True): 1 / num_villagers
        for i in range(1, num_villagers + 1)
    })

    # Represent the problem as an MDP
    vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] =\
        vampire.VampireMDP(num_villagers)

    # Use dynamic programming to obtain the optimal value function and policy
    true_val, pi = dp.policy_iteration_result(vampire_mdp, 1)
    print("True optimal value function: ")
    pprint(true_val)

    # Express the vampire problem as an MRP and sample experiences
    vampire_mrp: mp.FiniteMarkovRewardProcess[S] =\
        vampire_mdp.apply_finite_policy(pi)
    num_traces = 10000
    traces = get_traces(vampire_mrp, start_state_dist, num_traces)
    experiences = [step for trace in traces for step in trace]

    # Generate feature vector, weights, and approx VF for non-terminal states
    vf: Dict[S, float] = {}
    weights = LSTD(feature_functions, experiences, 1)

    for i in range(1, num_villagers+1):
        vampire_state = vampire.State(n=i, v=True)
        vf[vampire_state] = np.matmul(
            get_feature_vec(feature_functions, vampire_state), weights
        )[0]
    print("Predicted optimal value function: ")
    pprint(vf)

    # Generate a random set of atomic experiences from random policies
    random_experiences = get_traces_over_random_actions(
        vampire_mdp,
        start_state_dist,
        10000
    )
    lstdq_weights = LSTDQ(
        action_feature_funcs, random_experiences, 1
    )
    print(lstdq_weights)
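
The LSTD call solves for the linear weights in closed form rather than by incremental updates. Below is a sketch of the standard computation, simplified to 1-D feature vectors (the real get_feature_vec appears to return a 2-D array, hence the [0] indexing above); the name and signature are illustrative assumptions.

import numpy as np
from typing import Callable, Hashable, List, Sequence, Tuple

Transition = Tuple[Hashable, float, Hashable]  # (state, reward, next_state)


def lstd_sketch(
    feature_functions: Sequence[Callable[[Hashable], float]],
    experiences: List[Transition],
    gamma: float,
    ridge: float = 1e-5,
) -> np.ndarray:
    """Least-squares TD: A = sum phi(s)(phi(s) - gamma*phi(s'))^T, b = sum r*phi(s)."""
    m = len(feature_functions)
    a_mat = ridge * np.eye(m)       # small ridge term keeps A invertible
    b_vec = np.zeros(m)
    for state, reward, next_state in experiences:
        phi = np.array([f(state) for f in feature_functions])
        phi_next = np.array([f(next_state) for f in feature_functions])
        a_mat += np.outer(phi, phi - gamma * phi_next)
        b_vec += reward * phi
    return np.linalg.solve(a_mat, b_vec)   # weights w with V(s) ~= phi(s) . w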
Example #7
    def softmax(self, state: S) -> Optional[dist.Distribution[A]]:
        """Generate an action distribution for a state using softmax algorithm.

        :param state: State from which to generate policy
        :param action_func: Function for generating action space from state
        :param feature_funcs: Functions for generating feature vector from
            current state and proposed action
        :param weights: Weights of linear function approximation for features
        :returns: Distribution of action probabilities from state
        """
        actions = self.action_func(state)
        if actions is None:
            return None
        tot_prob: float = 0.0
        act_prob_dict: Dict[A, float] = {}
        for a in actions:
            # Unnormalized preference: exponential of the linear score for (state, action)
            prob: float = np.exp(
                np.dot(get_feature_vec(self.feature_funcs, state, a),
                       self.weights))[0]
            act_prob_dict[a] = prob
            tot_prob += prob
        return dist.Categorical(
            {a: act_prob_dict[a] / tot_prob
             for a in actions})
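
One practical caveat with the softmax above: np.exp of large preferences can overflow. A common, numerically equivalent variant subtracts the maximum preference before exponentiating; the standalone sketch below illustrates the idea and is not the class's actual method.

import numpy as np
from typing import Dict, Hashable


def stable_softmax(preferences: Dict[Hashable, float]) -> Dict[Hashable, float]:
    """Softmax over action preferences; subtracting the max leaves the result unchanged."""
    max_pref = max(preferences.values())
    exp_prefs = {a: float(np.exp(p - max_pref)) for a, p in preferences.items()}
    total = sum(exp_prefs.values())
    return {a: e / total for a, e in exp_prefs.items()}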