Example #1
def glie_mc_finite_learning_rate_correctness(
        fmdp: FiniteMarkovDecisionProcess[S, A], initial_learning_rate: float,
        half_life: float, exponent: float, gamma: float,
        epsilon_as_func_of_episodes: Callable[[int], float],
        episode_length_tolerance: float, num_episodes: int) -> None:
    qvfs: Iterator[QValueFunctionApprox[S, A]] = \
        glie_mc_finite_control_learning_rate(
            fmdp=fmdp,
            initial_learning_rate=initial_learning_rate,
            half_life=half_life,
            exponent=exponent,
            gamma=gamma,
            epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
            episode_length_tolerance=episode_length_tolerance
        )
    final_qvf: QValueFunctionApprox[S, A] = \
        iterate.last(itertools.islice(qvfs, num_episodes))
    opt_vf, opt_policy = get_vf_and_policy_from_qvf(mdp=fmdp, qvf=final_qvf)

    print(f"GLIE MC Optimal Value Function with {num_episodes:d} episodes")
    pprint(opt_vf)
    print(f"GLIE MC Optimal Policy with {num_episodes:d} episodes")
    print(opt_policy)

    true_opt_vf, true_opt_policy = value_iteration_result(fmdp, gamma=gamma)

    print("True Optimal Value Function")
    pprint(true_opt_vf)
    print("True Optimal Policy")
    print(true_opt_policy)
Example #2
def get_vi_vf_and_policy(self) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
    '''
    Performs the Value Iteration DP algorithm, returning the
    Optimal Value Function (as a V[Cell]) and the Optimal Policy
    (as a FinitePolicy[Cell, Move])
    '''
    return value_iteration_result(self.get_finite_mdp(), gamma=1.)
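For reference, the call pattern shared by these examples is sketched below: pass a FiniteMarkovDecisionProcess and a discount factor to value_iteration_result, then unpack the optimal value function and the optimal policy. This is an illustrative sketch, not part of any example above; the SimpleInventoryMDPCap MDP is borrowed from Examples 7 and 10, the parameter values are assumptions, and the import paths assume the layout of the rl library these snippets come from.

# Illustrative sketch only (assumed parameter values and import paths).
from pprint import pprint

from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap
from rl.dynamic_programming import value_iteration_result

si_mdp = SimpleInventoryMDPCap(
    capacity=2,
    poisson_lambda=1.0,
    holding_cost=1.0,
    stockout_cost=10.0
)
opt_vf, opt_policy = value_iteration_result(si_mdp, gamma=0.9)
pprint(opt_vf)     # optimal value function: maps each non-terminal state to a value
print(opt_policy)  # optimal deterministic policy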
Example #3
def q_learning_finite_learning_rate_correctness(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon: float,
    max_episode_length: int,
    num_updates: int,
) -> None:
    qvfs: Iterator[QValueFunctionApprox[S, A]] = \
        q_learning_finite_learning_rate(
            fmdp=fmdp,
            initial_learning_rate=initial_learning_rate,
            half_life=half_life,
            exponent=exponent,
            gamma=gamma,
            epsilon=epsilon,
            max_episode_length=max_episode_length
        )
    final_qvf: QValueFunctionApprox[S, A] = \
        iterate.last(itertools.islice(qvfs, num_updates))
    opt_vf, opt_policy = get_vf_and_policy_from_qvf(mdp=fmdp, qvf=final_qvf)

    print(f"Q-Learning ptimal Value Function with {num_updates:d} updates")
    pprint(opt_vf)
    print(f"Q-Learning Optimal Policy with {num_updates:d} updates")
    print(opt_policy)

    true_opt_vf, true_opt_policy = value_iteration_result(fmdp, gamma=gamma)

    print("True Optimal Value Function")
    pprint(true_opt_vf)
    print("True Optimal Policy")
    print(true_opt_policy)
Example #4
    def test_value_iteration(self):
        mdp_map: Mapping[NonTerminal[InventoryState],
                         float] = value_iteration_result(
                             self.si_mdp, self.gamma)[0]
        # print(mdp_map)
        mdp_vf1: np.ndarray = np.array([mdp_map[s] for s in self.states])

        fa = Dynamic({s: 0.0 for s in self.states})
        mdp_finite_fa = iterate.converged(value_iteration_finite(
            self.si_mdp, self.gamma, fa),
                                          done=lambda a, b: a.within(b, 1e-5))
        # print(mdp_finite_fa.values_map)
        mdp_vf2: np.ndarray = mdp_finite_fa.evaluate(self.states)

        self.assertLess(max(abs(mdp_vf1 - mdp_vf2)), 0.01)

        mdp_fa = iterate.converged(value_iteration(self.si_mdp,
                                                   self.gamma,
                                                   fa,
                                                   Choose(self.states),
                                                   num_state_samples=30),
                                   done=lambda a, b: a.within(b, 1e-5))
        # print(mdp_fa.values_map)
        mdp_vf3: np.ndarray = mdp_fa.evaluate(self.states)
        self.assertLess(max(abs(mdp_vf1 - mdp_vf3)), 0.01)
Example #5
def process_time(n, gamma=1) -> Tuple[float, float, float]:
    print(f"n={n}")
    model = LilypadModel(n)
    start = time.time()
    list_policies = get_policies(n)
    optimal_policy, list_sum, list_values, idx_max = \
        get_optimal_policy(n, model, list_policies, gamma=gamma)
    time_brute = time.time() - start
    start_2 = time.time()
    value_iter = value_iteration_result(model, gamma)
    time_value_iter = time.time() - start_2
    start_3 = time.time()
    policy_iter = policy_iteration_result(model, gamma)
    time_policy_iter = time.time() - start_3
    return time_brute, time_value_iter, time_policy_iter
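A possible driver loop for the timing helper above, shown only as a sketch; it assumes process_time, LilypadModel and the other helpers are defined exactly as in this example, and the range of problem sizes is an arbitrary choice.

# Sketch of a driver loop (assumes process_time above; problem sizes are arbitrary).
for n in range(3, 10):
    t_brute, t_vi, t_pi = process_time(n, gamma=1)
    print(f"n={n}: brute-force={t_brute:.4f}s, "
          f"value iteration={t_vi:.4f}s, policy iteration={t_pi:.4f}s")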
Example #6
    def test_value_iteration(self):
        mdp_map: Mapping[InventoryState, float] = value_iteration_result(
            self.si_mdp, self.gamma)[0]
        # print(mdp_map)
        mdp_vf1: np.ndarray = np.array([mdp_map[s] for s in self.states])

        fa = Dynamic({s: 0.0 for s in self.states})
        mdp_finite_fa = FunctionApprox.converged(
            value_iteration_finite(self.si_mdp, self.gamma, fa))
        # print(mdp_finite_fa.values_map)
        mdp_vf2: np.ndarray = mdp_finite_fa.evaluate(self.states)

        self.assertLess(max(abs(mdp_vf1 - mdp_vf2)), 0.001)

        mdp_fa = FunctionApprox.converged(
            value_iteration(self.si_mdp,
                            self.gamma,
                            fa,
                            Choose(self.states),
                            num_state_samples=30), 0.1)
        # print(mdp_fa.values_map)
        mdp_vf3: np.ndarray = mdp_fa.evaluate(self.states)
        self.assertLess(max(abs(mdp_vf1 - mdp_vf3)), 1.0)
Example #7
        for state in si_mdp.non_terminal_states
    })

    mc_tabular_control = mc_control(si_mdp, start_states,
                                    Tabular(start_map, start_map), user_gamma,
                                    800)
    values_map = mc_tabular_control.values_map
    opt_vf, opt_pi = get_optimal_policy(values_map)
    print('opt_vf mc control: \n', opt_vf, '\nopt_pi mc control: \n', opt_pi)

    fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
        InventoryState(alpha, beta): Constant(user_capacity - (alpha + beta))
        for alpha in range(user_capacity + 1)
        for beta in range(user_capacity + 1 - alpha)
    })
    implied_mrp: FiniteMarkovRewardProcess[InventoryState] = \
        si_mdp.apply_finite_policy(fdp)

    print("MDP Value Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
    print(opt_vf_vi, '\n')
    print(opt_policy_vi)

    print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_pi, opt_policy_pi = policy_iteration_result(si_mdp,
                                                       gamma=user_gamma)
    print(opt_vf_pi, '\n')
    print(opt_policy_pi)
Example #8
        t2 = time.time()
        time_brute_force = t2 - t1
        y_brute.append(time_brute_force)

        # Policy Iteration
        t1 = time.time()
        opt_vf_pi, opt_policy_pi = policy_iteration_result(frog_mdp, gamma=1)
        t2 = time.time()
        time_policy_iter = t2 - t1
        y_pi.append(time_policy_iter)
        #pprint(opt_vf_pi)
        #print(opt_policy_pi)

        # Value Iteration
        t1 = time.time()
        opt_vf_vi, opt_policy_vi = value_iteration_result(frog_mdp, gamma=1)
        t2 = time.time()
        time_value_iter = t2 - t1
        y_vi.append(time_value_iter)
        # pprint(opt_vf_vi)
        # print(opt_policy_vi)

    plt.plot(x, y_brute, c='r', label='Brute Force')
    plt.plot(x, y_pi, c='b', label='Policy Iteration')
    plt.plot(x, y_vi, c='g', label='Value Iteration')
    plt.xlabel('Number of Lilypads')
    plt.ylabel('Time till Convergence')
    plt.legend()
    plt.show()

Example #9
def compare_mc_sarsa_ql(fmdp: FiniteMarkovDecisionProcess[S, A],
                        method_mask: Tuple[bool, bool, bool],
                        learning_rates: Sequence[Tuple[float, float,
                                                       float]], gamma: float,
                        epsilon_as_func_of_episodes: Callable[[int], float],
                        q_learning_epsilon: float,
                        mc_episode_length_tol: float, num_episodes: int,
                        plot_batch: int, plot_start: int) -> None:
    true_vf: V[S] = value_iteration_result(fmdp, gamma)[0]
    states: Sequence[NonTerminal[S]] = fmdp.non_terminal_states
    colors: Sequence[str] = ['b', 'g', 'r', 'k', 'c', 'm', 'y']

    import matplotlib.pyplot as plt
    plt.figure(figsize=(11, 7))

    if method_mask[0]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            mc_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                glie_mc_finite_control_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    episode_length_tolerance=mc_episode_length_tol
                )
            mc_errors = []
            batch_mc_errs = []
            for i, mc_qvf in enumerate(
                    itertools.islice(mc_funcs_it, num_episodes)):
                mc_vf: V[S] = {
                    s: max(mc_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_mc_errs.append(
                    sqrt(
                        sum((mc_vf[s] - true_vf[s])**2
                            for s in states) / len(states)))
                if i % plot_batch == plot_batch - 1:
                    mc_errors.append(sum(batch_mc_errs) / plot_batch)
                    batch_mc_errs = []
            mc_plot = mc_errors[plot_start:]
            label = f"MC InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(range(len(mc_plot)),
                     mc_plot,
                     color=colors[k],
                     linestyle='-',
                     label=label)

    sample_episodes: int = 1000
    uniform_policy: FinitePolicy[S, A] = \
        FinitePolicy(
            {s.state: Choose(fmdp.actions(s)) for s in states}
    )
    fmrp: FiniteMarkovRewardProcess[S] = \
        fmdp.apply_finite_policy(uniform_policy)
    td_episode_length: int = int(
        round(
            sum(
                len(
                    list(
                        returns(trace=fmrp.simulate_reward(Choose(states)),
                                γ=gamma,
                                tolerance=mc_episode_length_tol)))
                for _ in range(sample_episodes)) / sample_episodes))

    if method_mask[1]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            sarsa_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                glie_sarsa_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    max_episode_length=td_episode_length,
                )
            sarsa_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_sarsa_errs = []

            for i, sarsa_qvf in enumerate(
                    itertools.islice(sarsa_funcs_it,
                                     num_episodes * td_episode_length)):
                sarsa_vf: V[S] = {
                    s: max(sarsa_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_sarsa_errs.append(
                    sqrt(
                        sum((sarsa_vf[s] - true_vf[s])**2
                            for s in states) / len(states)))
                if i % transitions_batch == transitions_batch - 1:
                    sarsa_errors.append(
                        sum(batch_sarsa_errs) / transitions_batch)
                    batch_sarsa_errs = []
            sarsa_plot = sarsa_errors[plot_start:]
            label = f"SARSA InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(range(len(sarsa_plot)),
                     sarsa_plot,
                     color=colors[k],
                     linestyle='--',
                     label=label)

    if method_mask[2]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            ql_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                q_learning_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon=q_learning_epsilon,
                    max_episode_length=td_episode_length,
                )
            ql_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_ql_errs = []

            for i, ql_qvf in enumerate(
                    itertools.islice(ql_funcs_it,
                                     num_episodes * td_episode_length)):
                ql_vf: V[S] = {
                    s: max(ql_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_ql_errs.append(
                    sqrt(
                        sum((ql_vf[s] - true_vf[s])**2
                            for s in states) / len(states)))
                if i % transitions_batch == transitions_batch - 1:
                    ql_errors.append(sum(batch_ql_errs) / transitions_batch)
                    batch_ql_errs = []
            ql_plot = ql_errors[plot_start:]
            label = f"Q-Learning InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(range(len(ql_plot)),
                     ql_plot,
                     color=colors[k],
                     linestyle=':',
                     label=label)

    plt.xlabel("Episode Batches", fontsize=20)
    plt.ylabel("Optimal Value Function RMSE", fontsize=20)
    plt.title("RMSE as function of episode batches", fontsize=20)
    plt.grid(True)
    plt.legend(fontsize=10)
    plt.show()
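A possible invocation of compare_mc_sarsa_ql, shown as a sketch only; the MDP is the SimpleInventoryMDPCap used in Examples 7 and 10 (assumed import path), and every parameter value below is an assumption chosen for illustration.

# Illustrative invocation sketch (all parameter values below are assumptions).
from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap  # assumed import path

si_mdp = SimpleInventoryMDPCap(
    capacity=2,
    poisson_lambda=1.0,
    holding_cost=1.0,
    stockout_cost=10.0
)
compare_mc_sarsa_ql(
    fmdp=si_mdp,
    method_mask=(True, True, True),        # run MC, SARSA and Q-Learning
    learning_rates=[(0.03, 1000.0, 0.5)],  # (initial_learning_rate, half_life, exponent)
    gamma=0.9,
    epsilon_as_func_of_episodes=lambda k: k ** -0.5,
    q_learning_epsilon=0.2,
    mc_episode_length_tol=1e-5,
    num_episodes=500,
    plot_batch=10,
    plot_start=0
)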
Example #10
    user_stockout_cost2 = 15.0

    store1: FiniteMarkovDecisionProcess[InventoryState, int] =\
        SimpleInventoryMDPCap(
            capacity=user_capacity,
            poisson_lambda=user_poisson_lambda,
            holding_cost=user_holding_cost,
            stockout_cost=user_stockout_cost
        ) 
    store2: FiniteMarkovDecisionProcess[InventoryState, int] =\
        SimpleInventoryMDPCap(
            capacity=user_capacity2,
            poisson_lambda=user_poisson_lambda2,
            holding_cost=user_holding_cost2,
            stockout_cost=user_stockout_cost2
        )
    K1 = 1
    K2 = 1
    problem4 = ComplexMDP(store1=store1,
                          store2=store2,
                          K1=K1,
                          K2=K2)
    value_opt = value_iteration_result(problem4, user_gamma)
    policy_opt = policy_iteration_result(problem4, user_gamma)
Example #11
            b = next(v, None)
            if max(abs(a[s] - b[s]) for s in a) < TOLERANCE:
                break
            a = b
            count += 1

        opt_policy: FinitePolicy[Coordinate, Action] = greedy_policy_from_vf(
            model, b, gamma)
        return count, b, opt_policy

    start = time.time()
    count1, opt_vf1, opt_pol1 = solution(model1, 0.8)
    print(f"Method 1 took {time.time()-start} to converge")
    start = time.time()
    count2, opt_vf2, opt_pol2 = solution(model2, 1)
    print(f"Method 2 took {time.time()-start} to converge")
    print(f"Solution 1 took {count1} iterations to converge")
    print(f"Solution 2 took {count2} iterations to converge")
    print(opt_pol1)
    print(opt_pol2)

    #This is a fast solution where we don't track
    #the number of iterations to converge
    #We're using a built-in function of rl.dynamic_programming here
    start = time.time()
    opt_vf1, opt_pol1 = value_iteration_result(model1, 0.8)
    print(f"Method 1 took {time.time()-start} to converge")
    start = time.time()
    opt_vf2, opt_pol2 = value_iteration_result(model2, 1)
    print(f"Method 2 took {time.time()-start} to converge")
Example #12
q_iter = \
    q_learning_experience_replay(
        mdp=si_mdp,
        policy_from_q=lambda f, m: epsilon_greedy_policy(
            q=f,
            mdp=m,
            ϵ=epsilon
        ),
        states=Choose(si_mdp.non_terminal_states),
        approx_0=Tabular(
            count_to_weight_func=learning_rate_schedule(
                initial_learning_rate=initial_learning_rate,
                half_life=learning_rate_half_life,
                exponent=learning_rate_exponent
            )
        ),
        γ=gamma,
        max_episode_length=episode_length,
        mini_batch_size=mini_batch_size,
        weights_decay_half_life=time_decay_half_life
    )

qvf: QValueFunctionApprox[InventoryState, int] = iterate.last(
    itertools.islice(q_iter, num_updates))
vf, pol = get_vf_and_policy_from_qvf(mdp=si_mdp, qvf=qvf)
pprint(vf)
print(pol)

true_vf, true_pol = value_iteration_result(mdp=si_mdp, gamma=gamma)
pprint(true_vf)
print(true_pol)
Example #13
def vi_vf_and_policy(self) -> \
        Tuple[V[int], FiniteDeterministicPolicy[int, int]]:
    return value_iteration_result(self, 1.0)
Example #14
                        self.W - state.wage - 1, pois_mean)
                    submapping[action] = Categorical(dic_distrib)
            mapping[state] = submapping
        return mapping


if __name__ == '__main__':
    H = 10
    W = 30
    alpha = 0.08
    beta = 0.82
    gamma = 0.95
    print("Defining the model")
    model = Problem3(H, W, alpha, beta)
    print("Value iteration algorithm")
    opt_val, opt_pol = value_iteration_result(model, gamma)
    print(opt_pol)
"""
if state.wage == self.W:
    for action in list_actions:
        # If you're in state W, you stay in state W with constant
        # probability. The reward only depends on the action
        # you have chosen
        submapping[action] = Constant((State(state.wage),
                                       state.wage*\
                                       (self.H-action.l-action.s)))
elif state.wage == self.W-1:
    for action in list_actions:
        s:int = action.s
        l:int = action.l
        #If you're in state W-1, you can either stay in your state
Example #15
if __name__ == '__main__':

    import matplotlib.pyplot as plt
    from pprint import pprint
    hours: int = 10
    wage_cap: int = 30
    alpha: float = 0.08
    beta: float = 0.82
    gamma: float = 0.95

    co: CareerOptimization = CareerOptimization(hours=hours,
                                                wage_cap=wage_cap,
                                                alpha=alpha,
                                                beta=beta)

    _, opt_policy = value_iteration_result(co, gamma=gamma)
    wages: Iterable[int] = range(1, co.wage_cap + 1)
    opt_actions: Mapping[int, Tuple[int, int]] = \
        {w: opt_policy.act(w).value for w in wages}
    searching: Sequence[int] = [s for _, (s, _) in opt_actions.items()]
    learning: Sequence[int] = [l for _, (_, l) in opt_actions.items()]
    working: Sequence[int] = [
        co.hours - s - l for _, (s, l) in opt_actions.items()
    ]
    pprint(opt_actions)
    plt.xticks(wages)
    p1 = plt.bar(wages, searching, color='red')
    p2 = plt.bar(wages, learning, color='blue')
    p3 = plt.bar(wages, working, color='green')
    plt.legend((p1[0], p2[0], p3[0]), ('Job-Searching', 'Learning', 'Working'))
    plt.grid(axis='y')
    plt.show()