def plot_distribution_at_time_all_processes(
    process1_traces: np.ndarray,
    process2_traces: np.ndarray,
    process3_traces: np.ndarray,
) -> None:
    """Overlay the terminal histograms of three sets of simulated traces.

    Each argument is passed to ``get_terminal_histogram``, which is expected
    to return an ``(x, y)`` pair; the three pairs are drawn as red, blue and
    green curves. The trace count and the number of time steps shown in the
    chart title are taken from ``process1_traces`` only (its length, and the
    length of its first trace minus one).
    """
    # Local import, as in the original implementation.
    from rl.gen_utils.plot_funcs import plot_list_of_curves

    trace_count = len(process1_traces)
    step_count = len(process1_traces[0]) - 1

    # Build the x / y series in process order (1, 2, 3).
    x_series = []
    y_series = []
    for traces in (process1_traces, process2_traces, process3_traces):
        xs, ys = get_terminal_histogram(traces)
        x_series.append(xs)
        y_series.append(ys)

    curve_colors = ["r", "b", "g"]
    curve_labels = [
        r"Process 1 ($\alpha_1=0.25$)",
        r"Process 2 ($\alpha_2=0.75$)",
        r"Process 3 ($\alpha_3=1.0$)",
    ]
    chart_title = (
        f"Terminal Price Counts (T={step_count:d}, Traces={trace_count:d})"
    )

    plot_list_of_curves(
        x_series,
        y_series,
        curve_colors,
        curve_labels,
        "Terminal Stock Price",
        "Counts",
        chart_title,
    )
def plot_single_trace_all_processes(
    process1_trace: np.ndarray,
    process2_trace: np.ndarray,
    process3_trace: np.ndarray
) -> None:
    """Draw one simulated trace per process on a single chart.

    All three curves share the x-axis ``range(len(process1_trace))``, so the
    traces are presumably of equal length (not checked here).
    """
    # Local import, as in the original implementation.
    from rl.gen_utils.plot_funcs import plot_list_of_curves

    step_axis = range(len(process1_trace))
    x_series = [step_axis, step_axis, step_axis]
    y_series = [process1_trace, process2_trace, process3_trace]
    curve_colors = ["r", "b", "g"]
    curve_labels = [
        r"Process 1 ($\alpha_1=0.25$)",
        r"Process 2 ($\alpha_2=0.75$)",
        r"Process 3 ($\alpha_3=1.0$)"
    ]

    plot_list_of_curves(
        x_series,
        y_series,
        curve_colors,
        curve_labels,
        "Time Steps",
        "Stock Price",
        "Single-Trace Simulation for Each Process"
    )
# Compare test-set RMSE of a linear model and a DNN model across SGD updates.
# NOTE(review): both loops draw from the same `training_data_gen`; confirm
# that sharing it between the two models is intended.

print("Linear Model SGD")
print("----------------")
linear_model_rmse_seq: List[float] = []
# `enumerate(..., start=1)` gives the 1-based iteration number directly,
# instead of binding a variable named `iter` that shadows the builtin.
for iteration, lfa in enumerate(
    islice(
        get_linear_model().iterate_updates(training_data_gen),
        training_iterations
    ),
    start=1
):
    this_rmse: float = lfa.rmse(test_data)
    linear_model_rmse_seq.append(this_rmse)
    print(f"Iteration {iteration:d}: RMSE = {this_rmse:.3f}")

print("DNN Model SGD")
print("-------------")
dnn_model_rmse_seq: List[float] = []
for iteration, dfa in enumerate(
    islice(
        get_dnn_model().iterate_updates(training_data_gen),
        training_iterations
    ),
    start=1
):
    # `this_rmse` is already declared as float above; no second annotation.
    this_rmse = dfa.rmse(test_data)
    dnn_model_rmse_seq.append(this_rmse)
    print(f"Iteration {iteration:d}: RMSE = {this_rmse:.3f}")

# Assumes each update stream yields at least `training_iterations` items so
# the x and y series have equal length -- TODO confirm.
x_vals = range(training_iterations)
plot_list_of_curves(
    list_of_x_vals=[x_vals, x_vals],
    list_of_y_vals=[linear_model_rmse_seq, dnn_model_rmse_seq],
    list_of_colors=["b", "r"],
    list_of_curve_labels=["Linear Model", "Deep Neural Network Model"],
    x_label="Iterations of Gradient Descent",
    y_label="Root Mean Square Error",
    title="RMSE across Iterations of Gradient Descent"
)
def get_unit_sigmoid_func(alpha: float) -> Callable[[float], float]:
    """Return the function ``x -> 1 / (1 + (1 / x - 1) ** alpha)``.

    Entries of ``x`` equal to 0 are replaced by ``VSML`` before the
    reciprocal is taken (presumably a very small positive constant that
    avoids division by zero -- defined elsewhere in this file). Because
    ``np.where`` is used, the returned callable also accepts numpy arrays.

    ``alpha`` is bound as a default argument of the returned callable, so it
    is captured at creation time and may still be overridden per call.
    """
    return lambda x, alpha=alpha: 1. / (
        1 + (1 / np.where(x == 0, VSML, x) - 1) ** alpha
    )


if __name__ == '__main__':

    from rl.gen_utils.plot_funcs import plot_list_of_curves

    # The same exponents, colours and labels are used for both charts.
    alpha = [2.0, 1.0, 0.5]
    colors = ["r", "b", "g"]
    labels = [(r"$\alpha$ = %.1f" % a) for a in alpha]

    # Chart 1: logistic functions on [-3, 3].
    logistics = [get_logistic_func(a) for a in alpha]
    x_vals = np.arange(-3.0, 3.01, 0.01)
    y_vals = [f(x_vals) for f in logistics]
    plot_list_of_curves(
        [x_vals] * len(logistics),
        y_vals,
        colors,
        labels,
        title="Logistic Functions"
    )

    # Chart 2: unit-sigmoid functions on [0, 1].
    unit_sigmoids = [get_unit_sigmoid_func(a) for a in alpha]
    x_vals = np.arange(0.0, 1.01, 0.01)
    y_vals = [f(x_vals) for f in unit_sigmoids]
    plot_list_of_curves(
        # One x-series per unit-sigmoid curve (previously sized from the
        # unrelated `logistics` list).
        [x_vals] * len(unit_sigmoids),
        y_vals,
        colors,
        labels,
        title="Unit-Sigmoid Functions"
    )
# Build the binomial-tree model from the script-level parameters.
opt_ex_bin_tree: OptimalExerciseBinTree = OptimalExerciseBinTree(
    spot_price=spot_price_val,
    payoff=opt_payoff,
    expiry=expiry_val,
    rate=rate_val,
    vol=vol_val,
    num_steps=num_steps_val,
)

# Split the sequence of (value function, policy) pairs into two sequences.
vf_seq, policy_seq = zip(*opt_ex_bin_tree.get_opt_vf_and_policy())

# Exercise boundary as (time, price) pairs, then unzipped into plot axes.
ex_boundary: Sequence[Tuple[float, float]] = \
    opt_ex_bin_tree.option_exercise_boundary(policy_seq, is_call)
time_pts, ex_bound_pts = zip(*ex_boundary)

# The same text serves as both the curve label and the chart title.
if is_call:
    label = "Call" + " Option Exercise Boundary"
else:
    label = "Put" + " Option Exercise Boundary"

plot_list_of_curves(
    list_of_x_vals=[time_pts],
    list_of_y_vals=[ex_bound_pts],
    list_of_colors=["b"],
    list_of_curve_labels=[label],
    x_label="Time",
    y_label="Underlying Price",
    title=label,
)

european: float = opt_ex_bin_tree.european_price(is_call, strike)
print(f"European Price = {european:.3f}")

# First entry of the first value function in the sequence; presumably the
# value at the initial node of the tree -- verify against the model class.
am_price: float = vf_seq[0][0]
print(f"American Price = {am_price:.3f}")
bin_tree_ex_boundary: Sequence[Tuple[float, float]] = \ opt_ex_bin_tree.option_exercise_boundary(policy_seq, False) bin_tree_x, bin_tree_y = zip(*bin_tree_ex_boundary) lspi_x, lspi_y = put_option_exercise_boundary(func=flspi, expiry=expiry_val, num_steps=num_steps_lspi, strike=strike_val) dql_x, dql_y = put_option_exercise_boundary(func=fdql, expiry=expiry_val, num_steps=num_steps_dql, strike=strike_val) plot_list_of_curves(list_of_x_vals=[lspi_x, dql_x, bin_tree_x], list_of_y_vals=[lspi_y, dql_y, bin_tree_y], list_of_colors=["b", "r", "g"], list_of_curve_labels=["LSPI", "DQL", "Binary Tree"], x_label="Time", y_label="Underlying Price", title="LSPI, DQL, Binary Tree Exercise Boundaries") scoring_data: np.ndarray = scoring_sim_data(expiry=expiry_val, num_steps=num_steps_scoring, num_paths=num_scoring_paths, spot_price=spot_price_val, rate=rate_val, vol=vol_val) print(f"European Put Price = {european_price:.3f}") print(f"Binary Tree Price = {bin_tree_price:.3f}") lspi_opt_price: float = option_price(
return([x for x, _ in pairs], [y for _, y in pairs]) if __name__ == '__main__': game_size = 100 snakes_ladders_map = {3:39, 7:48, 12:51, 20:41, 25:57, 28:35, 31:6, 38:1, 45:74, 49:8, 53:17, 60:85, 67:90, 69:92, 70:34, 76:37, 77:83, 82:63, 88:50, 94:42, 98:54} sl_mp = SnackLaddersMPFinite(snakes_ladders_map, game_size) #print(sl_mp) #Plot some traces start_d = Categorical({PlayerState(1):1}) process_traces=sl_mp.traces(start_d) list_traces = [] for i in range(5) : trace_points = generate_trace(process_traces) list_traces.append(trace_points) x_values = [] plot_list_of_curves([range(1, len(trace)+1) for trace in list_traces], list_traces, ['b', 'g', 'r', 'y', 'black'], \ [f'Trace {i}' for i in range(len(list_traces))], "Number of time steps", "Square on the board", "Snakes and Ladders game traces") #Generate number of time steps distribution : hist = generate_time_steps_distribution(process_traces, (10000)) plt.bar(hist[0], hist[1], width=1) plt.xlabel("Number of steps") plt.ylabel("Counts") plt.title("Number of time steps; Traces = 10000 ") plt.show()
# NOTE: the "Actor-Critic with Advantage" curve (y3) is not plotted; only
# y0, y1, y2 and y4 are drawn below.
plot_list_of_curves(
    [x, x, x, x],
    [y0, y1, y2, y4],
    # Exactly one colour per plotted curve (a stray fifth colour "y" used to
    # be passed although only four curves are drawn).
    ["r", "b", "g", "k"],
    ["True", "REINFORCE", "Actor-Critic", "Actor-Critic with TD Error"],
    "Iteration",
    "Action",
    "Action for Initial Wealth at Time 0"
)

print("Policy Gradient Solution")
print("------------------------")
print()

# Materialise the first 10000 * steps policies from the stream.
opt_policies: Sequence[FunctionApprox[NonTerminal[AssetAllocState]]] = \
    list(itertools.islice(actor_critic_error_policies, 10000 * steps))

# For each time step, average the action over all collected policies,
# evaluated at the state (init_wealth, t).
for t in range(steps):
    opt_alloc: float = np.mean(
        [p(NonTerminal((init_wealth, t))) for p in opt_policies]
    )
    print(f"Time {t:d}: Optimal Risky Allocation = {opt_alloc:.3f}")
SnakesAndLadders(start=list_start_pos[i], end=list_end_pos[i]) ] grid_size = 100 #Below it corresponds to the execution of question 1/2 game = SnakesAndLaddersGame(grid_size=grid_size, grid=grid) print("Transition Map") print("--------------") print(game) #We use the process_traces function to get access to a distribution of time steps to finish the game array_length = process_traces(10000, 10000, game) x, y = get_terminal_histogram(array_length) plot_list_of_curves([x], [y], ["r"], [r"Snakes and Ladders Game"], "Time Steps to finish the game", "Counts", "Distribution of the time steps to finish the game") #QUESTION 4 game_reward = SnakesAndLaddersRewards(grid_size=grid_size, grid=grid) expected_steps = game_reward.get_value_function_vec(gamma=1) print(expected_steps) #QUESTION 5 rewards = process1_reward_traces(start_price=100, level_param=100, alpha1=0.25, time_steps=100, num_traces=100) value_function = compute_value_function(start_price=100, level_param=100,
γ=gamma ), num_transitions )) td_vf: np.ndarray = td_func.evaluate(nt_states) num_polynomials: int = 5 features: Sequence[Callable[[NonTerminal[int]], float]] = \ laguerre_state_features(num_polynomials) lstd_transitions: Iterable[TransitionStep[int]] = \ itertools.islice(transitions, num_transitions) epsilon: float = 1e-4 lstd_func: LinearFunctionApprox[NonTerminal[int]] = \ least_squares_td( transitions=lstd_transitions, feature_functions=features, γ=gamma, ε=epsilon ) lstd_vf: np.ndarray = lstd_func.evaluate(nt_states) x_vals: Sequence[int] = [s.state for s in nt_states] plot_list_of_curves([x_vals, x_vals, x_vals], [true_vf, td_vf, lstd_vf], [ "b", "g", "r" ], ["True Value Function", "Tabular TD Value Function", "LSTD Value Function"], x_label="States", y_label="Value Function", title="Tabular TD and LSTD versus True Value Function")
time_steps = [i * horizon / intervals for i in range(intervals)] optimal_consumption_rate: Sequence[float] = [ mp.fractional_consumption_rate(i) for i in time_steps ] expected_portfolio_return: float = mp.portfolio_return() expected_wealth_growth: Sequence[float] = [ mp.wealth_growth_rate(i) for i in time_steps ] plot_list_of_curves( [time_steps] * 3, [ optimal_consumption_rate, expected_wealth_growth, [expected_portfolio_return] * intervals ], ["b", "g", "r"], [ "Fractional Consumption Rate", "Expected Wealth Growth Rate", "Expected Portfolio Annual Return = %.1f%%" % (expected_portfolio_return * 100) ], x_label="Time in years", y_label="Annual Rate", title="Fractional Consumption and Expected Wealth Growth") extended_time_steps = time_steps + [horizon] expected_wealth: Sequence[float] = [ mp.expected_wealth(i) for i in extended_time_steps ] plot_list_of_curves([extended_time_steps], [expected_wealth], ["b"], ["Expected Wealth"], x_label="Time in Years", y_label="Wealth",
from rl.gen_utils.plot_funcs import plot_list_of_curves
from rl.markov_process import NonTerminal

villagers: int = 20
vampire_mdp: VampireMDP = VampireMDP(villagers)

# Solution from `vi_vf_and_policy` (used as the "true" reference below).
true_vf, true_policy = vampire_mdp.vi_vf_and_policy()
pprint(true_vf)
print(true_policy)

# Solution from `lspi_vf_and_policy`.
lspi_vf, lspi_policy = vampire_mdp.lspi_vf_and_policy()
pprint(lspi_vf)
print(lspi_policy)

# States 1..villagers: value functions are keyed by NonTerminal(state),
# policies by the raw state.
states = range(1, villagers + 1)
nt_states = [NonTerminal(s) for s in states]

true_vf_vals = [true_vf[nt] for nt in nt_states]
lspi_vf_vals = [lspi_vf[nt] for nt in nt_states]
true_policy_actions = [true_policy.action_for[s] for s in states]
lspi_policy_actions = [lspi_policy.action_for[s] for s in states]

# Chart 1: value functions.
plot_list_of_curves(
    [states, states],
    [true_vf_vals, lspi_vf_vals],
    ["r", "b"],
    ["True Optimal VF", "LSPI-Estimated Optimal VF"],
    x_label="States",
    y_label="Optimal Values",
    title="True Optimal VF versus LSPI-Estimated Optimal VF"
)

# Chart 2: policies.
plot_list_of_curves(
    [states, states],
    [true_policy_actions, lspi_policy_actions],
    ["r", "b"],
    ["True Optimal Policy", "LSPI-Estimated Optimal Policy"],
    x_label="States",
    y_label="Optimal Actions",
    title="True Optimal Policy versus LSPI-Estimated Optimal Policy"
)
# Expected cumulative regret of the decaying epsilon-greedy agent
# (`decay_eg` is constructed earlier in the script).
decay_eg_cum_regret = decay_eg.get_expected_cum_regret(mu_star)

# Epsilon-greedy with epsilon = eps and a half-life of 1e8 (presumably large
# enough that epsilon is effectively constant -- verify against the class).
eg = EpsilonGreedy(
    arm_distributions=arm_distrs,
    time_steps=steps,
    num_episodes=episodes,
    epsilon=eps,
    epsilon_half_life=1e8,
    count_init=ci,
    mean_init=mi
)
eg_cum_regret = eg.get_expected_cum_regret(mu_star)

# Same configuration with epsilon = 0.0, labelled "Greedy" in the chart.
greedy = EpsilonGreedy(
    arm_distributions=arm_distrs,
    time_steps=steps,
    num_episodes=episodes,
    epsilon=0.0,
    epsilon_half_life=1e8,
    count_init=ci,
    mean_init=mi
)
greedy_cum_regret = greedy.get_expected_cum_regret(mu_star)

plot_list_of_curves(
    [range(1, steps + 1), range(1, steps + 1), range(1, steps + 1)],
    [greedy_cum_regret, eg_cum_regret, decay_eg_cum_regret],
    ["r", "b", "g"],
    # Raw strings: "\e" is not a valid escape sequence, so the non-raw
    # literals triggered an invalid-escape warning. The text is unchanged.
    ["Greedy", r"$\epsilon$-Greedy", r"Decaying $\epsilon$-Greedy"],
    x_label="Time Steps",
    y_label="Expected Total Regret",
    title="Total Regret"
)
# Average each action series over consecutive windows of `plot_period`
# episodes, skipping the first `start` windows.
plot_period: int = 200
start: int = 50

num_windows = int(num_episodes / plot_period)
window_ids = range(start, num_windows)

# The same x-axis list (window start positions) is shared by all four curves.
x_vals = [[w * plot_period for w in window_ids]] * 4

y_vals = []
for y in (y0, y1, y2, y4):
    window_means = [
        np.mean(y[w * plot_period:(w + 1) * plot_period])
        for w in window_ids
    ]
    y_vals.append(window_means)

print(x_vals)
print(y_vals)

plot_list_of_curves(
    x_vals,
    y_vals,
    ["k--", "r-x", "g-.", "b-"],
    ["True", "REINFORCE", "Actor-Critic", "Actor-Critic with TD Error"],
    "Iteration",
    "Action",
    "Action for Initial Wealth at Time 0"
)

print("Policy Gradient Solution")
print("------------------------")
print()

# Materialise the first 10000 * steps policies from the stream.
opt_policies: Sequence[FunctionApprox[NonTerminal[AssetAllocState]]] = \
    list(itertools.islice(actor_critic_error_policies, 10000 * steps))

# For each time step, average the action over all collected policies,
# evaluated at the state (init_wealth, t).
for t in range(steps):
    policy_actions = [p(NonTerminal((init_wealth, t))) for p in opt_policies]
    opt_alloc: float = np.mean(policy_actions)
    print(f"Time {t:d}: Optimal Risky Allocation = {opt_alloc:.3f}")
    print()
def plot_finish_time_distr(finish_times: np.ndarray, num_traces: int):
    """Plot the histogram of ``finish_times`` as a single blue curve.

    ``finish_times`` is passed to ``get_finish_time_histogram``, which is
    expected to return an ``(x, y)`` pair. ``num_traces`` is used only in the
    curve label.
    """
    histogram = get_finish_time_histogram(finish_times)
    x_finish_time, y_counter = histogram

    curve_label = f"Finish Time Distribution (traces={num_traces:d})"

    plot_list_of_curves(
        [x_finish_time],
        [y_counter],
        ["b"],
        [curve_label],
        "Finish Time",
        "Counts",
        r"Finish Time Distribution"
    )