def actor_critic_advantage_gaussian(
    mdp: MarkovDecisionProcess[S, float],
    policy_mean_approx0: FunctionApprox[NonTerminal[S]],
    q_value_func_approx0: QValueFunctionApprox[S, float],
    value_func_approx0: ValueFunctionApprox[S],
    start_states_distribution: NTStateDistribution[S],
    policy_stdev: float,
    gamma: float,
    max_episode_length: float
) -> Iterator[FunctionApprox[NonTerminal[S]]]:
    policy_mean_approx: FunctionApprox[NonTerminal[S]] = policy_mean_approx0
    yield policy_mean_approx
    q: QValueFunctionApprox[S, float] = q_value_func_approx0
    v: ValueFunctionApprox[S] = value_func_approx0
    while True:
        steps: int = 0
        gamma_prod: float = 1.0
        state: NonTerminal[S] = start_states_distribution.sample()
        action: float = Gaussian(
            μ=policy_mean_approx(state),
            σ=policy_stdev
        ).sample()
        while isinstance(state, NonTerminal) and steps < max_episode_length:
            next_state, reward = mdp.step(state, action).sample()
            if isinstance(next_state, NonTerminal):
                next_action: float = Gaussian(
                    μ=policy_mean_approx(next_state),
                    σ=policy_stdev
                ).sample()
                # critic updates: SARSA-style bootstrap for Q, TD(0) for V
                q = q.update([(
                    (state, action),
                    reward + gamma * q((next_state, next_action))
                )])
                v = v.update([(state, reward + gamma * v(next_state))])
                action = next_action
            else:
                q = q.update([((state, action), reward)])
                v = v.update([(state, reward)])

            def obj_deriv_out(
                states: Sequence[NonTerminal[S]],
                actions: Sequence[float]
            ) -> np.ndarray:
                return (policy_mean_approx.evaluate(states) -
                        np.array(actions)) / (policy_stdev * policy_stdev)

            grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                policy_mean_approx.objective_gradient(
                    xy_vals_seq=[(state, action)],
                    obj_deriv_out_fun=obj_deriv_out
                )
            # advantage estimate Q(s, a) - V(s) scales the policy-gradient step
            scaled_grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                grad * gamma_prod * (q((state, action)) - v(state))
            policy_mean_approx = \
                policy_mean_approx.update_with_gradient(scaled_grad)
            yield policy_mean_approx
            gamma_prod *= gamma
            steps += 1
            state = next_state
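# Side note (not part of the original function): `obj_deriv_out` above is the
# negative of the Gaussian score function d/dμ log π(a|s) = (a - μ(s)) / σ²;
# the sign is flipped on the assumption that `update_with_gradient` takes a
# descent step, which turns the update into gradient *ascent* on expected
# return. Below is a small, self-contained finite-difference sanity check of
# that score, with hypothetical helper names and only numpy assumed:
import numpy as np


def gaussian_log_pdf(a: float, mu: float, sigma: float) -> float:
    # log density of N(mu, sigma^2) evaluated at a
    return -0.5 * np.log(2 * np.pi * sigma * sigma) \
        - (a - mu) * (a - mu) / (2 * sigma * sigma)


def score_wrt_mean(a: float, mu: float, sigma: float, eps: float = 1e-6) -> float:
    # central finite difference of log π with respect to the mean;
    # should match (a - mu) / sigma**2 up to O(eps^2)
    return (gaussian_log_pdf(a, mu + eps, sigma)
            - gaussian_log_pdf(a, mu - eps, sigma)) / (2 * eps)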
    def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]:
        # Bayesian update based on the treatment in
        # https://people.eecs.berkeley.edu/~jordan/courses/260-spring10/lectures/lecture5.pdf
        # (Section 3 on page 5, where both the mean and the
        # variance are random)
        ep_rewards: ndarray = empty(self.time_steps)
        ep_actions: ndarray = empty(self.time_steps, dtype=int)
        bayes: List[Tuple[float, int, float, float]] = \
            [(self.theta0, self.n0, self.alpha0, self.beta0)] * self.num_arms

        for i in range(self.time_steps):
            mean_draws: Sequence[float] = [
                Gaussian(
                    μ=theta,
                    σ=1 / sqrt(n * Gamma(α=alpha, β=beta).sample())
                ).sample() for theta, n, alpha, beta in bayes
            ]
            action: int = max(enumerate(mean_draws), key=itemgetter(1))[0]
            reward: float = self.arm_distributions[action].sample()
            theta, n, alpha, beta = bayes[action]
            bayes[action] = (
                (reward + n * theta) / (n + 1),
                n + 1,
                alpha + 0.5,
                beta + 0.5 * n / (n + 1) * (reward - theta) * (reward - theta)
            )
            ep_rewards[i] = reward
            ep_actions[i] = action
        return ep_rewards, ep_actions
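# For readability, the Normal-Gamma posterior update embedded in the loop above
# can be pulled out into a standalone helper. This is a sketch restating the
# same arithmetic (hypothetical function name, not part of the original class):
from typing import Tuple


def normal_gamma_update(
    prior: Tuple[float, int, float, float],
    reward: float
) -> Tuple[float, int, float, float]:
    # prior = (theta, n, alpha, beta): mean estimate, pseudo-count of
    # observations, and Gamma(shape, rate) parameters of the unknown precision
    theta, n, alpha, beta = prior
    return (
        (reward + n * theta) / (n + 1),                    # updated mean
        n + 1,                                             # one more observation
        alpha + 0.5,                                       # shape grows by 1/2 per reward
        beta + 0.5 * n / (n + 1) * (reward - theta) ** 2   # rate absorbs the squared surprise
    )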
def actor_critic_td_error_gaussian(
    mdp: MarkovDecisionProcess[S, float],
    policy_mean_approx0: FunctionApprox[NonTerminal[S]],
    value_func_approx0: ValueFunctionApprox[S],
    start_states_distribution: NTStateDistribution[S],
    policy_stdev: float,
    gamma: float,
    max_episode_length: float
) -> Iterator[FunctionApprox[NonTerminal[S]]]:
    policy_mean_approx: FunctionApprox[NonTerminal[S]] = policy_mean_approx0
    yield policy_mean_approx
    vf: ValueFunctionApprox[S] = value_func_approx0
    while True:
        steps: int = 0
        gamma_prod: float = 1.0
        state: NonTerminal[S] = start_states_distribution.sample()
        while isinstance(state, NonTerminal) and steps < max_episode_length:
            action: float = Gaussian(
                μ=policy_mean_approx(state),
                σ=policy_stdev
            ).sample()
            next_state, reward = mdp.step(state, action).sample()
            if isinstance(next_state, NonTerminal):
                td_target: float = reward + gamma * vf(next_state)
            else:
                td_target = reward
            # TD error δ = r + γV(s') - V(s) serves as the advantage estimate
            td_error: float = td_target - vf(state)
            vf = vf.update([(state, td_target)])

            def obj_deriv_out(
                states: Sequence[NonTerminal[S]],
                actions: Sequence[float]
            ) -> np.ndarray:
                return (policy_mean_approx.evaluate(states) -
                        np.array(actions)) / (policy_stdev * policy_stdev)

            grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                policy_mean_approx.objective_gradient(
                    xy_vals_seq=[(state, action)],
                    obj_deriv_out_fun=obj_deriv_out
                )
            scaled_grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                grad * gamma_prod * td_error
            policy_mean_approx = \
                policy_mean_approx.update_with_gradient(scaled_grad)
            yield policy_mean_approx
            gamma_prod *= gamma
            steps += 1
            state = next_state
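# A minimal usage sketch for either actor-critic generator above: each
# generator yields one updated policy-mean FunctionApprox per atomic
# experience, so a caller just slices off a fixed number of updates and keeps
# the last one. All argument names here (`my_mdp`, `mean_fa`, `vf_fa`,
# `start_distribution`) are hypothetical placeholders for objects constructed
# elsewhere, and the hyperparameter values are illustrative only.
import itertools


def train_td_error_actor_critic(
    my_mdp, mean_fa, vf_fa, start_distribution, num_updates: int = 100_000
):
    updates = actor_critic_td_error_gaussian(
        mdp=my_mdp,
        policy_mean_approx0=mean_fa,
        value_func_approx0=vf_fa,
        start_states_distribution=start_distribution,
        policy_stdev=0.5,
        gamma=0.97,
        max_episode_length=100
    )
    final = None
    for approx in itertools.islice(updates, num_updates):
        final = approx
    return final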
        num_state_samples=num_state_samples,
        error_tolerance=error_tolerance
    )


if __name__ == '__main__':
    print("Solving Problem 3")
    from pprint import pprint

    spot_price_val: float = 100.0
    strike: float = 100.0
    is_call: bool = False
    expiry_val: int = 3
    rate_val: float = 0.05
    vol_val: float = 0.25
    num_steps_val: int = 300

    sigma: float = 1.0
    asset_price_distribution: SampledDistribution[float] = Gaussian(
        strike, sigma
    )

    if is_call:
        opt_payoff = lambda x: max(x - strike, 0)
    else:
        opt_payoff = lambda x: max(strike - x, 0)

    feature_funcs: Sequence[Callable[[Tuple[float, float]], float]] = [
        lambda _: 1.,
        lambda w_x: w_x[0]
    ]
    dnn: DNNSpec = DNNSpec(
        neurons=[],
        bias=False,
        hidden_activation=lambda x: x,
    def setUp(self):
        self.unit = Gaussian(1.0, 1.0, 100000)
        self.large = Gaussian(10.0, 30.0, 100000)
            theta, n, alpha, beta = bayes[action]
            bayes[action] = (
                (reward + n * theta) / (n + 1),
                n + 1,
                alpha + 0.5,
                beta + 0.5 * n / (n + 1) * (reward - theta) * (reward - theta)
            )
            ep_rewards[i] = reward
            ep_actions[i] = action
        return ep_rewards, ep_actions


if __name__ == '__main__':
    means_vars_data = [(9., 5.), (10., 2.), (0., 4.),
                       (6., 10.), (2., 20.), (4., 1.)]
    mu_star = max(means_vars_data, key=itemgetter(0))[0]
    steps = 1000
    episodes = 500
    guess_mean = 0.
    guess_stdev = 10.

    arm_distrs = [Gaussian(μ=m, σ=s) for m, s in means_vars_data]
    ts_gaussian = ThompsonSamplingGaussian(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        init_mean=guess_mean,
        init_stdev=guess_stdev
    )
    # exp_cum_regret = ts_gaussian.get_expected_cum_regret(mu_star)
    # print(exp_cum_regret)
    # exp_act_count = ts_gaussian.get_expected_action_counts()
    # print(exp_act_count)
    ts_gaussian.plot_exp_cum_regret_curve(mu_star)
from pprint import pprint

steps: int = 4
μ: float = 0.13
σ: float = 0.2
r: float = 0.07
a: float = 1.0
init_wealth: float = 1.0
init_wealth_var: float = 0.1

excess: float = μ - r
var: float = σ * σ
base_alloc: float = excess / (a * var)

risky_ret: Sequence[Gaussian] = [Gaussian(μ=μ, σ=σ) for _ in range(steps)]
riskless_ret: Sequence[float] = [r for _ in range(steps)]
utility_function: Callable[[float], float] = lambda x: -np.exp(-a * x) / a
alloc_choices: Sequence[float] = np.linspace(
    2 / 3 * base_alloc,
    4 / 3 * base_alloc,
    11
)
feature_funcs: Sequence[Callable[[Tuple[float, float]], float]] = [
    lambda _: 1.0,
    lambda w_x: w_x[0],
    lambda w_x: w_x[1],
    lambda w_x: w_x[1] * w_x[1],
]
dnn: DNNSpec = DNNSpec(
    neurons=[],
    bias=False,
    hidden_activation=lambda x: x,
def plot_gaussian_algorithms() -> None:
    means_vars_data = [(0., 10.), (2., 20.), (4., 1.), (6., 8.),
                       (8., 4.), (9., 6.), (10., 4.)]
    mu_star = max(means_vars_data, key=itemgetter(0))[0]
    steps = 500
    episodes = 500
    eps = 0.3
    eps_hl = 400
    ci = 5
    mi = mu_star * 3.
    ts_mi = 0.
    ts_si = 10.
    lr = 0.1
    lr_decay = 20.

    arm_distrs = [Gaussian(μ=m, σ=s) for m, s in means_vars_data]
    greedy_opt_init = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=0.,
        epsilon_half_life=1e8,
        count_init=ci,
        mean_init=mi
    )
    eps_greedy = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=eps,
        epsilon_half_life=1e8,
        count_init=0,
        mean_init=0.
    )
    decay_eps_greedy = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=eps,
        epsilon_half_life=eps_hl,
        count_init=0,
        mean_init=0.
    )
    ts = ThompsonSamplingGaussian(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        init_mean=ts_mi,
        init_stdev=ts_si
    )
    grad_bandits = GradientBandits(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        learning_rate=lr,
        learning_rate_decay=lr_decay
    )
    plot_colors = ['r', 'b', 'g', 'k', 'y']
    # raw strings so the TeX escapes (\epsilon, \mu, \sigma) are not treated
    # as Python string escapes
    labels = [
        'Greedy, Optimistic Initialization',
        r'$\epsilon$-Greedy',
        r'Decaying $\epsilon$-Greedy',
        'Thompson Sampling',
        'Gradient Bandit'
    ]
    exp_cum_regrets = [
        greedy_opt_init.get_expected_cum_regret(mu_star),
        eps_greedy.get_expected_cum_regret(mu_star),
        decay_eps_greedy.get_expected_cum_regret(mu_star),
        ts.get_expected_cum_regret(mu_star),
        grad_bandits.get_expected_cum_regret(mu_star)
    ]
    x_vals = range(1, steps + 1)
    for i in range(len(exp_cum_regrets)):
        plt.plot(exp_cum_regrets[i], color=plot_colors[i], label=labels[i])
    plt.xlabel("Time Steps", fontsize=20)
    plt.ylabel("Expected Total Regret", fontsize=20)
    plt.title("Total Regret Curves", fontsize=25)
    plt.xlim(xmin=x_vals[0], xmax=x_vals[-1])
    plt.ylim(ymin=0.0)
    plt.grid(True)
    plt.legend(loc='upper left', fontsize=15)
    plt.show()

    exp_act_counts = [
        greedy_opt_init.get_expected_action_counts(),
        eps_greedy.get_expected_action_counts(),
        decay_eps_greedy.get_expected_action_counts(),
        ts.get_expected_action_counts(),
        grad_bandits.get_expected_action_counts()
    ]
    index = arange(len(means_vars_data))
    spacing = 0.4
    width = (1 - spacing) / len(exp_act_counts)
    for i in range(len(exp_act_counts)):
        plt.bar(
            index - (1 - spacing) / 2 + (i - 1.5) * width,
            exp_act_counts[i],
            width,
            color=plot_colors[i],
            label=labels[i]
        )
    plt.xlabel("Arms", fontsize=20)
    plt.ylabel("Expected Counts of Arms", fontsize=20)
    plt.title("Arms Counts Plot", fontsize=25)
    plt.xticks(
        index - 0.3,
        [r"$\mu$=%.1f,$\sigma$=%.1f" % (m, s) for m, s in means_vars_data]
    )
    plt.legend(loc='upper left', fontsize=15)
    plt.tight_layout()
    plt.show()
if __name__ == '__main__':
    from rl.distribution import Gaussian

    init_price_mean: float = 100.0
    init_price_stdev: float = 10.0
    num_shares: int = 100
    num_time_steps: int = 5
    alpha: float = 0.03
    beta: float = 0.05

    price_diff = [lambda p_s: beta * p_s.shares for _ in range(num_time_steps)]
    dynamics = [
        lambda p_s: Gaussian(μ=p_s.price - alpha * p_s.shares, σ=0.)
        for _ in range(num_time_steps)
    ]
    ffs = [
        lambda p_s: p_s.state.price * p_s.state.shares,
        lambda p_s: float(p_s.state.shares * p_s.state.shares)
    ]
    fa: FunctionApprox = LinearFunctionApprox.create(feature_functions=ffs)
    init_price_distrib: Gaussian = Gaussian(
        μ=init_price_mean,
        σ=init_price_stdev
    )

    ooe: OptimalOrderExecution = OptimalOrderExecution(
        shares=num_shares,
        time_steps=num_time_steps,
        avg_exec_price_diff=price_diff,
        price_dynamics=dynamics,
    def act(self, state: NonTerminal[S]) -> Gaussian:
        return Gaussian(μ=self.function_approx(state), σ=self.stdev)
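# A minimal sketch of how this policy would be used to simulate one transition
# (assuming `policy` is an instance of the enclosing class and `mdp`, `state`
# come from the same rl-book interfaces; the helper name is hypothetical):
def sample_transition(policy, mdp, state):
    # draw a float action from the Gaussian policy, then sample the MDP step
    action = policy.act(state).sample()
    return mdp.step(state, action).sample()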
    num_shares: int = 100
    x: int = 1
    num_time_steps: int = 5
    alpha: float = 0.03
    beta: float = 0.05
    mu_z: float = 0.
    sigma_z: float = 1.
    theta: float = 0.05
    rho: float = 1.

    price_diff = [
        lambda p_s: beta * p_s.shares * p_s.price - theta * p_s.price * p_s.x
        for _ in range(num_time_steps)
    ]
    dynamics = [
        lambda p_s: Gaussian(μ=p_s.price * mu_z, σ=p_s.price**2 * sigma_z)
        for _ in range(num_time_steps)
    ]
    # dynamics_x = [lambda p_s: p_s.x * rho + Uniform()
    #               for _ in range(num_time_steps)]
    ffs = [
        lambda p_s: p_s.price * p_s.shares,
        lambda p_s: float(p_s.shares * p_s.shares)
    ]
    fa: FunctionApprox = LinearFunctionApprox.create(feature_functions=ffs)
    init_price_distrib: Gaussian = Gaussian(
        μ=init_price_mean,
        σ=init_price_stdev
    )
    init_x_distrib: Constant = Constant(x)
    ooe: OptimalOrderExecutionCustomized = OptimalOrderExecutionCustomized(
        shares=num_shares,
        time_steps=num_time_steps,