Example #1
def actor_critic_advantage_gaussian(
        mdp: MarkovDecisionProcess[S, float],
        policy_mean_approx0: FunctionApprox[NonTerminal[S]],
        q_value_func_approx0: QValueFunctionApprox[S, float],
        value_func_approx0: ValueFunctionApprox[S],
        start_states_distribution: NTStateDistribution[S], policy_stdev: float,
        gamma: float,
        max_episode_length: float) -> Iterator[FunctionApprox[NonTerminal[S]]]:
    policy_mean_approx: FunctionApprox[NonTerminal[S]] = policy_mean_approx0
    yield policy_mean_approx
    q: QValueFunctionApprox[S, float] = q_value_func_approx0
    v: ValueFunctionApprox[S] = value_func_approx0
    while True:
        steps: int = 0
        gamma_prod: float = 1.0
        state: NonTerminal[S] = start_states_distribution.sample()
        action: float = Gaussian(μ=policy_mean_approx(state),
                                 σ=policy_stdev).sample()
        while isinstance(state, NonTerminal) and steps < max_episode_length:
            next_state, reward = mdp.step(state, action).sample()
            if isinstance(next_state, NonTerminal):
                next_action: float = Gaussian(μ=policy_mean_approx(next_state),
                                              σ=policy_stdev).sample()
                q = q.update([((state, action), reward + gamma * q(
                    (next_state, next_action)))])
                v = v.update([(state, reward + gamma * v(next_state))])
                action = next_action
            else:
                q = q.update([((state, action), reward)])
                v = v.update([(state, reward)])

            def obj_deriv_out(states: Sequence[NonTerminal[S]],
                              actions: Sequence[float]) -> np.ndarray:
                return (policy_mean_approx.evaluate(states) -
                        np.array(actions)) / (policy_stdev * policy_stdev)

            grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                policy_mean_approx.objective_gradient(
                    xy_vals_seq=[(state, action)],
                    obj_deriv_out_fun=obj_deriv_out
            )
            scaled_grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                grad * gamma_prod * (q((state, action)) - v(state))
            policy_mean_approx = \
                policy_mean_approx.update_with_gradient(scaled_grad)
            yield policy_mean_approx
            gamma_prod *= gamma
            steps += 1
            state = next_state
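
# The obj_deriv_out above is the derivative of the Gaussian negative
# log-likelihood with respect to the mean μ, i.e. (μ - a) / σ². A minimal
# standalone check of that expression against a finite difference (numpy only,
# illustrative values, not part of the library):
import numpy as np

def gaussian_neg_log_likelihood(mu: float, a: float, sigma: float) -> float:
    # Negative log-density of N(mu, sigma^2) evaluated at action a
    return 0.5 * np.log(2 * np.pi * sigma * sigma) + \
        (a - mu) * (a - mu) / (2 * sigma * sigma)

mu, a, sigma, eps = 1.3, 0.7, 0.5, 1e-6
analytic = (mu - a) / (sigma * sigma)          # same form as obj_deriv_out
numeric = (gaussian_neg_log_likelihood(mu + eps, a, sigma) -
           gaussian_neg_log_likelihood(mu - eps, a, sigma)) / (2 * eps)
assert abs(analytic - numeric) < 1e-5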
Example #2
    def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]:
        # Bayesian update based on the treatment in
        # https://people.eecs.berkeley.edu/~jordan/courses/260-spring10/lectures/lecture5.pdf
        # (Section 3 on page 5, where both the mean and the
        # variance are random)
        ep_rewards: ndarray = empty(self.time_steps)
        ep_actions: ndarray = empty(self.time_steps, dtype=int)
        bayes: List[Tuple[float, int, float, float]] =\
            [(self.theta0, self.n0, self.alpha0, self.beta0)] * self.num_arms

        for i in range(self.time_steps):
            mean_draws: Sequence[float] = [
                Gaussian(μ=theta,
                         σ=1 /
                         sqrt(n * Gamma(α=alpha, β=beta).sample())).sample()
                for theta, n, alpha, beta in bayes
            ]
            action: int = max(enumerate(mean_draws), key=itemgetter(1))[0]
            reward: float = self.arm_distributions[action].sample()
            theta, n, alpha, beta = bayes[action]
            bayes[action] = ((reward + n * theta) / (n + 1), n + 1,
                             alpha + 0.5, beta + 0.5 * n / (n + 1) *
                             (reward - theta) * (reward - theta))
            ep_rewards[i] = reward
            ep_actions[i] = action
        return ep_rewards, ep_actions
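
# The Normal-Gamma posterior draw and update above can be exercised in
# isolation. A self-contained sketch for a single arm (numpy only, illustrative
# hyperparameters; assumes beta plays the role of the Gamma rate parameter, as
# in the referenced notes):
import numpy as np

rng = np.random.default_rng(0)
theta, n, alpha, beta = 0.0, 1, 1.0, 1.0   # prior hyperparameters (illustrative)

# Posterior draw: precision tau ~ Gamma(alpha, rate=beta),
# then mean ~ N(theta, 1 / (n * tau))
tau = rng.gamma(shape=alpha, scale=1.0 / beta)
mean_draw = rng.normal(loc=theta, scale=1.0 / np.sqrt(n * tau))

# After observing a reward, apply the same hyperparameter update as bayes[action]
reward = 0.8
theta, n, alpha, beta = (
    (reward + n * theta) / (n + 1),
    n + 1,
    alpha + 0.5,
    beta + 0.5 * n / (n + 1) * (reward - theta) ** 2
)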
Example #3
def actor_critic_td_error_gaussian(
        mdp: MarkovDecisionProcess[S, float],
        policy_mean_approx0: FunctionApprox[NonTerminal[S]],
        value_func_approx0: ValueFunctionApprox[S],
        start_states_distribution: NTStateDistribution[S], policy_stdev: float,
        gamma: float,
        max_episode_length: float) -> Iterator[FunctionApprox[NonTerminal[S]]]:
    policy_mean_approx: FunctionApprox[NonTerminal[S]] = policy_mean_approx0
    yield policy_mean_approx
    vf: ValueFunctionApprox[S] = value_func_approx0
    while True:
        steps: int = 0
        gamma_prod: float = 1.0
        state: NonTerminal[S] = start_states_distribution.sample()
        while isinstance(state, NonTerminal) and steps < max_episode_length:
            action: float = Gaussian(μ=policy_mean_approx(state),
                                     σ=policy_stdev).sample()
            next_state, reward = mdp.step(state, action).sample()
            if isinstance(next_state, NonTerminal):
                td_target: float = reward + gamma * vf(next_state)
            else:
                td_target = reward
            td_error: float = td_target - vf(state)
            vf = vf.update([(state, td_target)])

            def obj_deriv_out(states: Sequence[NonTerminal[S]],
                              actions: Sequence[float]) -> np.ndarray:
                return (policy_mean_approx.evaluate(states) -
                        np.array(actions)) / (policy_stdev * policy_stdev)

            grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                policy_mean_approx.objective_gradient(
                    xy_vals_seq=[(state, action)],
                    obj_deriv_out_fun=obj_deriv_out
            )
            scaled_grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                grad * gamma_prod * td_error
            policy_mean_approx = \
                policy_mean_approx.update_with_gradient(scaled_grad)
            yield policy_mean_approx
            gamma_prod *= gamma
            steps += 1
            state = next_state
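
# Compared with Example #1, this variant needs only a state-value approximation:
# the one-step TD error r + γ·vf(s') - vf(s) stands in for the advantage
# Q(s, a) - V(s). A minimal numeric illustration (standalone, illustrative values):
gamma = 0.9
v_s, v_next, reward = 2.0, 3.0, 1.0
td_error = reward + gamma * v_next - v_s   # 1.0 + 0.9 * 3.0 - 2.0 = 1.7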
Example #4
                            num_state_samples=num_state_samples,
                            error_tolerance=error_tolerance)


if __name__ == '__main__':
    print("Solving Problem 3")
    from pprint import pprint
    spot_price_val: float = 100.0
    strike: float = 100.0
    is_call: bool = False
    expiry_val: int = 3
    rate_val: float = 0.05
    vol_val: float = 0.25
    num_steps_val: int = 300
    sigma: float = 1.0
    asset_price_distribution: SampledDistribution[float] = Gaussian(
        strike, sigma)

    if is_call:
        opt_payoff = lambda x: max(x - strike, 0)
    else:
        opt_payoff = lambda x: max(strike - x, 0)


    feature_funcs: Sequence[Callable[[Tuple[float, float]], float]] = \
        [
            lambda _: 1.,
            lambda w_x: w_x[0]
        ]
    dnn: DNNSpec = DNNSpec(neurons=[],
                           bias=False,
                           hidden_activation=lambda x: x,
Example #5
    def setUp(self):
        # Third positional argument is presumably the expectation_samples count
        # used by SampledDistribution when computing sampled expectations.
        self.unit = Gaussian(1.0, 1.0, 100000)
        self.large = Gaussian(10.0, 30.0, 100000)
Example #6
            theta, n, alpha, beta = bayes[action]
            bayes[action] = ((reward + n * theta) / (n + 1), n + 1,
                             alpha + 0.5, beta + 0.5 * n / (n + 1) *
                             (reward - theta) * (reward - theta))
            ep_rewards[i] = reward
            ep_actions[i] = action
        return ep_rewards, ep_actions


if __name__ == '__main__':
    means_vars_data = [(9., 5.), (10., 2.), (0., 4.), (6., 10.), (2., 20.),
                       (4., 1.)]
    mu_star = max(means_vars_data, key=itemgetter(0))[0]
    steps = 1000
    episodes = 500
    guess_mean = 0.
    guess_stdev = 10.

    arm_distrs = [Gaussian(μ=m, σ=s) for m, s in means_vars_data]
    ts_gaussian = ThompsonSamplingGaussian(arm_distributions=arm_distrs,
                                           time_steps=steps,
                                           num_episodes=episodes,
                                           init_mean=guess_mean,
                                           init_stdev=guess_stdev)
    # exp_cum_regret = ts_gaussian.get_expected_cum_regret(mu_star)
    # print(exp_cum_regret)
    # exp_act_count = ts_gaussian.get_expected_action_counts()
    # print(exp_act_count)

    ts_gaussian.plot_exp_cum_regret_curve(mu_star)
Example #7
    from pprint import pprint

    steps: int = 4
    μ: float = 0.13
    σ: float = 0.2
    r: float = 0.07
    a: float = 1.0
    init_wealth: float = 1.0
    init_wealth_var: float = 0.1

    excess: float = μ - r
    var: float = σ * σ
    base_alloc: float = excess / (a * var)

    risky_ret: Sequence[Gaussian] = [Gaussian(μ=μ, σ=σ) for _ in range(steps)]
    riskless_ret: Sequence[float] = [r for _ in range(steps)]
    utility_function: Callable[[float], float] = lambda x: -np.exp(-a * x) / a
    alloc_choices: Sequence[float] = np.linspace(
        2 / 3 * base_alloc, 4 / 3 * base_alloc, 11
    )
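    # Sanity check (illustrative): with μ=0.13, r=0.07, a=1.0, σ=0.2 the base
    # allocation (μ - r) / (a * σ²) = 0.06 / 0.04 = 1.5, so alloc_choices
    # spans 1.0 to 2.0 around it.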
    feature_funcs: Sequence[Callable[[Tuple[float, float]], float]] = [
        lambda _: 1.0,
        lambda w_x: w_x[0],
        lambda w_x: w_x[1],
        lambda w_x: w_x[1] * w_x[1],
    ]
    dnn: DNNSpec = DNNSpec(
        neurons=[],
        bias=False,
        hidden_activation=lambda x: x,
Example #8
def plot_gaussian_algorithms() -> None:
    means_vars_data = [
        (0., 10.),
        (2., 20.),
        (4., 1.),
        (6., 8.),
        (8., 4.),
        (9., 6.),
        (10., 4.)]
    mu_star = max(means_vars_data, key=itemgetter(0))[0]

    steps = 500
    episodes = 500

    eps = 0.3
    eps_hl = 400

    ci = 5
    mi = mu_star * 3.

    ts_mi = 0.
    ts_si = 10.

    lr = 0.1
    lr_decay = 20.

    arm_distrs = [Gaussian(μ=m, σ=s) for m, s in means_vars_data]

    greedy_opt_init = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=0.,
        epsilon_half_life=1e8,
        count_init=ci,
        mean_init=mi
    )
    eps_greedy = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=eps,
        epsilon_half_life=1e8,
        count_init=0,
        mean_init=0.
    )
    decay_eps_greedy = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=eps,
        epsilon_half_life=eps_hl,
        count_init=0,
        mean_init=0.
    )
    ts = ThompsonSamplingGaussian(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        init_mean=ts_mi,
        init_stdev=ts_si
    )
    grad_bandits = GradientBandits(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        learning_rate=lr,
        learning_rate_decay=lr_decay
    )

    plot_colors = ['r', 'b', 'g', 'k', 'y']
    labels = [
        'Greedy, Optimistic Initialization',
        r'$\epsilon$-Greedy',
        r'Decaying $\epsilon$-Greedy',
        'Thompson Sampling',
        'Gradient Bandit'
    ]

    exp_cum_regrets = [
        greedy_opt_init.get_expected_cum_regret(mu_star),
        eps_greedy.get_expected_cum_regret(mu_star),
        decay_eps_greedy.get_expected_cum_regret(mu_star),
        ts.get_expected_cum_regret(mu_star),
        grad_bandits.get_expected_cum_regret(mu_star)
    ]

    x_vals = range(1, steps + 1)
    for i in range(len(exp_cum_regrets)):
        plt.plot(x_vals, exp_cum_regrets[i], color=plot_colors[i],
                 label=labels[i])
    plt.xlabel("Time Steps", fontsize=20)
    plt.ylabel("Expected Total Regret", fontsize=20)
    plt.title("Total Regret Curves", fontsize=25)
    plt.xlim(xmin=x_vals[0], xmax=x_vals[-1])
    plt.ylim(ymin=0.0)
    plt.grid(True)
    plt.legend(loc='upper left', fontsize=15)
    plt.show()

    exp_act_counts = [
        greedy_opt_init.get_expected_action_counts(),
        eps_greedy.get_expected_action_counts(),
        decay_eps_greedy.get_expected_action_counts(),
        ts.get_expected_action_counts(),
        grad_bandits.get_expected_action_counts()
    ]
    index = arange(len(means_vars_data))
    spacing = 0.4
    width = (1 - spacing) / len(exp_act_counts)

    for i in range(len(exp_act_counts)):
        plt.bar(
            index - (1 - spacing) / 2 + (i - 1.5) * width,
            exp_act_counts[i],
            width,
            color=plot_colors[i],
            label=labels[i]
        )
    plt.xlabel("Arms", fontsize=20)
    plt.ylabel("Expected Counts of Arms", fontsize=20)
    plt.title("Arms Counts Plot", fontsize=25)
    plt.xticks(
        index - 0.3,
        ["$\mu$=%.1f,$\sigma$=%.1f" % (m, s) for m, s in means_vars_data]
    )
    plt.legend(loc='upper left', fontsize=15)
    plt.tight_layout()
    plt.show()
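
# An expected-cumulative-regret curve of the kind plotted above can be estimated
# with a few lines of standalone numpy. A minimal sketch of ε-greedy on the same
# Gaussian arms (names and the per-step regret definition mu_star minus the
# received reward are illustrative, not the library's implementation):
import numpy as np

rng = np.random.default_rng(0)
means = np.array([0., 2., 4., 6., 8., 9., 10.])
stdevs = np.array([10., 20., 1., 8., 4., 6., 4.])
mu_star, explore_prob, steps, episodes = means.max(), 0.3, 500, 200

cum_regret = np.zeros(steps)
for _ in range(episodes):
    counts = np.zeros(len(means))
    est_means = np.zeros(len(means))
    for t in range(steps):
        # Explore with probability explore_prob, else pick best estimated arm
        if rng.random() < explore_prob:
            a = int(rng.integers(len(means)))
        else:
            a = int(np.argmax(est_means))
        r = rng.normal(means[a], stdevs[a])
        counts[a] += 1
        est_means[a] += (r - est_means[a]) / counts[a]   # incremental mean
        cum_regret[t] += mu_star - r
expected_cum_regret = np.cumsum(cum_regret / episodes)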
Example #9

if __name__ == '__main__':

    from rl.distribution import Gaussian

    init_price_mean: float = 100.0
    init_price_stdev: float = 10.0
    num_shares: int = 100
    num_time_steps: int = 5
    alpha: float = 0.03
    beta: float = 0.05

    price_diff = [lambda p_s: beta * p_s.shares for _ in range(num_time_steps)]
    dynamics = [
        lambda p_s: Gaussian(μ=p_s.price - alpha * p_s.shares, σ=0.)
        for _ in range(num_time_steps)
    ]
    ffs = [
        lambda p_s: p_s.state.price * p_s.state.shares,
        lambda p_s: float(p_s.state.shares * p_s.state.shares)
    ]
    fa: FunctionApprox = LinearFunctionApprox.create(feature_functions=ffs)
    init_price_distrib: Gaussian = Gaussian(μ=init_price_mean,
                                            σ=init_price_stdev)

    ooe: OptimalOrderExecution = OptimalOrderExecution(
        shares=num_shares,
        time_steps=num_time_steps,
        avg_exec_price_diff=price_diff,
        price_dynamics=dynamics,
Example #10
    def act(self, state: NonTerminal[S]) -> Gaussian:
        # Gaussian policy: mean given by the function approximation, fixed stdev
        return Gaussian(μ=self.function_approx(state), σ=self.stdev)
Example #11
    num_shares: int = 100
    x: int = 1
    num_time_steps: int = 5
    alpha: float = 0.03
    beta: float = 0.05
    mu_z: float = 0.
    sigma_z: float = 1.
    theta: float = 0.05
    rho: float = 1.

    price_diff = [
        lambda p_s: beta * p_s.shares * p_s.price - theta * p_s.price * p_s.x
        for _ in range(num_time_steps)
    ]
    dynamics = [
        lambda p_s: Gaussian(μ=p_s.price * mu_z, σ=p_s.price**2 * sigma_z)
        for _ in range(num_time_steps)
    ]
    # dynamics_x = [lambda p_s: p_s.x * rho + Uniform() for _ in range(num_time_steps)]
    ffs = [
        lambda p_s: p_s.price * p_s.shares,
        lambda p_s: float(p_s.shares * p_s.shares)
    ]
    fa: FunctionApprox = LinearFunctionApprox.create(feature_functions=ffs)
    init_price_distrib: Gaussian = Gaussian(μ=init_price_mean,
                                            σ=init_price_stdev)
    init_x_distrib: Constant = Constant(x)

    ooe: OptimalOrderExecutionCustomized = OptimalOrderExecutionCustomized(
        shares=num_shares,
        time_steps=num_time_steps,