Example #1
    mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_adp()

    num_samples_val = 100
    softmax_flag = False
    epsilon_val = 0.0
    epsilon_half_life_val = 30
    tol_val = 1e-4
    fa_spec_val = FuncApproxSpec(
        state_feature_funcs=[
            lambda s: 1. if s == 1 else 0.,
            lambda s: 1. if s == 2 else 0.,
            lambda s: 1. if s == 3 else 0.
        ],
        action_feature_funcs=[],
        dnn_spec=DNNSpec(neurons=[2, 4],
                         hidden_activation=DNNSpec.relu,
                         hidden_activation_deriv=DNNSpec.relu_deriv,
                         output_activation=DNNSpec.identity,
                         output_activation_deriv=DNNSpec.identity_deriv))
    adp_obj = ADP(mdp_rep_for_adp=mdp_rep_obj,
                  num_samples=num_samples_val,
                  softmax=softmax_flag,
                  epsilon=epsilon_val,
                  epsilon_half_life=epsilon_half_life_val,
                  tol=tol_val,
                  fa_spec=fa_spec_val)

    def policy_func(i: int) -> Mapping[str, float]:
        if i == 1:
            ret = {'a': 0.4, 'b': 0.6}
        elif i == 2:
            ret = {'a': 0.7, 'c': 0.3}
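
Example #1's policy_func maps each state to a dictionary of action probabilities. A minimal standalone sketch of sampling an action from such a mapping (illustrative only; random.choices is not part of the library code above):

import random

# Sample an action from the action-probability mapping for state 1,
# i.e. the dict {'a': 0.4, 'b': 0.6} that policy_func(1) produces above.
action_probs = {'a': 0.4, 'b': 0.6}
action = random.choices(list(action_probs), weights=list(action_probs.values()))[0]
print(action)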
Example #2
                                     discount_rate=rho)

    reinforce_val = True
    num_state_samples_val = 500
    num_next_state_samples_val = 30
    num_action_samples_val = 50
    num_batches_val = 3000
    actor_lambda_val = 0.99
    critic_lambda_val = 0.99

    actor_mu = FuncApproxSpec(
        state_feature_funcs=[],
        action_feature_funcs=[],
        dnn_spec=DNNSpec(neurons=[],
                         hidden_activation=DNNSpec.log_squish,
                         hidden_activation_deriv=DNNSpec.log_squish_deriv,
                         output_activation=DNNSpec.sigmoid,
                         output_activation_deriv=DNNSpec.sigmoid_deriv))
    actor_nu = FuncApproxSpec(
        state_feature_funcs=[],
        action_feature_funcs=[],
        dnn_spec=DNNSpec(neurons=[],
                         hidden_activation=DNNSpec.log_squish,
                         hidden_activation_deriv=DNNSpec.log_squish_deriv,
                         output_activation=DNNSpec.pos_log_squish,
                         output_activation_deriv=DNNSpec.pos_log_squish_deriv))
    actor_mean = FuncApproxSpec(state_feature_funcs=[],
                                action_feature_funcs=[],
                                dnn_spec=None)
    actor_variance = FuncApproxSpec(
        state_feature_funcs=[],
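
Example #2 parameterizes a stochastic policy with separate function approximators, including a mean (actor_mean) and a variance (actor_variance). A plain-NumPy sketch of the underlying idea, with hypothetical linear weights standing in for the approximators (not the library's implementation):

import numpy as np

rng = np.random.default_rng(0)
w_mean, w_var = 0.5, 0.1  # hypothetical stand-ins for the two approximators' parameters

def sample_gaussian_action(state_feature: float) -> float:
    mean = w_mean * state_feature             # "mean" approximator output
    variance = np.exp(w_var * state_feature)  # kept positive, the role of actor_variance's output head
    return rng.normal(loc=mean, scale=np.sqrt(variance))

print(sample_gaussian_action(1.0))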
Example #3
    def get_rl_fa_price(
        self,
        num_dt: int,
        method: str,
        exploring_start: bool,
        algorithm: TDAlgorithm,
        softmax: bool,
        epsilon: float,
        epsilon_half_life: float,
        lambd: float,
        num_paths: int,
        batch_size: int,
        feature_funcs: Sequence[Callable[[Tuple[StateType, ActionType]], float]],
        neurons: Optional[Sequence[int]],
        learning_rate: float,
        learning_rate_decay: float,
        adam: Tuple[bool, float, float],
        offline: bool
    ) -> float:
        dt = self.expiry / num_dt

        def sa_func(_: StateType) -> Set[ActionType]:
            # Two actions in every state: True = exercise, False = continue
            return {True, False}

        # noinspection PyShadowingNames
        def terminal_state(s: StateType, num_dt=num_dt) -> bool:
            return s[0] > num_dt

        # noinspection PyShadowingNames
        def sr_func(s: StateType,
                    a: ActionType,
                    num_dt=num_dt) -> Tuple[StateType, float]:
            return self.state_reward_gen(s, a, num_dt)

        def init_s() -> StateType:
            return 0, np.array([self.spot_price])

        def init_sa() -> Tuple[StateType, ActionType]:
            return init_s(), choice([True, False])

        # noinspection PyShadowingNames
        mdp_rep_obj = MDPRepForRLFA(state_action_func=sa_func,
                                    gamma=ALMOSTONEGAMMA,
                                    terminal_state_func=terminal_state,
                                    state_reward_gen_func=sr_func,
                                    init_state_gen=init_s,
                                    init_state_action_gen=init_sa)

        fa_spec = FuncApproxSpec(
            state_feature_funcs=[],
            sa_feature_funcs=feature_funcs,
            dnn_spec=(None if neurons is None else (DNNSpec(
                neurons=neurons,
                hidden_activation=DNNSpec.log_squish,
                hidden_activation_deriv=DNNSpec.log_squish_deriv,
                output_activation=DNNSpec.pos_log_squish,
                output_activation_deriv=DNNSpec.pos_log_squish_deriv))),
            learning_rate=learning_rate,
            adam_params=adam,
            add_unit_feature=False)

        if method == "MC":
            rl_fa_obj = MonteCarlo(mdp_rep_for_rl=mdp_rep_obj,
                                   exploring_start=exploring_start,
                                   softmax=softmax,
                                   epsilon=epsilon,
                                   epsilon_half_life=epsilon_half_life,
                                   num_episodes=num_paths,
                                   max_steps=num_dt + 2,
                                   fa_spec=fa_spec)
        elif method == "TD0":
            rl_fa_obj = TD0(mdp_rep_for_rl=mdp_rep_obj,
                            exploring_start=exploring_start,
                            algorithm=algorithm,
                            softmax=softmax,
                            epsilon=epsilon,
                            epsilon_half_life=epsilon_half_life,
                            num_episodes=num_paths,
                            max_steps=num_dt + 2,
                            fa_spec=fa_spec)
        elif method == "TDL":
            rl_fa_obj = TDLambda(mdp_rep_for_rl=mdp_rep_obj,
                                 exploring_start=exploring_start,
                                 algorithm=algorithm,
                                 softmax=softmax,
                                 epsilon=epsilon,
                                 epsilon_half_life=epsilon_half_life,
                                 lambd=lambd,
                                 num_episodes=num_paths,
                                 batch_size=batch_size,
                                 max_steps=num_dt + 2,
                                 fa_spec=fa_spec,
                                 offline=offline)
        elif method == "TDE":
            rl_fa_obj = TDLambdaExact(mdp_rep_for_rl=mdp_rep_obj,
                                      exploring_start=exploring_start,
                                      algorithm=algorithm,
                                      softmax=softmax,
                                      epsilon=epsilon,
                                      epsilon_half_life=epsilon_half_life,
                                      lambd=lambd,
                                      num_episodes=num_paths,
                                      batch_size=batch_size,
                                      max_steps=num_dt + 2,
                                      state_feature_funcs=[],
                                      sa_feature_funcs=feature_funcs,
                                      learning_rate=learning_rate,
                                      learning_rate_decay=learning_rate_decay)
        else:
            rl_fa_obj = LSPI(mdp_rep_for_rl=mdp_rep_obj,
                             exploring_start=exploring_start,
                             softmax=softmax,
                             epsilon=epsilon,
                             epsilon_half_life=epsilon_half_life,
                             num_episodes=num_paths,
                             batch_size=batch_size,
                             max_steps=num_dt + 2,
                             state_feature_funcs=[],
                             sa_feature_funcs=feature_funcs)

        qvf = rl_fa_obj.get_qv_func_fa(None)
        # init_s = (0, np.array([self.spot_price]))
        # val_exec = qvf(init_s)(True)
        # val_cont = qvf(init_s)(False)
        # true_false_spot_max = max(val_exec, val_cont)

        all_paths = self.get_all_paths(0.0, num_paths, num_dt)
        prices = np.zeros(num_paths)

        # Walk each simulated path and exercise at the first step where the
        # discounted payoff exceeds the learned continuation value
        for path_num, path in enumerate(all_paths):
            steps = 0
            while steps <= num_dt:
                price_seq = path[:(steps + 1)]
                state = (steps, price_seq)
                exercise_price = np.exp(-self.ir(dt * steps)) *\
                    self.payoff(dt * steps, price_seq)
                continue_price = qvf(state)(False)
                steps += 1
                if exercise_price > continue_price:
                    prices[path_num] = exercise_price
                    steps = num_dt + 1
                    # print(state)
                    # print(exercise_price)
                    # print(continue_price)
                    # print(qvf(state)(True))

        return np.average(prices)
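
A hedged usage sketch of get_rl_fa_price. Only the parameter names come from the signature above; the instance name pricer, the argument values, and the TDAlgorithm member are assumptions for illustration:

# Hypothetical call; `pricer` is assumed to be an instance of the class defining
# get_rl_fa_price, and TDAlgorithm.ExpectedSARSA is assumed to be a member of
# the TDAlgorithm enum referenced in the signature.
price = pricer.get_rl_fa_price(
    num_dt=10,
    method="TDL",
    exploring_start=False,
    algorithm=TDAlgorithm.ExpectedSARSA,
    softmax=False,
    epsilon=0.1,
    epsilon_half_life=100,
    lambd=0.8,
    num_paths=5000,
    batch_size=10,
    feature_funcs=[
        lambda sa: 1.,                  # constant feature
        lambda sa: float(sa[1]),        # exercise indicator (the action is a bool)
        lambda sa: float(sa[0][1][-1])  # latest price on the simulated path
    ],
    neurons=None,                       # no hidden layers, i.e. linear approximation
    learning_rate=0.05,
    learning_rate_decay=1e6,
    adam=(True, 0.9, 0.999),
    offline=True
)
print(price)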
Example #4
    num_state_samples_val = 100
    num_next_state_samples_val = 25
    num_action_samples_val = 20
    num_batches_val = 100
    max_steps_val = 100
    actor_lambda_val = 0.95
    critic_lambda_val = 0.95
    vf_fa_spec_val = FuncApproxSpec(
        state_feature_funcs=[
            lambda s: 1. if s == 1 else 0.,
            lambda s: 1. if s == 2 else 0.,
            lambda s: 1. if s == 3 else 0.
        ],
        action_feature_funcs=[],
        dnn_spec=DNNSpec(
            neurons=[2],
            hidden_activation=DNNSpec.relu,
            hidden_activation_deriv=DNNSpec.relu_deriv,
            output_activation=DNNSpec.identity,
            output_activation_deriv=DNNSpec.identity_deriv
        )
    )
    pol_fa_spec_val = [
        FuncApproxSpec(
            state_feature_funcs=[
                lambda s: 1. if s == 1 else 0.,
                lambda s: 1. if s == 2 else 0.,
                lambda s: 1. if s == 3 else 0.
            ],
            action_feature_funcs=[],
            dnn_spec=DNNSpec(
                neurons=[3],
                hidden_activation=DNNSpec.relu,
                hidden_activation_deriv=DNNSpec.relu_deriv,
                output_activation=DNNSpec.sigmoid,
                output_activation_deriv=DNNSpec.sigmoid_deriv
            )
        )
    ]
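
The three state_feature_funcs used in both specs above simply one-hot encode the states 1, 2 and 3; evaluating them directly shows this:

state_feature_funcs = [
    lambda s: 1. if s == 1 else 0.,
    lambda s: 1. if s == 2 else 0.,
    lambda s: 1. if s == 3 else 0.
]
for s in (1, 2, 3):
    print(s, [f(s) for f in state_feature_funcs])
# 1 [1.0, 0.0, 0.0]
# 2 [0.0, 1.0, 0.0]
# 3 [0.0, 0.0, 1.0]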
Example #5
        # Output-layer errors: forward-propagated predictions minus supervisory targets
        errors = np.array([x[-1][0] for x in all_fwd_prop]) - \
            np.array(supervisory_seq)
        return get_generalized_back_prop(
            dnn_params=self.params,
            layer_inputs=layer_inputs,
            factors=errors,
            dObj_dSL=np.ones_like(errors),
            decay_param=gamma_lambda,
            hidden_activation_deriv=self.hidden_activation_deriv
        )


if __name__ == '__main__':
    this_dnn_obj = DNNSpec(
        neurons=[2],
        hidden_activation=DNNSpec.relu,
        hidden_activation_deriv=DNNSpec.relu_deriv
    )
    nn = DNN(
        feature_funcs=FuncApproxBase.get_identity_feature_funcs(3),
        dnn_obj=this_dnn_obj,
        reglr_coeff=0.,
        learning_rate=1.,
        adam=True,
        adam_decay1=0.9,
        adam_decay2=0.999
    )
    init_eval = nn.get_func_eval((2.0, 3.0, -4.0))
    print(init_eval)

    x_pts = np.arange(-10.0, 10.0, 0.5)
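
The snippet ends with a grid of evaluation points; one possible continuation (a sketch, not from the source) is to evaluate the network on 3-dimensional inputs built from x_pts, reusing the get_func_eval call already shown above:

    # Sketch of a possible continuation (not from the source): evaluate the
    # untrained network on 3-dimensional inputs built from x_pts.
    y_pts = [nn.get_func_eval((x, x, x)) for x in x_pts]
    print(y_pts[:5])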
Example #6
    num_samples_val = 100
    softmax_flag = False
    epsilon_val = 0.0
    epsilon_half_life_val = 30
    tol_val = 1e-4
    fa_spec_val = FuncApproxSpec(
        state_feature_funcs=[
            lambda s: 1. if s == 1 else 0.,
            lambda s: 1. if s == 2 else 0.,
            lambda s: 1. if s == 3 else 0.
        ],
        action_feature_funcs=[],
        dnn_spec=DNNSpec(
            neurons=[2, 4],
            hidden_activation=DNNSpec.relu,
            hidden_activation_deriv=DNNSpec.relu_deriv
        )
    )
    adp_obj = ADP(
        mdp_rep_for_adp=mdp_rep_obj,
        num_samples=num_samples_val,
        softmax=softmax_flag,
        epsilon=epsilon_val,
        epsilon_half_life=epsilon_half_life_val,
        tol=tol_val,
        fa_spec=fa_spec_val
    )

    def policy_func(i: int) -> Mapping[str, float]:
        if i == 1: