def __init__(
    self,
    mdp_rep_for_rl_pg: MDPRepForRLPG,
    reinforce: bool,
    batch_size: int,
    num_batches: int,
    num_action_samples: int,
    max_steps: int,
    actor_lambda: float,
    critic_lambda: float,
    score_func: Callable[[A, Sequence[float]], Sequence[float]],
    sample_actions_gen_func: Callable[[Sequence[float], int], Sequence[A]],
    fa_spec: FuncApproxSpec,
    pol_fa_spec: Sequence[FuncApproxSpec]
) -> None:
    """Set up a policy-gradient RL solver.

    :param mdp_rep_for_rl_pg: MDP representation providing the simulation
        hooks this solver samples from.
    :param reinforce: if True, use the REINFORCE variant; otherwise the
        alternative (presumably actor-critic) update — TODO confirm against
        the solve method, which is outside this view.
    :param batch_size: number of episodes per parameter-update batch.
    :param num_batches: number of batches to run.
    :param num_action_samples: actions sampled per policy evaluation.
    :param max_steps: episode-length cap.
    :param actor_lambda: eligibility-trace decay for the actor.
    :param critic_lambda: eligibility-trace decay for the critic.
    :param score_func: maps an action and policy parameters to the score
        (gradient of log-likelihood) vector.
    :param sample_actions_gen_func: generates a given number of action
        samples from policy parameters.
    :param fa_spec: spec used to build both the V and Q function approximators.
    :param pol_fa_spec: one spec per policy-parameter function approximator.
    """
    self.mdp_rep: MDPRepForRLPG = mdp_rep_for_rl_pg
    self.reinforce: bool = reinforce
    self.batch_size: int = batch_size
    self.num_batches: int = num_batches
    self.num_action_samples: int = num_action_samples
    self.max_steps: int = max_steps
    self.actor_lambda: float = actor_lambda
    self.critic_lambda: float = critic_lambda
    self.score_func: Callable[[A, Sequence[float]], Sequence[float]] = \
        score_func
    self.sample_actions_gen_func: Callable[[Sequence[float], int], Sequence[A]] = \
        sample_actions_gen_func
    # Separate approximators for the state-value and action-value functions,
    # both built from the same spec.
    self.vf_fa: FuncApproxBase = fa_spec.get_vf_func_approx_obj()
    self.qvf_fa: FuncApproxBase = fa_spec.get_qvf_func_approx_obj()
    # One function approximator per policy parameter (e.g. mean, variance).
    self.pol_fa: Sequence[FuncApproxBase] = \
        [s.get_vf_func_approx_obj() for s in pol_fa_spec]
def actor_spec(neurons: Sequence[int], num_risky: int) \
        -> Sequence[FuncApproxSpec]:
    """Build the actor's function-approximator specs.

    Returns ``num_risky + 2`` specs with a softplus output head (positive
    outputs, used for the alpha/beta/variance-style parameters) followed by
    ``num_risky`` specs with an identity output head (unconstrained means).

    :param neurons: hidden-layer sizes shared by every spec.
    :param num_risky: number of risky assets.
    :return: the positive-output specs concatenated with the mean specs.
    """
    # The two groups of specs were previously duplicated verbatim except for
    # the output activation; build both through one helper.
    def make_spec(out_act, out_act_deriv) -> FuncApproxSpec:
        # One spec: features are the (time, wealth-like) state pair, no
        # action features, shared hidden relu layers.
        return FuncApproxSpec(
            state_feature_funcs=[
                lambda s: float(s[0]),
                lambda s: s[1]
            ],
            action_feature_funcs=[],
            dnn_spec=DNNSpec(
                neurons=neurons,
                hidden_activation=DNNSpec.relu,
                hidden_activation_deriv=DNNSpec.relu_deriv,
                output_activation=out_act,
                output_activation_deriv=out_act_deriv
            )
        )

    alpha_beta_vars = [
        make_spec(DNNSpec.softplus, DNNSpec.softplus_deriv)
        for _ in range(num_risky + 2)
    ]
    means = [
        make_spec(DNNSpec.identity, DNNSpec.identity_deriv)
        for _ in range(num_risky)
    ]
    return alpha_beta_vars + means
def __init__(self, mdp_rep_for_rl: MDPRepForRLFA, softmax: bool,
             epsilon: float, num_episodes: int, max_steps: int,
             fa_spec: FuncApproxSpec) -> None:
    """Set up an RL solver with function approximation.

    :param mdp_rep_for_rl: MDP representation exposing simulation hooks.
    :param softmax: if True, use a softmax behavior policy; otherwise
        epsilon-greedy (presumably — policy construction is outside this view).
    :param epsilon: exploration parameter (fixed; no decay here).
    :param num_episodes: number of episodes to run.
    :param max_steps: episode-length cap.
    :param fa_spec: spec used to build both the V and Q approximators.
    """
    self.mdp_rep: MDPRepForRLFA = mdp_rep_for_rl
    self.softmax: bool = softmax
    self.epsilon: float = epsilon
    self.num_episodes: int = num_episodes
    self.max_steps: int = max_steps
    # Separate approximators for the state-value and action-value functions.
    self.vf_fa: FuncApproxBase = fa_spec.get_vf_func_approx_obj()
    self.qvf_fa: FuncApproxBase = fa_spec.get_qvf_func_approx_obj()
    # Convenience alias: maps a state to its set of feasible actions.
    self.state_action_func = self.mdp_rep.state_action_func
def actor_spec(self) -> Tuple[FuncApproxSpec, FuncApproxSpec]:
    """Build the (mean, variance) actor specs.

    The mean spec uses a single linear feature — wealth grown at the
    risk-free rate to the time index — for both state and state-action
    inputs. The variance spec has no features and a trivial (no hidden
    layer) DNN whose positive output activation keeps variance > 0.

    :return: pair of specs (mean, variance).
    """
    # PEP 8 (E731): a named def instead of a lambda bound to a name.
    def ff(s) -> float:
        # s[0] is the time index; compound at the risk-free rate r.
        return (1. + self.r) ** float(s[0])

    mean = FuncApproxSpec(
        state_feature_funcs=[ff],
        # Default-arg binding keeps ff resolved at definition time.
        sa_feature_funcs=[lambda x, ff=ff: ff(x[0])],
        dnn_spec=None
    )
    variance = FuncApproxSpec(
        state_feature_funcs=[],
        sa_feature_funcs=[],
        dnn_spec=DNNSpec(
            neurons=[],
            hidden_activation=DNNSpec.log_squish,
            hidden_activation_deriv=DNNSpec.log_squish_deriv,
            output_activation=DNNSpec.pos_log_squish,
            output_activation_deriv=DNNSpec.pos_log_squish_deriv))
    return mean, variance
def __init__(self, mdp_rep_for_rl: MDPRepForRLFA,
             exploring_start: bool,
             algorithm: TDAlgorithm,
             softmax: bool,
             epsilon: float,
             epsilon_half_life: float,
             lambd: float,
             num_episodes: int,
             batch_size: int,
             max_steps: int,
             state_feature_funcs: Sequence[Callable[[S], float]],
             sa_feature_funcs: Sequence[Callable[[Tuple[S, A]], float]],
             learning_rate: float,
             learning_rate_decay: float) -> None:
    """Set up an exact (linear) TD(lambda) solver.

    Builds the base class with a linear FuncApproxSpec (dnn_spec=None,
    no unit feature) and then takes direct ownership of the weight
    vectors so TD updates can be applied to them in closed form.

    :param algorithm: which TD control algorithm to run.
    :param lambd: eligibility-trace decay; combined with gamma below.
    :param learning_rate_decay: decay horizon for the step size.
    """
    super().__init__(mdp_rep_for_rl=mdp_rep_for_rl,
                     exploring_start=exploring_start,
                     softmax=softmax,
                     epsilon=epsilon,
                     epsilon_half_life=epsilon_half_life,
                     num_episodes=num_episodes,
                     max_steps=max_steps,
                     fa_spec=FuncApproxSpec(
                         state_feature_funcs=state_feature_funcs,
                         sa_feature_funcs=sa_feature_funcs,
                         dnn_spec=None,
                         learning_rate=learning_rate,
                         add_unit_feature=False))
    # Zero-initialized linear weights for V and Q.
    self.vf_w: np.ndarray = np.zeros(self.vf_fa.num_features)
    self.qvf_w: np.ndarray = np.zeros(self.qvf_fa.num_features)
    # Alias the FA parameter lists to our arrays so in-place updates to
    # vf_w / qvf_w are immediately visible to the approximators.
    self.vf_fa.params = [self.vf_w]
    self.qvf_fa.params = [self.qvf_w]
    self.algorithm: TDAlgorithm = algorithm
    # Pre-multiplied gamma * lambda used by eligibility-trace updates.
    self.gamma_lambda: float = self.mdp_rep.gamma * lambd
    self.batch_size: int = batch_size
    self.learning_rate_decay: float = learning_rate_decay
def get_actor_nu_spec() -> FuncApproxSpec:
    """Spec for the actor's nu parameter.

    No state or state-action features; a trivial network (no hidden
    layers) whose positive output activation keeps nu positive.
    """
    nu_dnn = DNNSpec(
        neurons=[],
        hidden_activation=DNNSpec.log_squish,
        hidden_activation_deriv=DNNSpec.log_squish_deriv,
        output_activation=DNNSpec.pos_log_squish,
        output_activation_deriv=DNNSpec.pos_log_squish_deriv
    )
    return FuncApproxSpec(
        state_feature_funcs=[],
        sa_feature_funcs=[],
        dnn_spec=nu_dnn
    )
def __init__(self, mdp_rep_for_adp: MDPRepForADP, num_samples: int,
             softmax: bool, epsilon: float, epsilon_half_life: float,
             tol: float, fa_spec: FuncApproxSpec) -> None:
    """Set up an approximate dynamic programming (ADP) solver.

    :param mdp_rep_for_adp: MDP representation with ADP sampling hooks.
    :param num_samples: samples drawn per backup.
    :param softmax: if True, softmax policy; otherwise epsilon-greedy
        (presumably — policy construction is outside this view).
    :param epsilon: initial exploration parameter.
    :param epsilon_half_life: half-life controlling epsilon decay.
    :param tol: convergence tolerance.
    :param fa_spec: spec for the value-function approximator.
    """
    self.mdp_rep: MDPRepForADP = mdp_rep_for_adp
    self.num_samples: int = num_samples
    self.softmax: bool = softmax
    # Epsilon decays over iterations according to the given half-life.
    self.epsilon_func: Callable[[int], float] = get_epsilon_decay_func(
        epsilon, epsilon_half_life)
    self.tol: float = tol
    self.fa: FuncApproxBase = fa_spec.get_vf_func_approx_obj()
    # Maps a state to its set of feasible actions.
    self.state_action_func: Callable[[S], Set[A]] = \
        self.mdp_rep.state_action_func
def critic_spec(neurons: Sequence[int]) -> FuncApproxSpec:
    """Spec for the critic's value-function approximator.

    Features are the two state components (time index and the second
    state coordinate); relu hidden layers and a linear output head.

    :param neurons: hidden-layer sizes for the critic network.
    """
    features = [
        lambda s: float(s[0]),
        lambda s: s[1]
    ]
    critic_dnn = DNNSpec(
        neurons=neurons,
        hidden_activation=DNNSpec.relu,
        hidden_activation_deriv=DNNSpec.relu_deriv,
        output_activation=DNNSpec.identity,
        output_activation_deriv=DNNSpec.identity_deriv
    )
    return FuncApproxSpec(
        state_feature_funcs=features,
        action_feature_funcs=[],
        dnn_spec=critic_dnn
    )
def __init__(
    self,
    mdp_rep_for_rl: MDPRepForRLFA,
    exploring_start: bool,
    softmax: bool,
    epsilon: float,
    epsilon_half_life: float,
    num_episodes: int,
    max_steps: int,
    fa_spec: FuncApproxSpec
) -> None:
    """Set up an RL solver (FA-based) with decaying exploration.

    :param mdp_rep_for_rl: MDP representation exposing simulation hooks.
    :param exploring_start: if True, episodes start from random
        state-action pairs (presumably — episode generation is outside
        this view).
    :param softmax: softmax vs. epsilon-greedy behavior policy.
    :param epsilon: initial exploration parameter.
    :param epsilon_half_life: half-life controlling epsilon decay.
    :param num_episodes: number of episodes to run.
    :param max_steps: episode-length cap.
    :param fa_spec: spec used to build both the V and Q approximators.
    """
    self.mdp_rep: MDPRepForRLFA = mdp_rep_for_rl
    self.exploring_start: bool = exploring_start
    self.softmax: bool = softmax
    # Epsilon decays with episode count according to the given half-life.
    self.epsilon_func: Callable[[int], float] = get_epsilon_decay_func(
        epsilon,
        epsilon_half_life
    )
    self.num_episodes: int = num_episodes
    self.max_steps: int = max_steps
    # Separate approximators for the state-value and action-value functions.
    self.vf_fa: FuncApproxBase = fa_spec.get_vf_func_approx_obj()
    self.qvf_fa: FuncApproxBase = fa_spec.get_qvf_func_approx_obj()
    # Maps a state to its set of feasible actions.
    self.state_action_func = self.mdp_rep.state_action_func
def __init__(self, mdp_rep_for_adp_pg: MDPRepForADPPG,
             num_state_samples: int, num_action_samples: int, tol: float,
             score_func: Callable[[A, Sequence[float]], Sequence[float]],
             sample_actions_gen_func:
             Callable[[Sequence[float], int], Sequence[A]],
             vf_fa_spec: FuncApproxSpec,
             pol_fa_spec: Sequence[FuncApproxSpec]) -> None:
    """Set up an ADP policy-gradient solver.

    :param mdp_rep_for_adp_pg: MDP representation with ADP-PG hooks.
    :param num_state_samples: states sampled per sweep.
    :param num_action_samples: actions sampled per state.
    :param tol: convergence tolerance.
    :param score_func: maps an action and policy parameters to the score
        (gradient of log-likelihood) vector.
    :param sample_actions_gen_func: generates a given number of action
        samples from policy parameters.
    :param vf_fa_spec: spec for the critic (value-function) approximator.
    :param pol_fa_spec: one spec per policy-parameter approximator.
    """
    self.mdp_rep: MDPRepForADPPG = mdp_rep_for_adp_pg
    self.num_state_samples: int = num_state_samples
    self.num_action_samples: int = num_action_samples
    self.tol: float = tol
    self.score_func: Callable[[A, Sequence[float]], Sequence[float]] = \
        score_func
    self.sample_actions_gen_func: Callable[[Sequence[float], int], Sequence[A]] = \
        sample_actions_gen_func
    self.vf_fa: FuncApproxBase = vf_fa_spec.get_vf_func_approx_obj()
    # One function approximator per policy parameter.
    self.pol_fa: Sequence[FuncApproxBase] = \
        [s.get_vf_func_approx_obj() for s in pol_fa_spec]
def get_critic_spec(self, time_steps: int) -> FuncApproxSpec:
    """Critic spec whose single feature is a closed-form value expression.

    The feature encodes a known analytical form of the value function
    (Merton-style consumption/investment, judging by the CRRA-like wealth
    term — TODO confirm against the enclosing class), so the critic only
    needs to fit a scalar multiple of it (dnn_spec=None => linear FA).

    :param time_steps: number of discrete time steps spanning self.expiry.
    """
    tnu = self.get_nu()
    gam = 1. - self.gamma

    # noinspection PyShadowingNames
    def state_ff(state: Tuple[int, float], tnu=tnu, gam=gam) -> float:
        # Convert discrete time index to continuous time in [0, expiry].
        t = float(state[0]) * self.expiry / time_steps
        tte = self.expiry - t  # time to expiry
        if tnu == 0:
            # Limit of the closed-form expression as nu -> 0.
            ret = tte + self.epsilon
        else:
            ret = (1. + (tnu * self.epsilon - 1.) *
                   np.exp(-tnu * tte)) / tnu
        # CRRA-style wealth term; log-utility limit when gam == 0.
        mult = state[1]**gam / gam if gam != 0 else np.log(state[1])
        # Discount at rate rho back to time 0.
        return ret**self.gamma * mult / np.exp(self.rho * t)

    return FuncApproxSpec(state_feature_funcs=[state_ff],
                          action_feature_funcs=[],
                          dnn_spec=None)
def critic_spec(self, neurons: Sequence[int]) -> FuncApproxSpec:
    """Critic spec with one hand-crafted feature plus a DNN on top.

    The feature combines a discount term, a market-price-of-risk term and
    a CARA-style exponential-utility wealth term (judging by the
    exp(-gamma * grown wealth) factor — TODO confirm against the
    enclosing class).

    :param neurons: hidden-layer sizes for the critic network.
    """
    def feature_func(state: StateType) -> float:
        t = float(state[0])  # time index
        # noinspection PyPep8Naming
        W = state[1]  # wealth
        # NOTE(review): rho**(-t) treats rho as a per-step discount
        # factor base, unlike the exp(-rho * t) form used elsewhere in
        # this file — confirm intended.
        term1 = self.rho**(-t)
        # Squared Sharpe-ratio-like growth term.
        term2 = np.exp((self.mu - self.r)**2 / (2 * self.sigma**2) * t)
        # Exponential utility of wealth compounded to the horizon.
        term3 = np.exp(-self.gamma * (1. + self.r)**(self.time_steps - t)
                       * W)
        return term1 * term2 * term3

    return FuncApproxSpec(
        state_feature_funcs=[feature_func],
        sa_feature_funcs=[
            # Default-arg binding avoids late-binding closure issues.
            lambda x, feature_func=feature_func: feature_func(x[0])
        ],
        dnn_spec=DNNSpec(neurons=neurons,
                         hidden_activation=DNNSpec.relu,
                         hidden_activation_deriv=DNNSpec.relu_deriv,
                         output_activation=DNNSpec.identity,
                         output_activation_deriv=DNNSpec.identity_deriv))
def get_actor_mu_spec(self, time_steps: int) -> FuncApproxSpec: tnu = self.get_nu() # noinspection PyShadowingNames def state_ff(state: Tuple[int, float], tnu=tnu) -> float: tte = self.expiry * (1. - float(state[0]) / time_steps) if tnu == 0: ret = 1. / (tte + self.epsilon) else: ret = tnu / (1. + (tnu * self.epsilon - 1.) * np.exp(-tnu * tte)) return ret return FuncApproxSpec( state_feature_funcs=[state_ff], sa_feature_funcs=[lambda x, state_ff=state_ff: state_ff(x[0])], dnn_spec=DNNSpec(neurons=[], hidden_activation=DNNSpec.log_squish, hidden_activation_deriv=DNNSpec.log_squish_deriv, output_activation=DNNSpec.sigmoid, output_activation_deriv=DNNSpec.sigmoid_deriv))
def __init__(
    self,
    mdp_rep_for_rl: MDPRepForRLFA,
    exploring_start: bool,
    softmax: bool,
    epsilon: float,
    epsilon_half_life: float,
    num_episodes: int,
    batch_size: int,
    max_steps: int,
    state_feature_funcs: Sequence[Callable[[S], float]],
    sa_feature_funcs: Sequence[Callable[[Tuple[S, A]], float]]
) -> None:
    """Set up a batch RL solver over a plain linear feature representation.

    Builds the base class with a linear FuncApproxSpec: dnn_spec=None,
    no regularization, zero learning rate and Adam disabled — consistent
    with an exact/batch method that computes its own updates rather than
    using SGD.

    :param batch_size: number of episodes per batch update.
    """
    super().__init__(mdp_rep_for_rl=mdp_rep_for_rl,
                     exploring_start=exploring_start,
                     softmax=softmax,
                     epsilon=epsilon,
                     epsilon_half_life=epsilon_half_life,
                     num_episodes=num_episodes,
                     max_steps=max_steps,
                     fa_spec=FuncApproxSpec(
                         state_feature_funcs=state_feature_funcs,
                         sa_feature_funcs=sa_feature_funcs,
                         dnn_spec=None,
                         reglr_coeff=0.,
                         learning_rate=0.,
                         adam_params=(False, 0., 0.),
                         add_unit_feature=True))
    self.batch_size: int = batch_size
def __init__(self, mdp_rep_for_adp_pg: MDPRepForADPPG,
             num_state_samples: int, num_next_state_samples: int,
             num_action_samples: int, num_batches: int, max_steps: int,
             actor_lambda: float, critic_lambda: float,
             score_func: Callable[[A, Sequence[float]], Sequence[float]],
             sample_actions_gen_func:
             Callable[[Sequence[float], int], Sequence[A]],
             vf_fa_spec: FuncApproxSpec,
             pol_fa_spec: Sequence[FuncApproxSpec]) -> None:
    """Set up a batch ADP actor-critic (policy-gradient) solver.

    :param num_state_samples: states sampled per batch.
    :param num_next_state_samples: successor states sampled per state.
    :param num_action_samples: actions sampled per state.
    :param num_batches: number of batches to run.
    :param max_steps: per-episode step cap.
    :param actor_lambda: eligibility-trace decay for the actor.
    :param critic_lambda: eligibility-trace decay for the critic.
    :param score_func: maps an action and policy parameters to the score
        (gradient of log-likelihood) vector.
    :param sample_actions_gen_func: generates a given number of action
        samples from policy parameters.
    :param vf_fa_spec: spec for the critic (value-function) approximator.
    :param pol_fa_spec: one spec per policy-parameter approximator.
    """
    self.mdp_rep: MDPRepForADPPG = mdp_rep_for_adp_pg
    self.num_state_samples: int = num_state_samples
    self.num_next_state_samples: int = num_next_state_samples
    self.num_action_samples: int = num_action_samples
    self.num_batches: int = num_batches
    self.max_steps: int = max_steps
    self.actor_lambda: float = actor_lambda
    self.critic_lambda: float = critic_lambda
    self.score_func: Callable[[A, Sequence[float]], Sequence[float]] = \
        score_func
    self.sample_actions_gen_func: Callable[[Sequence[float], int], Sequence[A]] = \
        sample_actions_gen_func
    self.vf_fa: FuncApproxBase = vf_fa_spec.get_vf_func_approx_obj()
    # One function approximator per policy parameter.
    self.pol_fa: Sequence[FuncApproxBase] = \
        [s.get_vf_func_approx_obj() for s in pol_fa_spec]
} } gamma_val = 1.0 mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular() first_visit_flag = True softmax_flag = False episodes_limit = 10000 epsilon_val = 0.1 epsilon_half_life_val = 1000 max_steps_val = 1000 fa_spec_val = FuncApproxSpec(state_feature_funcs=[lambda s: float(s)], action_feature_funcs=[ lambda a: 1. if a == 'a' else 0., lambda a: 1. if a == 'b' else 0., lambda a: 1. if a == 'c' else 0., ], dnn_spec=None) mc_obj = MonteCarlo(mdp_rep_obj, first_visit_flag, softmax_flag, epsilon_val, epsilon_half_life_val, episodes_limit, max_steps_val, fa_spec_val) def policy_func(i: int) -> Mapping[str, float]: if i == 1: ret = {'a': 0.4, 'b': 0.6} elif i == 2: ret = {'a': 0.7, 'c': 0.3} elif i == 3: ret = {'b': 1.0} else:
num_samples = 30 this_softmax = True this_epsilon = 0.05 this_epsilon_half_life = 30 this_learning_rate = 0.1 this_learning_rate_decay = 1e6 this_lambd = 0.8 this_num_episodes = 3000 this_max_steps = 1000 this_tdl_fa_offline = True this_fa_spec = FuncApproxSpec( state_feature_funcs=FuncApproxBase.get_identity_feature_funcs( ic.lead_time + 1 ), action_feature_funcs=[lambda x: x], dnn_spec=DNNSpec( neurons=[2, 4], hidden_activation=DNNSpec.relu, hidden_activation_deriv=DNNSpec.relu_deriv ) ) raa = RunAllAlgorithms( mdp_refined=mdp_ref_obj, tolerance=this_tolerance, first_visit_mc=this_first_visit_mc, num_samples=num_samples, softmax=this_softmax, epsilon=this_epsilon, epsilon_half_life=this_epsilon_half_life, learning_rate=this_learning_rate,
this_epsilon = 0.05 this_epsilon_half_life = 100 this_learning_rate = 0.1 this_learning_rate_decay = 1e6 this_lambd = 0.8 this_num_episodes = 1000 this_max_steps = 1000 this_td_offline = True this_fa_spec = FuncApproxSpec( state_feature_funcs=FuncApproxBase.get_indicator_feature_funcs( mdp_ref_obj.all_states), action_feature_funcs=FuncApproxBase.get_indicator_feature_funcs( {m.name for m in Move}), dnn_spec=None # dnn_spec=DNNSpec( # neurons=[2, 4], # hidden_activation=DNNSpec.relu, # hidden_activation_deriv=DNNSpec.relu_deriv, # output_activation=DNNSpec.identity, # output_activation_deriv=DNNSpec.identity_deriv # ) ) raa = RunAllAlgorithms(mdp_refined=mdp_ref_obj, tolerance=this_tolerance, first_visit_mc=this_first_visit_mc, num_samples=this_num_samples, softmax=this_softmax, epsilon=this_epsilon, epsilon_half_life=this_epsilon_half_life,
num_state_samples_val = 100 num_next_state_samples_val = 25 num_action_samples_val = 20 num_batches_val = 100 max_steps_val = 100 actor_lambda_val = 0.95 critic_lambda_val = 0.95 state_ff = [ lambda s: 1. if s == 1 else 0., lambda s: 1. if s == 2 else 0., lambda s: 1. if s == 3 else 0. ] vf_fa_spec_val = FuncApproxSpec( state_feature_funcs=state_ff, sa_feature_funcs=[(lambda x, f=f: f(x[0])) for f in state_ff], dnn_spec=DNNSpec(neurons=[2], hidden_activation=DNNSpec.relu, hidden_activation_deriv=DNNSpec.relu_deriv, output_activation=DNNSpec.identity, output_activation_deriv=DNNSpec.identity_deriv)) pol_fa_spec_val = [ FuncApproxSpec( state_feature_funcs=state_ff, sa_feature_funcs=[(lambda x, f=f: f(x[0])) for f in state_ff], dnn_spec=DNNSpec(neurons=[3], hidden_activation=DNNSpec.relu, hidden_activation_deriv=DNNSpec.relu_deriv, output_activation=DNNSpec.sigmoid, output_activation_deriv=DNNSpec.sigmoid_deriv)) ] # noinspection PyPep8 this_score_func = lambda a, p: [
# Driver script: build a small 3-state MDP and run the ADP solver on it.
gamma_val = 0.9  # discount factor
mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_adp()
num_samples_val = 100
softmax_flag = False
epsilon_val = 0.0  # greedy: no exploration
epsilon_half_life_val = 30
tol_val = 1e-4  # convergence tolerance
# Indicator (one-hot) features for states 1, 2, 3; no action features.
fa_spec_val = FuncApproxSpec(
    state_feature_funcs=[
        lambda s: 1. if s == 1 else 0.,
        lambda s: 1. if s == 2 else 0.,
        lambda s: 1. if s == 3 else 0.
    ],
    action_feature_funcs=[],
    dnn_spec=DNNSpec(
        neurons=[2, 4],
        hidden_activation=DNNSpec.relu,
        hidden_activation_deriv=DNNSpec.relu_deriv
    )
)
adp_obj = ADP(
    mdp_rep_for_adp=mdp_rep_obj,
    num_samples=num_samples_val,
    softmax=softmax_flag,
    epsilon=epsilon_val,
    epsilon_half_life=epsilon_half_life_val,
    tol=tol_val,
    fa_spec=fa_spec_val
)
this_num_episodes = 1000 this_batch_size = 10 this_max_steps = 1000 this_td_offline = True state_ffs = FuncApproxBase.get_indicator_feature_funcs( mdp_ref_obj.all_states) sa_ffs = [(lambda x, f=f: f(x[0])) for f in state_ffs] +\ [(lambda x, f=f: f(x[1])) for f in FuncApproxBase.get_indicator_feature_funcs( {m.name for m in Move} )] this_fa_spec = FuncApproxSpec( state_feature_funcs=state_ffs, sa_feature_funcs=sa_ffs, dnn_spec=None # dnn_spec=DNNSpec( # neurons=[2, 4], # hidden_activation=DNNSpec.relu, # hidden_activation_deriv=DNNSpec.relu_deriv, # output_activation=DNNSpec.identity, # output_activation_deriv=DNNSpec.identity_deriv # ) ) raa = RunAllAlgorithms(mdp_refined=mdp_ref_obj, tolerance=this_tolerance, exploring_start=exploring_start, first_visit_mc=this_first_visit_mc, num_samples=this_num_samples, softmax=this_softmax, epsilon=this_epsilon, epsilon_half_life=this_epsilon_half_life, learning_rate=this_learning_rate,
cons_util_func=util_func, beq_util_func=beq_util, discount_rate=rho) reinforce_val = True num_state_samples_val = 500 num_next_state_samples_val = 30 num_action_samples_val = 50 num_batches_val = 3000 actor_lambda_val = 0.99 critic_lambda_val = 0.99 actor_mu = FuncApproxSpec( state_feature_funcs=[], action_feature_funcs=[], dnn_spec=DNNSpec(neurons=[], hidden_activation=DNNSpec.log_squish, hidden_activation_deriv=DNNSpec.log_squish_deriv, output_activation=DNNSpec.sigmoid, output_activation_deriv=DNNSpec.sigmoid_deriv)) actor_nu = FuncApproxSpec( state_feature_funcs=[], action_feature_funcs=[], dnn_spec=DNNSpec(neurons=[], hidden_activation=DNNSpec.log_squish, hidden_activation_deriv=DNNSpec.log_squish_deriv, output_activation=DNNSpec.pos_log_squish, output_activation_deriv=DNNSpec.pos_log_squish_deriv)) actor_mean = FuncApproxSpec(state_feature_funcs=[], action_feature_funcs=[], dnn_spec=None) actor_variance = FuncApproxSpec(
learning_rate_val = 0.1 lambda_val = 0.7 episodes_limit = 10000 batch_size_val = 20 max_steps_val = 1000 offline_val = True state_ff = [lambda s: float(s)] sa_ff = [ lambda x: float(x[0]), lambda x: 1. if x[1] == 'a' else 0., lambda x: 1. if x[1] == 'b' else 0., lambda x: 1. if x[1] == 'c' else 0., ] fa_spec_val = FuncApproxSpec( state_feature_funcs=state_ff, sa_feature_funcs=sa_ff, dnn_spec=None, learning_rate=learning_rate_val ) esl_obj = TDLambda( mdp_rep_obj, exploring_start_val, algorithm_type, softmax_flag, epsilon_val, epsilon_half_life_val, lambda_val, episodes_limit, batch_size_val, max_steps_val, fa_spec_val, offline_val
} } } gamma_val = 0.9 mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_adp() num_state_samples_val = 100 num_action_samples_val = 100 tol_val = 1e-4 vf_fa_spec_val = FuncApproxSpec( state_feature_funcs=[ lambda s: 1. if s == 1 else 0., lambda s: 1. if s == 2 else 0., lambda s: 1. if s == 3 else 0. ], action_feature_funcs=[], dnn_spec=DNNSpec(neurons=[2, 4], hidden_activation=DNNSpec.relu, hidden_activation_deriv=DNNSpec.relu_deriv, output_activation=DNNSpec.identity, output_activation_deriv=DNNSpec.identity_deriv)) pol_fa_spec_val = [ FuncApproxSpec(state_feature_funcs=[ lambda s: 1. if s == 1 else 0., lambda s: 1. if s == 2 else 0., lambda s: 1. if s == 3 else 0. ], action_feature_funcs=[], dnn_spec=DNNSpec( neurons=[2, 4], hidden_activation=DNNSpec.relu, hidden_activation_deriv=DNNSpec.relu_deriv,
this_first_visit_mc = True this_num_samples = 30 this_softmax = False this_epsilon = 0.05 this_epsilon_half_life = 100 this_learning_rate = 0.1 this_learning_rate_decay = 1e6 this_lambd = 0.8 this_num_episodes = 10000 this_max_steps = 1000 this_td_offline = True this_fa_spec = FuncApproxSpec( state_feature_funcs=FuncApproxBase.get_indicator_feature_funcs( mdp_ref_obj.all_states), action_feature_funcs=FuncApproxBase.get_indicator_feature_funcs( {m.name for m in Move}), dnn_spec=DNNSpec(neurons=[2, 4], hidden_activation=DNNSpec.relu, hidden_activation_deriv=DNNSpec.relu_deriv)) raa = RunAllAlgorithms(mdp_refined=mdp_ref_obj, tolerance=this_tolerance, first_visit_mc=this_first_visit_mc, num_samples=this_num_samples, softmax=this_softmax, epsilon=this_epsilon, epsilon_half_life=this_epsilon_half_life, learning_rate=this_learning_rate, learning_rate_decay=this_learning_rate_decay, lambd=this_lambd,
def get_rl_fa_price(self, num_dt: int, method: str,
                    exploring_start: bool, algorithm: TDAlgorithm,
                    softmax: bool, epsilon: float,
                    epsilon_half_life: float, lambd: float,
                    num_paths: int, batch_size: int,
                    feature_funcs: Sequence[
                        Callable[[Tuple[StateType, ActionType]], float]],
                    neurons: Optional[Sequence[int]],
                    learning_rate: float, learning_rate_decay: float,
                    adam: Tuple[bool, float, float],
                    offline: bool) -> float:
    """Price an American-style option by RL over an exercise/continue MDP.

    Builds an MDP whose state is (time index, price path so far) and whose
    actions are {True=exercise, False=continue}, trains the chosen RL
    algorithm ("MC", "TD0", "TDL", else exact TD(lambda)) to get a Q-value
    function, then evaluates the learned exercise policy on freshly
    simulated paths and returns the average discounted payoff.

    :param num_dt: number of time steps over self.expiry.
    :param method: "MC", "TD0", "TDL", or anything else for TDLambdaExact.
    :param num_paths: episodes for training and paths for evaluation.
    :param neurons: hidden-layer sizes; None means a linear approximator.
    :return: Monte-Carlo average of the policy's discounted payoffs.
    """
    dt = self.expiry / num_dt

    def sa_func(_: StateType) -> Set[ActionType]:
        # Both actions (exercise/continue) are available in every state.
        return {True, False}

    # noinspection PyShadowingNames
    def terminal_state(s: StateType, num_dt=num_dt) -> bool:
        # Episodes end once the time index passes the last step.
        return s[0] > num_dt

    # noinspection PyShadowingNames
    def sr_func(s: StateType, a: ActionType, num_dt=num_dt) \
            -> Tuple[StateType, float]:
        return self.state_reward_gen(s, a, num_dt)

    def init_s() -> StateType:
        # Start at time 0 with the path containing only the spot price.
        return 0, np.array([self.spot_price])

    def init_sa() -> Tuple[StateType, ActionType]:
        return init_s(), choice([True, False])

    # gamma=1 because discounting is folded into the rewards (see the
    # explicit exp(-ir) discounting in the evaluation loop below).
    # noinspection PyShadowingNames
    mdp_rep_obj = MDPRepForRLFA(state_action_func=sa_func,
                                gamma=1.,
                                terminal_state_func=terminal_state,
                                state_reward_gen_func=sr_func,
                                init_state_gen=init_s,
                                init_state_action_gen=init_sa)

    fa_spec = FuncApproxSpec(
        state_feature_funcs=[],
        sa_feature_funcs=feature_funcs,
        dnn_spec=(None if neurons is None else (DNNSpec(
            neurons=neurons,
            hidden_activation=DNNSpec.log_squish,
            hidden_activation_deriv=DNNSpec.log_squish_deriv,
            output_activation=DNNSpec.pos_log_squish,
            output_activation_deriv=DNNSpec.pos_log_squish_deriv))),
        learning_rate=learning_rate,
        adam_params=adam,
        add_unit_feature=False)

    # max_steps = num_dt + 2 gives one step of slack past the terminal
    # time index.
    if method == "MC":
        rl_fa_obj = MonteCarlo(mdp_rep_for_rl=mdp_rep_obj,
                               exploring_start=exploring_start,
                               softmax=softmax,
                               epsilon=epsilon,
                               epsilon_half_life=epsilon_half_life,
                               num_episodes=num_paths,
                               max_steps=num_dt + 2,
                               fa_spec=fa_spec)
    elif method == "TD0":
        rl_fa_obj = TD0(mdp_rep_for_rl=mdp_rep_obj,
                        exploring_start=exploring_start,
                        algorithm=algorithm,
                        softmax=softmax,
                        epsilon=epsilon,
                        epsilon_half_life=epsilon_half_life,
                        num_episodes=num_paths,
                        max_steps=num_dt + 2,
                        fa_spec=fa_spec)
    elif method == "TDL":
        rl_fa_obj = TDLambda(mdp_rep_for_rl=mdp_rep_obj,
                             exploring_start=exploring_start,
                             algorithm=algorithm,
                             softmax=softmax,
                             epsilon=epsilon,
                             epsilon_half_life=epsilon_half_life,
                             lambd=lambd,
                             num_episodes=num_paths,
                             batch_size=batch_size,
                             max_steps=num_dt + 2,
                             fa_spec=fa_spec,
                             offline=offline)
    else:
        rl_fa_obj = TDLambdaExact(mdp_rep_for_rl=mdp_rep_obj,
                                  exploring_start=exploring_start,
                                  algorithm=algorithm,
                                  softmax=softmax,
                                  epsilon=epsilon,
                                  epsilon_half_life=epsilon_half_life,
                                  lambd=lambd,
                                  num_episodes=num_paths,
                                  batch_size=batch_size,
                                  max_steps=num_dt + 2,
                                  state_feature_funcs=[],
                                  sa_feature_funcs=feature_funcs,
                                  learning_rate=learning_rate,
                                  learning_rate_decay=learning_rate_decay)

    # Learned Q-value function; qvf(state)(action) -> estimated value.
    qvf = rl_fa_obj.get_qv_func_fa(None)
    # init_s = (0, np.array([self.spot_price]))
    # val_exec = qvf(init_s)(True)
    # val_cont = qvf(init_s)(False)
    # true_false_spot_max = max(val_exec, val_cont)

    # Evaluate the learned exercise policy on fresh simulated paths.
    all_paths = self.get_all_paths(num_paths, num_dt + 1)
    prices = np.zeros(num_paths)

    for path_num, path in enumerate(all_paths):
        steps = 0
        price_seq = np.array([])
        while steps <= num_dt:
            price_seq = np.append(price_seq, path[steps])
            state = (steps, price_seq)
            # Discounted payoff if exercised now.
            exercise_price = np.exp(-self.ir(dt * steps)) * \
                self.payoff(dt * steps, price_seq)
            # Learned value of continuing (not exercising).
            continue_price = qvf(state)(False)
            steps += 1
            if exercise_price > continue_price:
                # Exercise: record the payoff and terminate this path.
                # NOTE(review): if exercise never beats the learned
                # continuation value (including at expiry), the path's
                # price stays 0 — confirm intended.
                prices[path_num] = exercise_price
                steps = num_dt + 1
                # print(state)
                # print(exercise_price)
                # print(continue_price)
                # print(qvf(state)(True))

    return np.average(prices)
def get_actor_mean_spec() -> FuncApproxSpec:
    """Spec for the actor's mean parameter.

    No state or state-action features and no DNN: a plain linear
    (constant-capable) function approximator.
    """
    empty_state_features = []
    empty_sa_features = []
    return FuncApproxSpec(
        state_feature_funcs=empty_state_features,
        sa_feature_funcs=empty_sa_features,
        dnn_spec=None
    )