def get_lily_pads_mdp(n: int) -> MDPRefined:
    # Data structure for the MDP problem: a dict whose top-level keys are states;
    # each state maps to a dict keyed by actions, and each action maps possible
    # successor states to a (probability, reward) tuple.
    data = {
        i: {
            'A': {
                i - 1: (i / n, 0.),
                i + 1: (1. - i / n, 1. if i == n - 1 else 0.)
            },
            'B': {j: (1 / n, 1. if j == n else 0.) for j in range(n + 1) if j != i}
        } for i in range(1, n)
    }
    # Transition probabilities for the edge cases at i = 0 and i = n
    data[0] = {'A': {0: (1., 0.)}, 'B': {0: (1., 0.)}}
    data[n] = {'A': {n: (1., 0.)}, 'B': {n: (1., 0.)}}
    # Discount factor
    gamma = 1.0
    return MDPRefined(data, gamma)
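# Illustrative usage sketch (not part of the original source): build the lily-pads
# MDP for n = 10 and inspect its tabular RL representation. This assumes
# get_mdp_rep_for_rl_tabular() (used in the tabular RL example further below)
# returns an object exposing the same attributes as MDPRepForRLTabular.
if __name__ == '__main__':
    lily_mdp = get_lily_pads_mdp(10)
    lily_rl_rep = lily_mdp.get_mdp_rep_for_rl_tabular()
    print(lily_rl_rep.state_action_dict)
    print(lily_rl_rep.terminal_states)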
            1: (0.3, 19.8),
            2: (0.6, 16.7),
            3: (0.1, 1.8)
        },
    },
    3: {
        'a': {3: (1.0, 0.0)},
        'b': {3: (1.0, 0.0)}
    }
}
gamma_val = 0.9
mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_adp()
# ADP sampling and convergence parameters
num_state_samples_val = 100
num_action_samples_val = 100
tol_val = 1e-4
# Value function approximation: one-hot state features, no action features,
# and a small DNN with two hidden layers of 2 and 4 neurons
vf_fa_spec_val = FuncApproxSpec(
    state_feature_funcs=[
        lambda s: 1. if s == 1 else 0.,
        lambda s: 1. if s == 2 else 0.,
        lambda s: 1. if s == 3 else 0.
    ],
    action_feature_funcs=[],
    dnn_spec=DNNSpec(
        neurons=[2, 4],
        hidden_activation=DNNSpec.relu,
        hidden_activation_deriv=DNNSpec.relu_deriv,
        output_activation=DNNSpec.identity,
            1: (0.2, 4.8),
            2: (0.4, 9.2),
            3: (0.4, -8.2)
        }
    },
    3: {
        'a': {3: (1.0, 0.0)},
        'b': {3: (1.0, 0.0)}
    }
}
gamma_val = 1.0
mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
mdp_rep_obj = MDPRepForRLFiniteSA(mdp_ref_obj1)
# SARSA (on-policy TD(0)) configuration
algorithm_type = TDAlgorithm.SARSA
softmax_flag = True
epsilon_val = 0.1
alpha_val = 0.1
episodes_limit = 1000
max_steps_val = 1000
sarsa_obj = TD0(
    mdp_rep_obj,
    algorithm_type,
    softmax_flag,
    epsilon_val,
    alpha_val,
    episodes_limit,
    max_steps_val
)
# Stochastic policy specification: action probabilities for each state
policy_data = {
    1: {
        'a': 0.4,
        'b': 0.6
            1: (0.2, 4.8),
            2: (0.4, 9.2),
            3: (0.4, -8.2)
        }
    },
    3: {
        'a': {3: (1.0, 0.0)},
        'b': {3: (1.0, 0.0)}
    }
}
gamma_val = 1.0
mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular()
# Episode and exploration settings: first-visit flag, epsilon-greedy exploration
# with a decaying epsilon (half-life of 1000 episodes)
first_visit_flag = True
softmax_flag = False
episodes_limit = 10000
epsilon_val = 0.1
epsilon_half_life_val = 1000
max_steps_val = 1000
# Function approximation spec: raw state value as the sole state feature,
# one-hot indicators over actions 'a', 'b', 'c' as action features, no DNN
fa_spec_val = FuncApproxSpec(
    state_feature_funcs=[lambda s: float(s)],
    action_feature_funcs=[
        lambda a: 1. if a == 'a' else 0.,
        lambda a: 1. if a == 'b' else 0.,
        lambda a: 1. if a == 'c' else 0.,
    ],
    dnn_spec=None
)
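# Quick standalone illustration (not in the original source) of the feature maps
# used in fa_spec_val above: the single state feature is the raw state value and
# the action features one-hot encode the actions 'a', 'b', 'c'.
state_feature = lambda s: float(s)
action_features = [
    lambda a: 1. if a == 'a' else 0.,
    lambda a: 1. if a == 'b' else 0.,
    lambda a: 1. if a == 'c' else 0.,
]
print(state_feature(2))                    # 2.0
print([f('b') for f in action_features])   # [0.0, 1.0, 0.0]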
def get_mdp_refined(self) -> MDPRefined:
    return MDPRefined(self.get_mdp_refined_dict(), self.epoch_disc_factor)
def get_mdp_refined(self) -> MDPRefined:
    # Same construction as above, but with no discounting (gamma fixed at 1.)
    return MDPRefined(self.get_mdp_refined_dict(), gamma=1.)
            1: (0.3, 19.8),
            2: (0.6, 16.7),
            3: (0.1, 1.8)
        },
    },
    3: {
        (10,): {3: (1.0, 0.0)},
        (-10,): {3: (1.0, 0.0)}
    }
}
gamma_val = 0.9
mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_adp_pg()
# Policy-gradient / actor-critic sampling parameters
reinforce_val = False
num_state_samples_val = 100
num_next_state_samples_val = 25
num_action_samples_val = 20
num_batches_val = 100
max_steps_val = 100
actor_lambda_val = 0.95
critic_lambda_val = 0.95
# One-hot state features for the three states
state_ff = [
    lambda s: 1. if s == 1 else 0.,
    lambda s: 1. if s == 2 else 0.,
    lambda s: 1. if s == 3 else 0.
]
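# Quick check (illustrative, not in the original source) of the one-hot state
# features defined in state_ff above: evaluating them at state 2 yields the
# one-hot vector [0.0, 1.0, 0.0].
print([f(2) for f in state_ff])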
             for s, v1 in self.state_action_dict.items()
             for a in v1}
        )
    )


if __name__ == '__main__':
    data = {
        1: {
            'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)},
            'b': {2: (0.3, -0.5), 3: (0.7, 2.6)},
            'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)}
        },
        2: {
            'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)},
            'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)}
        },
        3: {
            'a': {3: (1.0, 0.0)},
            'b': {3: (1.0, 0.0)}
        }
    }
    this_gamma = 0.95
    mdp_refined_obj = MDPRefined(data, this_gamma)
    this_mdp_rep_for_rl = MDPRepForRLTabular(mdp_refined_obj)
    print(this_mdp_rep_for_rl.state_action_dict)
    print(this_mdp_rep_for_rl.terminal_states)
    print(this_mdp_rep_for_rl.state_reward_gen_dict)
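    # Sanity check (illustrative, not part of the original script; assumes
    # terminal_states supports membership tests): state 3 transitions to itself
    # with probability 1.0 and zero reward under every action, so it should be
    # reported as a terminal state by the print above.
    assert 3 in this_mdp_rep_for_rl.terminal_states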