Example #1
def get_lily_pads_mdp(n: int) -> MDPRefined:
    data = {
        i: {
            'A': {
                i - 1: (i / n, 0.),
                i + 1: (1. - i / n, 1. if i == n - 1 else 0.)
            },
            'B':
            {j: (1 / n, 1. if j == n else 0.)
             for j in range(n + 1) if j != i}
        }
        for i in range(1, n)
    }
    data[0] = {'A': {0: (1., 0.)}, 'B': {0: (1., 0.)}}
    data[n] = {'A': {n: (1., 0.)}, 'B': {n: (1., 0.)}}

    gamma = 1.0
    return MDPRefined(data, gamma)
Example #2
def get_lily_pads_mdp(n: int) -> MDPRefined:
    # MDP data structure: a dict keyed by state; each state maps to a dict keyed
    # by action; each action maps its possible successor states to a
    # (probability, reward) tuple.
    data = {
        i: {
            'A': {
                i - 1: (i / n, 0.),
                i + 1: (1. - i / n, 1. if i == n - 1 else 0.)
            },
            'B':
            {j: (1 / n, 1. if j == n else 0.)
             for j in range(n + 1) if j != i}
        }
        for i in range(1, n)
    }
    # Edge cases at i=0 and i=n: both are absorbing (self-loop with probability 1, reward 0)
    data[0] = {'A': {0: (1., 0.)}, 'B': {0: (1., 0.)}}
    data[n] = {'A': {n: (1., 0.)}, 'B': {n: (1., 0.)}}

    # Discount factor
    gamma = 1.0
    return MDPRefined(data, gamma)
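The comment in Example #2 describes the nested-dict format that MDPRefined consumes: state -> action -> successor state -> (probability, reward). Below is a minimal sketch of a sanity check on that format; it is plain Python, independent of the library, and the helper name validate_mdp_data as well as the toy_data dict are illustrative, not taken from any of the examples above.

def validate_mdp_data(data: dict) -> None:
    # For every (state, action) pair, the successor probabilities must sum to 1.
    for state, actions in data.items():
        for action, successors in actions.items():
            total = sum(prob for prob, _ in successors.values())
            assert abs(total - 1.0) < 1e-8, f"({state}, {action}) probs sum to {total}"

# Two-state example in the same format: from state 1, action 'a' either stays (reward 1)
# or moves to the terminal state 2; state 2 self-loops with zero reward.
toy_data = {
    1: {'a': {1: (0.5, 1.0), 2: (0.5, 0.0)}},
    2: {'a': {2: (1.0, 0.0)}}
}
validate_mdp_data(toy_data)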
Example #3
                1: (0.3, 19.8),
                2: (0.6, 16.7),
                3: (0.1, 1.8)
            },
        },
        3: {
            'a': {
                3: (1.0, 0.0)
            },
            'b': {
                3: (1.0, 0.0)
            }
        }
    }
    gamma_val = 0.9
    mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
    mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_adp()

    num_state_samples_val = 100
    num_action_samples_val = 100
    tol_val = 1e-4
    vf_fa_spec_val = FuncApproxSpec(
        state_feature_funcs=[
            lambda s: 1. if s == 1 else 0.,
            lambda s: 1. if s == 2 else 0.,
            lambda s: 1. if s == 3 else 0.
        ],
        action_feature_funcs=[],
        dnn_spec=DNNSpec(neurons=[2, 4],
                         hidden_activation=DNNSpec.relu,
                         hidden_activation_deriv=DNNSpec.relu_deriv,
                         output_activation=DNNSpec.identity,
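The state_feature_funcs passed to FuncApproxSpec in Example #3 are indicator functions that one-hot encode the states 1, 2 and 3. A standalone illustration of that encoding, in plain Python and independent of FuncApproxSpec (the variable names here are ours):

# Indicator feature functions like those used in Example #3:
# each state in {1, 2, 3} maps to a one-hot feature vector.
state_feature_funcs = [
    lambda s: 1. if s == 1 else 0.,
    lambda s: 1. if s == 2 else 0.,
    lambda s: 1. if s == 3 else 0.
]
for state in (1, 2, 3):
    print(state, [f(state) for f in state_feature_funcs])
# 1 [1.0, 0.0, 0.0]
# 2 [0.0, 1.0, 0.0]
# 3 [0.0, 0.0, 1.0]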
Example #4
                1: (0.2, 4.8),
                2: (0.4, 9.2),
                3: (0.4, -8.2)
            }
        },
        3: {
            'a': {
                3: (1.0, 0.0)
            },
            'b': {
                3: (1.0, 0.0)
            }
        }
    }
    gamma_val = 1.0
    mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
    mdp_rep_obj = MDPRepForRLFiniteSA(mdp_ref_obj1)

    algorithm_type = TDAlgorithm.SARSA
    softmax_flag = True
    epsilon_val = 0.1
    alpha_val = 0.1
    episodes_limit = 1000
    max_steps_val = 1000
    sarsa_obj = TD0(mdp_rep_obj, algorithm_type, softmax_flag, epsilon_val,
                    alpha_val, episodes_limit, max_steps_val)

    policy_data = {
        1: {
            'a': 0.4,
            'b': 0.6
Example #5
                1: (0.2, 4.8),
                2: (0.4, 9.2),
                3: (0.4, -8.2)
            }
        },
        3: {
            'a': {
                3: (1.0, 0.0)
            },
            'b': {
                3: (1.0, 0.0)
            }
        }
    }
    gamma_val = 1.0
    mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
    mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular()

    first_visit_flag = True
    softmax_flag = False
    episodes_limit = 10000
    epsilon_val = 0.1
    epsilon_half_life_val = 1000
    max_steps_val = 1000
    fa_spec_val = FuncApproxSpec(state_feature_funcs=[lambda s: float(s)],
                                 action_feature_funcs=[
                                     lambda a: 1. if a == 'a' else 0.,
                                     lambda a: 1. if a == 'b' else 0.,
                                     lambda a: 1. if a == 'c' else 0.,
                                 ],
                                 dnn_spec=None)
Example #6
    def get_mdp_refined(self) -> MDPRefined:
        return MDPRefined(self.get_mdp_refined_dict(), self.epoch_disc_factor)
Example #7
    def get_mdp_refined(self) -> MDPRefined:
        return MDPRefined(self.get_mdp_refined_dict(), gamma=1.)
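Examples #6 and #7 show the same pattern: a problem class exposes its transition structure through a get_mdp_refined_dict method (and, in Example #6, a discount-factor attribute epoch_disc_factor) and wraps them in MDPRefined. A hedged sketch of such a class follows; the class name ToyProblem and the particular dict contents are hypothetical, and the import path for MDPRefined depends on the project layout.

# Hypothetical problem class illustrating the pattern in Examples #6 and #7.
# Only the MDPRefined(...) call mirrors the examples; everything else is illustrative.
class ToyProblem:
    def __init__(self, epoch_disc_factor: float = 1.0) -> None:
        self.epoch_disc_factor = epoch_disc_factor

    def get_mdp_refined_dict(self) -> dict:
        # Same nested format as the earlier examples:
        # state -> action -> successor state -> (probability, reward)
        return {
            1: {'a': {1: (0.4, 1.0), 2: (0.6, 0.0)}},
            2: {'a': {2: (1.0, 0.0)}}
        }

    def get_mdp_refined(self) -> MDPRefined:
        return MDPRefined(self.get_mdp_refined_dict(), self.epoch_disc_factor)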
Example #8
                1: (0.3, 19.8),
                2: (0.6, 16.7),
                3: (0.1, 1.8)
            },
        },
        3: {
            (10, ): {
                3: (1.0, 0.0)
            },
            (-10, ): {
                3: (1.0, 0.0)
            }
        }
    }
    gamma_val = 0.9
    mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
    mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_adp_pg()

    reinforce_val = False

    num_state_samples_val = 100
    num_next_state_samples_val = 25
    num_action_samples_val = 20
    num_batches_val = 100
    max_steps_val = 100
    actor_lambda_val = 0.95
    critic_lambda_val = 0.95
    state_ff = [
        lambda s: 1. if s == 1 else 0.,
        lambda s: 1. if s == 2 else 0.,
        lambda s: 1. if s == 3 else 0.
    ]


if __name__ == '__main__':
    data = {
        1: {
            'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)},
            'b': {2: (0.3, -0.5), 3: (0.7, 2.6)},
            'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)}
        },
        2: {
            'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)},
            'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)}
        },
        3: {
            'a': {3: (1.0, 0.0)},
            'b': {3: (1.0, 0.0)}
        }
    }
    this_gamma = 0.95
    mdp_refined_obj = MDPRefined(data, this_gamma)
    this_mdp_rep_for_rl = MDPRepForRLTabular(mdp_refined_obj)
    print(this_mdp_rep_for_rl.state_action_dict)
    print(this_mdp_rep_for_rl.terminal_states)
    print(this_mdp_rep_for_rl.state_reward_gen_dict)
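The terminal_states printed at the end of Example #8 correspond to states such as 3 in the data dict above: every action self-loops with probability 1 and zero reward. One plausible way to read such states directly off the raw dict, sketched with a hypothetical helper (find_terminal_states is not library code):

# Hypothetical helper: treat a state as terminal when every one of its actions
# returns to the state itself with probability 1 and reward 0.
def find_terminal_states(data: dict) -> set:
    return {
        s for s, actions in data.items()
        if all(succ == {s: (1.0, 0.0)} for succ in actions.values())
    }

# find_terminal_states(data) -> {3} for the dict defined in the __main__ block above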