        # ... (body of get_gridworld() preceding this fragment is elided) ...
        )  # send all states and actions to environment

    gridworld.start_state_hash = (0, 0)
    return gridworld


if __name__ == "__main__":  # pragma: no cover

    from introrl.dp_funcs.dp_value_iter import dp_value_iteration

    gridworld = get_gridworld()
    #gridworld.summ_print()
    gridworld.layout_print(vname='reward', fmt='', show_env_states=True, none_str='*')
    gridworld.save_to_pickle_file(fname=None)

    policy, state_value = dp_value_iteration(gridworld, do_summ_print=True,
                                             max_iter=1000, err_delta=0.001, gamma=0.9)

    policy.save_diagram(gridworld, inp_colorD=None, save_name='sutton_5x5_gridworld',
                        show_arrows=True, scale=1.0, h_over_w=0.8)
import time

# NOTE: the original fragment omitted its imports and setup; the import
#       paths for Model, the random-walk simulation, and pickle_esp below
#       are assumed from introrl's package layout, not confirmed here.
from introrl.dp_funcs.dp_value_iter import dp_value_iteration
from introrl.environments.env_baseline import EnvBaseline
from introrl.agent_supt.model import Model
from introrl.black_box_sims.random_walk_1000 import RandomWalk_1000Simulation
from introrl.utils import pickle_esp

start_time = time.time()
RW = RandomWalk_1000Simulation()

# build a model of the black-box simulation by sampling its transitions
get_sim = Model( RW, build_initial_model=True )
get_sim.collect_transition_data( num_det_calls=100, num_stoic_calls=10000 )

RW.layout.s_hash_print()
#get_sim.num_calls_layout_print()
#get_sim.min_num_calls_layout_print()

# transfer the sampled transition data into a baseline environment
# so the dynamic-programming solver can operate on it
env = EnvBaseline( s_hash_rowL=RW.s_hash_rowL,
                   x_axis_label=RW.x_axis_label,
                   y_axis_label=RW.y_axis_label )
get_sim.add_all_data_to_an_environment( env )

policy, state_value = dp_value_iteration( env, do_summ_print=True,
                                          fmt_V='%.3f', fmt_R='%.1f',
                                          max_iter=1000, err_delta=0.0001,
                                          gamma=0.9, iteration_prints=10 )

policy.save_diagram( RW, inp_colorD=None, save_name='dp_rw1000_policy',
                     show_arrows=False, scale=0.5, h_over_w=0.8,
                     show_terminal_labels=False )

print( 'Total Time =', time.time() - start_time )

pickle_esp.save_to_pickle_file( fname='dp_soln_to_randwalk_1000',
                                env=env, state_values=state_value, policy=policy )
from introrl.dp_funcs.dp_value_iter import dp_value_iteration
from introrl.environments.env_baseline import EnvBaseline
from introrl.mdp_data.sutton_5x5_gridworld import get_gridworld

gridworld = get_gridworld()
gridworld.name = 'Figure 3.5, 5x5 Grid Value Iteration'

policy, state_value = dp_value_iteration( gridworld, do_summ_print=True, fmt_V='%.1f',
                                          max_iter=1000, err_delta=0.001, gamma=0.9,
                                          allow_multi_actions=True )

policy.save_diagram( gridworld, inp_colorD=None, save_name='figure_3_5_policy',
                     show_arrows=True, scale=0.8, h_over_w=0.8, do_show=True )
from introrl.mdp_data.fallen_3state_robot import get_robot
# NOTE: the original fragment omitted these imports; the paths below are
#       assumed from introrl's package layout, not confirmed here.
from introrl.dp_funcs.dp_value_iter import dp_value_iteration
from introrl.dp_funcs.dp_policy_iter import dp_policy_iteration
from introrl.policy import Policy
from introrl.state_values import StateValues

robot = get_robot()

do_VI = 0  # 0 = policy iteration, 1 = value iteration
if do_VI:
    print('_____________ Value Iteration ________________')
else:
    print('_____________ Policy Iteration ________________')

# sweep the discount factor to see how gamma changes the converged solution
for gamma in (0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999):
    if do_VI:
        policy, sv = dp_value_iteration(robot, do_summ_print=False, fmt_V='%.1f',
                                        max_iter=1000, err_delta=0.001, gamma=gamma)
    else:
        policy = Policy(environment=robot)
        policy.set_policy_from_piD(robot.get_default_policy_desc_dict())
        sv = StateValues(robot)
        sv.init_Vs_to_zero()
        dp_policy_iteration(policy, sv, do_summ_print=False,
                            max_iter=1000, err_delta=0.001,
                            gamma=gamma)  # closing argument assumed; the fragment was truncated here
import matplotlib
import matplotlib.pyplot as plt

from introrl.dp_funcs.dp_value_iter import dp_value_iteration
from introrl.mdp_data.gamblers_problem import get_gambler

gambler = get_gambler(prob_heads=0.4)

policy, state_value = dp_value_iteration(gambler, allow_multi_actions=True,
                                         do_summ_print=True, fmt_V='%.4f',
                                         max_iter=1000, err_delta=0.00001, gamma=1.0)
print(gambler.get_info())

# --------------- plot logic -------------------
min_state_list = []
min_action_list = []
state_list = []
action_list = []
for i_state in range(1, 100):
    # all optimal actions (stakes) with nonzero probability in this state
    aL = policy.get_list_of_all_action_desc_prob(i_state, incl_zero_prob=False)
    # trace the smallest optimal stake as a step function over capital
    min_state_list.append(i_state - 0.5)
    min_action_list.append(min([a for a, p in aL]))
    min_state_list.append(i_state + 0.5)
    min_action_list.append(min([a for a, p in aL]))
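# --- A minimal sketch of one way to finish the truncated plot logic. ---
# The original script ends mid-way through building its plot lists, so the
# lines below are an assumed completion using only standard matplotlib calls:
# they plot the minimum optimal stake versus capital (the classic Gambler's
# Problem policy shape from Sutton & Barto). The axis labels, title, and
# output file name are illustrative choices, not from the source.
fig, ax = plt.subplots()
ax.plot(min_state_list, min_action_list)
ax.set_xlabel('Capital')
ax.set_ylabel('Minimum Optimal Stake')
ax.set_title("Gambler's Problem Policy (prob_heads=0.4)")
fig.savefig('gamblers_problem_policy.png')
plt.show()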