def test_off_policy_mc_control(episodes=5, print_q_table_and_policy=False):  # episodes=100000
    """
    Performs Off-policy MC Control on multiple environments (separately).
    """
    method_name = 'Off-policy MC Control'

    # Frozen Lake:
    fl_env = FrozenLake()
    fl_model = MCControlModel(fl_env, episodes, eps_max=fl_env.EPS_MIN)
    fl_policy, fl_scores, fl_accumulated_scores = \
        fl_model.perform_off_policy_mc_control(print_info=print_q_table_and_policy)
    plot_running_average(fl_env.name, method_name, fl_scores, window=episodes//100)
    plot_accumulated_scores(fl_env.name, method_name, fl_accumulated_scores)
    fl_scores, fl_accumulated_scores = run_policy_table(fl_env, fl_policy, episodes)
    plot_running_average(fl_env.name, method_name, fl_scores, window=episodes//100)
    plot_accumulated_scores(fl_env.name, method_name, fl_accumulated_scores)

    # Blackjack:
    blackjack_env = Blackjack()
    blackjack_model = MCControlModel(blackjack_env, episodes, eps_max=0.05, eps_dec=1e-7)
    blackjack_policy, _, blackjack_accumulated_scores = \
        blackjack_model.perform_off_policy_mc_control(print_info=print_q_table_and_policy)
    plot_accumulated_scores(blackjack_env.name, method_name, blackjack_accumulated_scores)
    _, blackjack_accumulated_scores = run_policy_table(blackjack_env, blackjack_policy, episodes)  # keep only the accumulated scores
    plot_accumulated_scores(blackjack_env.name, method_name, blackjack_accumulated_scores)
def play_ac(custom_env, n_episodes, fc_layers_dims, network_type, optimizer_type, alpha, beta,
            lib_type=LIBRARY_TORCH, enable_models_saving=False, load_checkpoint=False,
            plot=True, test=False):
    """
    Trains (and optionally tests) an Actor-Critic agent on the given environment.

    :param network_type:
        NETWORK_TYPE_SHARED - very helpful in more complex environments (like LunarLander).
        NETWORK_TYPE_SEPARATE - suitable for less complex environments (like MountainCar).
    """
    custom_env.env.seed(28)
    set_device(lib_type, devices_dict=None)

    method_name = 'AC'
    base_dir = 'tmp/' + custom_env.file_name + '/' + method_name + '/'

    agent = Agent(custom_env, fc_layers_dims, network_type, optimizer_type,
                  lr_actor=alpha, lr_critic=beta, lib_type=lib_type, base_dir=base_dir)

    scores_history = train_agent(custom_env, agent, n_episodes, enable_models_saving, load_checkpoint)
    if plot:
        plot_running_average(
            custom_env.name, method_name, scores_history,
            # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_train',
            directory=agent.chkpt_dir if enable_models_saving else None)

    scores_history_test = None
    if test:
        scores_history_test = run_trained_agent(custom_env, agent, enable_models_saving)
        if plot:
            plot_running_average(
                custom_env.name, method_name, scores_history_test,
                # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_test',
                directory=agent.chkpt_dir if enable_models_saving else None)

    return agent, scores_history, scores_history_test
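# Example usage (a minimal sketch, not prescribed by this module): the environment, layer sizes,
# learning rates and episode count below are illustrative assumptions, and OPTIMIZER_ADAM is a
# hypothetical placeholder - substitute whichever optimizer constant this project defines.
#
#   env = MountainCar()
#   agent, train_scores, test_scores = play_ac(
#       env, n_episodes=2000, fc_layers_dims=(256, 256),
#       network_type=NETWORK_TYPE_SEPARATE,   # separate actor/critic networks for a simpler env
#       optimizer_type=OPTIMIZER_ADAM,        # hypothetical placeholder constant
#       alpha=0.0001, beta=0.0005,
#       lib_type=LIBRARY_TORCH, plot=True, test=True)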
def play_pg(custom_env, n_episodes, fc_layers_dims, optimizer_type, alpha, ep_batch_num,
            lib_type=LIBRARY_TF, enable_models_saving=False, load_checkpoint=False,
            plot=True, test=False):
    """
    Trains (and optionally tests) a Policy Gradient agent on the given environment.
    """
    custom_env.env.seed(28)
    set_device(lib_type, devices_dict=None)

    method_name = 'PG'
    base_dir = 'tmp/' + custom_env.file_name + '/' + method_name + '/'

    agent = Agent(custom_env, fc_layers_dims, ep_batch_num, alpha,
                  optimizer_type=optimizer_type, lib_type=lib_type, base_dir=base_dir)

    scores_history = train_agent(custom_env, agent, n_episodes, ep_batch_num,
                                 enable_models_saving, load_checkpoint)
    if plot:
        plot_running_average(
            custom_env.name, method_name, scores_history,
            # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_train',
            directory=agent.chkpt_dir if enable_models_saving else None)

    scores_history_test = None
    if test:
        scores_history_test = run_trained_agent(custom_env, agent, enable_models_saving)
        if plot:
            plot_running_average(
                custom_env.name, method_name, scores_history_test,
                # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_test',
                directory=agent.chkpt_dir if enable_models_saving else None)

    return agent, scores_history, scores_history_test
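# Example usage (a minimal sketch, not prescribed by this module): the environment, layer sizes,
# learning rate, batch size and episode count are illustrative assumptions; OPTIMIZER_ADAM is a
# hypothetical placeholder for whichever optimizer constant this project defines.
#
#   env = CartPole()
#   agent, train_scores, test_scores = play_pg(
#       env, n_episodes=2500, fc_layers_dims=(128, 128),
#       optimizer_type=OPTIMIZER_ADAM,   # hypothetical placeholder constant
#       alpha=0.0005,
#       ep_batch_num=10,                 # presumably the number of episodes per policy update
#       lib_type=LIBRARY_TF, plot=True, test=True)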
def load_scores_history_and_plot(env_name, method_name, window, chkpt_dir,
                                 training_data=False, show_scores=False):
    """
    Loads a pickled scores history from a checkpoint directory and plots its running average.
    """
    try:
        print('...Loading scores_history...')
        suffix = '_train_total' if training_data else '_test'
        scores_history = pickle_load('scores_history' + suffix, chkpt_dir)
        plot_running_average(env_name, method_name, scores_history, window, show=show_scores,
                             file_name='scores_history' + suffix, directory=chkpt_dir)
    except FileNotFoundError:
        print('...No scores history data to load...')
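# Example usage (a minimal sketch; the checkpoint directory below mirrors the base_dir pattern
# used by the play_* functions and is an assumption, not a path this module guarantees to exist):
#
#   load_scores_history_and_plot(
#       env_name='CartPole', method_name='AC', window=100,
#       chkpt_dir='tmp/cart-pole/AC/',   # assumed layout: 'tmp/' + custom_env.file_name + '/' + method_name + '/'
#       training_data=True, show_scores=True)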
def test_double_q_learning(episodes=5):
    """
    Performs Double Q Learning (Off-policy TD0 Control) on multiple environments (separately).
    """
    method_name = 'Double Q Learning'

    # Taxi:
    tx_env = Taxi()
    tx_model = TD0ControlModel(tx_env, episodes, alpha=0.4)  # episodes=10000
    tx_q1_table, tx_q2_table, tx_scores, _ = tx_model.perform_double_q_learning()
    plot_running_average(tx_env.name, method_name, tx_scores)
    tx_q1_scores, _ = run_q_table(tx_env, tx_q1_table, episodes)
    tx_q2_scores, _ = run_q_table(tx_env, tx_q2_table, episodes)
    scores_list = [tx_q1_scores, tx_q2_scores]
    labels = ['Q1', 'Q2']
    plot_running_average_comparison(tx_env.name + ' - ' + method_name, scores_list, labels)

    # Mountain Car:
    mc_env = MountainCar()
    mc_model = TD0ControlModel(mc_env, episodes)
    mc_q1_table, mc_q2_table, mc_scores, _ = mc_model.perform_double_q_learning()
    plot_running_average(mc_env.name, method_name, mc_scores)
    mc_q1_scores, _ = run_q_table(mc_env, mc_q1_table, episodes)
    mc_q2_scores, _ = run_q_table(mc_env, mc_q2_table, episodes)
    scores_list = [mc_q1_scores, mc_q2_scores]
    labels = ['Q1', 'Q2']
    plot_running_average_comparison(mc_env.name + ' - ' + method_name, scores_list, labels)

    # Cart Pole:
    cp_env = CartPole()
    cp_model = TD0ControlModel(cp_env, episodes)
    cp_q1_table, cp_q2_table, cp_scores, _ = cp_model.perform_double_q_learning()
    plot_running_average(cp_env.name, method_name, cp_scores)
    cp_q1_scores, _ = run_q_table(cp_env, cp_q1_table, episodes)
    cp_q2_scores, _ = run_q_table(cp_env, cp_q2_table, episodes)
    scores_list = [cp_q1_scores, cp_q2_scores]
    labels = ['Q1', 'Q2']
    plot_running_average_comparison(cp_env.name + ' - ' + method_name, scores_list, labels)
def test_q_learning(episodes=5):
    """
    Performs Q Learning (Off-policy TD0 Control) on multiple environments (separately).
    """
    method_name = 'Q Learning'

    # Taxi:
    tx_env = Taxi()
    tx_model = TD0ControlModel(tx_env, episodes, alpha=0.4)  # episodes=10000
    tx_q_table, tx_scores, _ = tx_model.perform_q_learning()
    plot_running_average(tx_env.name, method_name, tx_scores)
    tx_scores, _ = run_q_table(tx_env, tx_q_table, episodes)
    plot_running_average(tx_env.name, method_name, tx_scores)

    # Mountain Car:
    mc_env = MountainCar()
    mc_model = TD0ControlModel(mc_env, episodes)
    mc_q_table, mc_scores, _ = mc_model.perform_q_learning()
    plot_running_average(mc_env.name, method_name, mc_scores)
    mc_scores, _ = run_q_table(mc_env, mc_q_table, episodes)
    plot_running_average(mc_env.name, method_name, mc_scores)

    # Cart Pole:
    cp_env = CartPole()
    cp_model = TD0ControlModel(cp_env, episodes)
    cp_q_table, cp_scores, _ = cp_model.perform_q_learning()
    plot_running_average(cp_env.name, method_name, cp_scores)
    cp_scores, _ = run_q_table(cp_env, cp_q_table, episodes)
    plot_running_average(cp_env.name, method_name, cp_scores)
def test_expected_sarsa(episodes=5):
    """
    Performs Expected SARSA (On-policy TD0 Control) on multiple environments (separately).
    """
    method_name = 'Expected SARSA'

    # Taxi:
    tx_env = Taxi()
    tx_model = TD0ControlModel(tx_env, episodes, alpha=0.4)  # episodes=10000
    tx_q_table, tx_scores, _ = tx_model.perform_expected_sarsa()
    plot_running_average(tx_env.name, method_name, tx_scores)
    tx_scores, _ = run_q_table(tx_env, tx_q_table, episodes)
    plot_running_average(tx_env.name, method_name, tx_scores)

    # Mountain Car:
    mc_env = MountainCar()
    mc_model = TD0ControlModel(mc_env, episodes)
    mc_q_table, mc_scores, _ = mc_model.perform_expected_sarsa()
    plot_running_average(mc_env.name, method_name, mc_scores)
    mc_scores, _ = run_q_table(mc_env, mc_q_table, episodes)
    plot_running_average(mc_env.name, method_name, mc_scores)

    # Cart Pole (Solved):
    cp_env = CartPole()
    cp_model = TD0ControlModel(cp_env, episodes)
    cp_q_table, cp_scores, _ = cp_model.perform_expected_sarsa()
    plot_running_average(cp_env.name, method_name, cp_scores)
    cp_scores, _ = run_q_table(cp_env, cp_q_table, episodes)
    plot_running_average(cp_env.name, method_name, cp_scores)

    # Acrobot:
    ab_env = Acrobot()
    ab_model = TD0ControlModel(ab_env, episodes)
    ab_q_table, ab_scores, _ = ab_model.perform_expected_sarsa()
    plot_running_average(ab_env.name, method_name, ab_scores)
    ab_scores, _ = run_q_table(ab_env, ab_q_table, episodes)
    plot_running_average(ab_env.name, method_name, ab_scores)
def play_dql(custom_env, n_episodes, fc_layers_dims, optimizer_type, alpha, double_dql, tau,
             lib_type=LIBRARY_TF, enable_models_saving=False, load_checkpoint=False,
             perform_random_gameplay=True, rnd_gameplay_episodes=None,
             plot=True, test=False):
    """
    Trains (and optionally tests) a Deep Q-Learning agent on the given environment.
    Requires a discrete action space.
    """
    if not custom_env.is_discrete_action_space:
        print('\n', "Environment's Action Space should be discrete!", '\n')
        return

    custom_env.env.seed(28)
    set_device(lib_type, devices_dict=None)

    method_name = 'DQL'
    base_dir = 'tmp/' + custom_env.file_name + '/' + method_name + '/'

    agent = Agent(custom_env, fc_layers_dims, n_episodes, alpha, optimizer_type,
                  double_dql=double_dql, tau=tau, lib_type=lib_type, base_dir=base_dir)

    scores_history = train_agent(custom_env, agent, n_episodes, enable_models_saving, load_checkpoint,
                                 perform_random_gameplay, rnd_gameplay_episodes)
    if plot:
        plot_running_average(
            custom_env.name, method_name, scores_history,
            # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_train',
            directory=agent.chkpt_dir if enable_models_saving else None)

    scores_history_test = None
    if test:
        scores_history_test = run_trained_agent(custom_env, agent, enable_models_saving)
        if plot:
            plot_running_average(
                custom_env.name, method_name, scores_history_test,
                # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_test',
                directory=agent.chkpt_dir if enable_models_saving else None)

    return agent, scores_history, scores_history_test
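# Example usage (a minimal sketch, not prescribed by this module): the environment, layer sizes,
# learning rate, tau and episode counts are illustrative assumptions; OPTIMIZER_ADAM is a
# hypothetical placeholder for whichever optimizer constant this project defines. Note that the
# environment must have a discrete action space, or the function returns early.
#
#   env = CartPole()
#   agent, train_scores, test_scores = play_dql(
#       env, n_episodes=500, fc_layers_dims=(256, 256),
#       optimizer_type=OPTIMIZER_ADAM,   # hypothetical placeholder constant
#       alpha=0.0005,
#       double_dql=True, tau=10000,      # tau: target-network update interval/rate (assumed semantics)
#       lib_type=LIBRARY_TF,
#       perform_random_gameplay=True, rnd_gameplay_episodes=50,
#       plot=True, test=True)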
def play_ddpg(custom_env, n_episodes, fc_layers_dims, optimizer_type, alpha, beta,
              lib_type=LIBRARY_TF, enable_models_saving=False, load_checkpoint=False,
              plot=True, test=False):
    """
    Trains (and optionally tests) a DDPG agent on the given environment.
    Requires a continuous action space and INPUT_TYPE_OBSERVATION_VECTOR input; Keras is not supported.
    """
    if custom_env.is_discrete_action_space:
        print('\n', "Environment's Action Space should be continuous!", '\n')
        return

    if custom_env.input_type != INPUT_TYPE_OBSERVATION_VECTOR:
        print('\n', 'Algorithm currently works only with INPUT_TYPE_OBSERVATION_VECTOR!', '\n')
        return

    if lib_type == LIBRARY_KERAS:
        print('\n', "Algorithm currently doesn't work with Keras", '\n')
        return

    tau = 0.001

    custom_env.env.seed(28)
    set_device(lib_type, devices_dict=None)

    method_name = 'DDPG'
    base_dir = 'tmp/' + custom_env.file_name + '/' + method_name + '/'

    agent = Agent(custom_env, fc_layers_dims, tau, optimizer_type, alpha, beta,
                  memory_batch_size=custom_env.memory_batch_size,
                  lib_type=lib_type, base_dir=base_dir)

    scores_history = train_agent(custom_env, agent, n_episodes, enable_models_saving, load_checkpoint)
    if plot:
        plot_running_average(
            custom_env.name, method_name, scores_history,
            # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_train',
            directory=agent.chkpt_dir if enable_models_saving else None)

    scores_history_test = None
    if test:
        scores_history_test = run_trained_agent(custom_env, agent, enable_models_saving)
        if plot:
            plot_running_average(
                custom_env.name, method_name, scores_history_test,
                # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_test',
                directory=agent.chkpt_dir if enable_models_saving else None)

    return agent, scores_history, scores_history_test
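# Example usage (a minimal sketch, not prescribed by this module): the environment wrapper name,
# layer sizes, learning rates and episode count are illustrative assumptions; OPTIMIZER_ADAM is a
# hypothetical placeholder for whichever optimizer constant this project defines. The environment
# must have a continuous action space and INPUT_TYPE_OBSERVATION_VECTOR input, and Keras is not supported.
#
#   env = MountainCarContinuous()   # hypothetical continuous-action wrapper; any such env in this project works
#   agent, train_scores, test_scores = play_ddpg(
#       env, n_episodes=1000, fc_layers_dims=(400, 300),
#       optimizer_type=OPTIMIZER_ADAM,   # hypothetical placeholder constant
#       alpha=0.000025, beta=0.00025,    # actor / critic learning rates
#       lib_type=LIBRARY_TF, plot=True, test=True)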