Example #1
def test_off_policy_mc_control(episodes=5, print_q_table_and_policy=False):  # episodes=100000
    """
    Performs Off-policy MC Control on multiple environments (separately).
    """
    method_name = 'Off-policy MC Control'

    # Frozen Lake:
    fl_env = FrozenLake()
    fl_model = MCControlModel(fl_env, episodes, eps_max=fl_env.EPS_MIN)
    fl_policy, fl_scores, fl_accumulated_scores = \
        fl_model.perform_off_policy_mc_control(print_info=print_q_table_and_policy)
    plot_running_average(fl_env.name, method_name, fl_scores, window=episodes//100)
    plot_accumulated_scores(fl_env.name, method_name, fl_accumulated_scores)
    fl_scores, fl_accumulated_scores = run_policy_table(fl_env, fl_policy, episodes)
    plot_running_average(fl_env.name, method_name, fl_scores, window=episodes//100)
    plot_accumulated_scores(fl_env.name, method_name, fl_accumulated_scores)

    # Blackjack:
    blackjack_env = Blackjack()
    blackjack_model = MCControlModel(blackjack_env, episodes, eps_max=0.05, eps_dec=1e-7)
    blackjack_policy, _, blackjack_accumulated_scores = \
        blackjack_model.perform_off_policy_mc_control(print_info=print_q_table_and_policy)
    plot_accumulated_scores(blackjack_env.name, method_name, blackjack_accumulated_scores)
    blackjack_accumulated_scores = run_policy_table(blackjack_env, blackjack_policy, episodes)
    plot_accumulated_scores(blackjack_env.name, method_name, blackjack_accumulated_scores)
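
# The wrapper above delegates the actual learning to MCControlModel.perform_off_policy_mc_control.
# For reference, below is a minimal, self-contained sketch of tabular off-policy MC control with
# weighted importance sampling (the textbook formulation). The old-Gym-style env API
# (reset() -> state, step(a) -> (state, reward, done, info)), the hashable discrete states, the
# function name and the hyperparameter defaults are illustrative assumptions, not the repo's code.
import numpy as np
from collections import defaultdict

def off_policy_mc_control_sketch(env, episodes, gamma=0.99, eps=0.1):
    n_actions = env.action_space.n
    Q = defaultdict(lambda: np.zeros(n_actions))  # action-value estimates for the greedy target policy
    C = defaultdict(lambda: np.zeros(n_actions))  # cumulative importance-sampling weights

    for _ in range(episodes):
        # Generate an episode with the epsilon-greedy behavior policy.
        episode, s, done = [], env.reset(), False
        while not done:
            if np.random.rand() < eps:
                a = np.random.randint(n_actions)
            else:
                a = int(np.argmax(Q[s]))
            s_next, r, done, _ = env.step(a)
            episode.append((s, a, r))
            s = s_next

        # Backward pass: update the greedy target policy with weighted importance sampling.
        G, W = 0.0, 1.0
        for s, a, r in reversed(episode):
            G = gamma * G + r
            C[s][a] += W
            Q[s][a] += (W / C[s][a]) * (G - Q[s][a])
            if a != int(np.argmax(Q[s])):
                break  # the greedy target policy would never take a; remaining weights are zero
            W /= 1 - eps + eps / n_actions  # behavior-policy probability of the greedy action
    return Q
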
Example #2
def play_ac(custom_env,
            n_episodes,
            fc_layers_dims,
            network_type,
            optimizer_type,
            alpha,
            beta,
            lib_type=LIBRARY_TORCH,
            enable_models_saving=False,
            load_checkpoint=False,
            plot=True,
            test=False):
    """
    :param network_type:
        NETWORK_TYPE_SHARED - very helpful in more complex environments (like LunarLander)
        NETWORK_TYPE_SEPARATE - suitable in less complex environments (like MountainCar)
    """

    custom_env.env.seed(28)

    set_device(lib_type, devices_dict=None)

    method_name = 'AC'
    base_dir = 'tmp/' + custom_env.file_name + '/' + method_name + '/'

    agent = Agent(custom_env,
                  fc_layers_dims,
                  network_type,
                  optimizer_type,
                  lr_actor=alpha,
                  lr_critic=beta,
                  lib_type=lib_type,
                  base_dir=base_dir)

    scores_history = train_agent(custom_env, agent, n_episodes,
                                 enable_models_saving, load_checkpoint)

    if plot:
        plot_running_average(
            custom_env.name,
            method_name,
            scores_history,
            # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_train',
            directory=agent.chkpt_dir if enable_models_saving else None)

    scores_history_test = None
    if test:
        scores_history_test = run_trained_agent(custom_env, agent,
                                                enable_models_saving)
        if plot:
            plot_running_average(
                custom_env.name,
                method_name,
                scores_history_test,
                # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_test',
                directory=agent.chkpt_dir if enable_models_saving else None)

    return agent, scores_history, scores_history_test
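
# The docstring above distinguishes NETWORK_TYPE_SHARED from NETWORK_TYPE_SEPARATE. As a rough
# illustration of the shared variant (one body feeding both an actor head and a critic head),
# here is a minimal PyTorch sketch; the class name, layer layout and activation choices are
# illustrative assumptions and not the internals of the repo's Agent. In the separate variant,
# the actor and the critic would each get a body of their own.
import torch.nn as nn

class SharedActorCriticNetSketch(nn.Module):
    def __init__(self, input_dims, fc_layers_dims, n_actions):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(input_dims, fc_layers_dims[0]), nn.ReLU(),
            nn.Linear(fc_layers_dims[0], fc_layers_dims[1]), nn.ReLU(),
        )
        self.actor_head = nn.Linear(fc_layers_dims[1], n_actions)  # unnormalized action preferences (logits)
        self.critic_head = nn.Linear(fc_layers_dims[1], 1)         # state-value estimate V(s)

    def forward(self, state):
        features = self.body(state)
        return self.actor_head(features), self.critic_head(features)
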
Example #3
def play_pg(custom_env,
            n_episodes,
            fc_layers_dims,
            optimizer_type,
            alpha,
            ep_batch_num,
            lib_type=LIBRARY_TF,
            enable_models_saving=False,
            load_checkpoint=False,
            plot=True,
            test=False):
    """
    Trains a Policy Gradient (PG) agent on the given environment, and optionally tests the trained agent.
    """

    custom_env.env.seed(28)

    set_device(lib_type, devices_dict=None)

    method_name = 'PG'
    base_dir = 'tmp/' + custom_env.file_name + '/' + method_name + '/'

    agent = Agent(custom_env,
                  fc_layers_dims,
                  ep_batch_num,
                  alpha,
                  optimizer_type=optimizer_type,
                  lib_type=lib_type,
                  base_dir=base_dir)

    scores_history = train_agent(custom_env, agent, n_episodes, ep_batch_num,
                                 enable_models_saving, load_checkpoint)

    if plot:
        plot_running_average(
            custom_env.name,
            method_name,
            scores_history,
            # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_train',
            directory=agent.chkpt_dir if enable_models_saving else None)

    scores_history_test = None
    if test:
        scores_history_test = run_trained_agent(custom_env, agent,
                                                enable_models_saving)
        if plot:
            plot_running_average(
                custom_env.name,
                method_name,
                scores_history_test,
                # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_test',
                directory=agent.chkpt_dir if enable_models_saving else None)

    return agent, scores_history, scores_history_test
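
# play_pg updates the agent once every ep_batch_num episodes (inside train_agent). The core
# quantity any policy-gradient update needs is the discounted return from each timestep of an
# episode. A minimal sketch of that computation (standard REINFORCE-style, with the common
# mean/std normalization trick) is shown below; it is illustrative, not the repo's exact code.
import numpy as np

def discounted_returns_sketch(rewards, gamma=0.99, normalize=True):
    # G_t = r_t + gamma * G_{t+1}, computed backwards over a single episode.
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    if normalize:  # variance reduction: center and scale before weighting log-probabilities
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns
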
Example #4
def load_scores_history_and_plot(env_name,
                                 method_name,
                                 window,
                                 chkpt_dir,
                                 training_data=False,
                                 show_scores=False):
    try:
        print('...Loading scores_history...')
        suffix = '_train_total' if training_data else '_test'
        scores_history = pickle_load('scores_history' + suffix, chkpt_dir)
        plot_running_average(env_name,
                             method_name,
                             scores_history,
                             window,
                             show=show_scores,
                             file_name='scores_history' + suffix,
                             directory=chkpt_dir)

    except FileNotFoundError:
        print('...No scores history data to load...')
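
# A typical call, assuming test scores were previously saved (pickled) under the checkpoint
# directory. The environment name, window size and directory below are placeholder values that
# follow the 'tmp/<file_name>/<method_name>/' convention used elsewhere in the repo, not real paths.
load_scores_history_and_plot(env_name='Cart Pole',
                             method_name='DQL',
                             window=100,
                             chkpt_dir='tmp/cart-pole/DQL/',
                             training_data=False,
                             show_scores=True)
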
Example #5
def test_double_q_learning(episodes=5):
    """
    Performs Double Q Learning (Off-policy TD0 Control) on multiple environments (separately).
    """
    method_name = 'Double Q Learning'

    # Taxi:
    tx_env = Taxi()
    tx_model = TD0ControlModel(tx_env, episodes, alpha=0.4)  # episodes=10000
    tx_q1_table, tx_q2_table, tx_scores, _ = tx_model.perform_double_q_learning()
    plot_running_average(tx_env.name, method_name, tx_scores)
    tx_q1_scores, _ = run_q_table(tx_env, tx_q1_table, episodes)
    tx_q2_scores, _ = run_q_table(tx_env, tx_q2_table, episodes)
    scores_list = [tx_q1_scores, tx_q2_scores]
    labels = ['Q1', 'Q2']
    plot_running_average_comparison(tx_env.name + ' - ' + method_name, scores_list, labels)

    # Mountain Car:
    mc_env = MountainCar()
    mc_model = TD0ControlModel(mc_env, episodes)
    mc_q1_table, mc_q2_table, mc_scores, _ = mc_model.perform_double_q_learning()
    plot_running_average(mc_env.name, method_name, mc_scores)
    mc_q1_scores, _ = run_q_table(mc_env, mc_q1_table, episodes)
    mc_q2_scores, _ = run_q_table(mc_env, mc_q2_table, episodes)
    scores_list = [mc_q1_scores, mc_q2_scores]
    labels = ['Q1', 'Q2']
    plot_running_average_comparison(mc_env.name + ' - ' + method_name, scores_list, labels)

    # Cart Pole:
    cp_env = CartPole()
    cp_model = TD0ControlModel(cp_env, episodes)
    cp_q1_table, cp_q2_table, cp_scores, _ = cp_model.perform_double_q_learning()
    plot_running_average(cp_env.name, method_name, cp_scores)
    cp_q1_scores, _ = run_q_table(cp_env, cp_q1_table, episodes)
    cp_q2_scores, _ = run_q_table(cp_env, cp_q2_table, episodes)
    scores_list = [cp_q1_scores, cp_q2_scores]
    labels = ['Q1', 'Q2']
    plot_running_average_comparison(cp_env.name + ' - ' + method_name, scores_list, labels)
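
# For reference, the per-step rule behind perform_double_q_learning keeps two tables and, on each
# transition, updates one of them while letting the other evaluate the greedy next action; this
# decoupling of action selection from action evaluation is what reduces Q-learning's maximization
# bias. A minimal sketch, assuming dict- or array-indexed Q tables rather than the repo's exact
# structures:
import numpy as np

def double_q_learning_update_sketch(Q1, Q2, s, a, r, s_next, done, alpha=0.1, gamma=0.99):
    if np.random.rand() < 0.5:
        a_star = int(np.argmax(Q1[s_next]))                          # Q1 selects...
        target = r + (0.0 if done else gamma * Q2[s_next][a_star])   # ...Q2 evaluates
        Q1[s][a] += alpha * (target - Q1[s][a])
    else:
        a_star = int(np.argmax(Q2[s_next]))                          # Q2 selects...
        target = r + (0.0 if done else gamma * Q1[s_next][a_star])   # ...Q1 evaluates
        Q2[s][a] += alpha * (target - Q2[s][a])
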
Example #6
def test_q_learning(episodes=5):
    """
    Performs Q Learning (Off-policy TD0 Control) on multiple environments (separately).
    """
    method_name = 'Q Learning'

    # Taxi:
    tx_env = Taxi()
    tx_model = TD0ControlModel(tx_env, episodes, alpha=0.4)  # episodes=10000
    tx_q_table, tx_scores, _ = tx_model.perform_q_learning()
    plot_running_average(tx_env.name, method_name, tx_scores)
    tx_scores, _ = run_q_table(tx_env, tx_q_table, episodes)
    plot_running_average(tx_env.name, method_name, tx_scores)

    # Mountain Car:
    mc_env = MountainCar()
    mc_model = TD0ControlModel(mc_env, episodes)
    mc_q_table, mc_scores, _ = mc_model.perform_q_learning()
    plot_running_average(mc_env.name, method_name, mc_scores)
    mc_scores, _ = run_q_table(mc_env, mc_q_table, episodes)
    plot_running_average(mc_env.name, method_name, mc_scores)

    # Cart Pole:
    cp_env = CartPole()
    cp_model = TD0ControlModel(cp_env, episodes)
    cp_q_table, cp_scores, _ = cp_model.perform_q_learning()
    plot_running_average(cp_env.name, method_name, cp_scores)
    cp_scores, _ = run_q_table(cp_env, cp_q_table, episodes)
    plot_running_average(cp_env.name, method_name, cp_scores)
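
# The per-step rule behind perform_q_learning is the standard off-policy TD(0) update, which
# bootstraps from the greedy value of the next state. A minimal sketch (dict- or array-indexed
# Q table, not the repo's exact structure):
import numpy as np

def q_learning_update_sketch(Q, s, a, r, s_next, done, alpha=0.1, gamma=0.99):
    # Q(s,a) <- Q(s,a) + alpha * [r + gamma * max_a' Q(s',a') - Q(s,a)]
    target = r + (0.0 if done else gamma * float(np.max(Q[s_next])))
    Q[s][a] += alpha * (target - Q[s][a])
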
Example #7
def test_expected_sarsa(episodes=5):
    """
    Performs Expected SARSA (On-policy TD0 Control) on multiple environments (separately).
    """
    method_name = 'Expected SARSA'

    # Taxi:
    tx_env = Taxi()
    tx_model = TD0ControlModel(tx_env, episodes, alpha=0.4)  # episodes=10000
    tx_q_table, tx_scores, _ = tx_model.perform_expected_sarsa()
    plot_running_average(tx_env.name, method_name, tx_scores)
    tx_scores, _ = run_q_table(tx_env, tx_q_table, episodes)
    plot_running_average(tx_env.name, method_name, tx_scores)

    # Mountain Car:
    mc_env = MountainCar()
    mc_model = TD0ControlModel(mc_env, episodes)
    mc_q_table, mc_scores, _ = mc_model.perform_expected_sarsa()
    plot_running_average(mc_env.name, method_name, mc_scores)
    mc_scores, _ = run_q_table(mc_env, mc_q_table, episodes)
    plot_running_average(mc_env.name, method_name, mc_scores)

    # Cart Pole (Solved):
    cp_env = CartPole()
    cp_model = TD0ControlModel(cp_env, episodes)
    cp_q_table, cp_scores, _ = cp_model.perform_expected_sarsa()
    plot_running_average(cp_env.name, method_name, cp_scores)
    cp_scores, _ = run_q_table(cp_env, cp_q_table, episodes)
    plot_running_average(cp_env.name, method_name, cp_scores)

    # Acrobot:
    ab_env = Acrobot()
    ab_model = TD0ControlModel(ab_env, episodes)
    ab_q_table, ab_scores, _ = ab_model.perform_expected_sarsa()
    plot_running_average(ab_env.name, method_name, ab_scores)
    ab_scores, _ = run_q_table(ab_env, ab_q_table, episodes)
    plot_running_average(ab_env.name, method_name, ab_scores)
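
# Expected SARSA replaces the max in the Q-learning target with the expectation of Q(s', .) under
# the current epsilon-greedy policy, removing the variance that comes from sampling the next
# action. A minimal sketch under that assumption (the repo's epsilon schedule may differ):
import numpy as np

def expected_sarsa_update_sketch(Q, s, a, r, s_next, done, alpha=0.1, gamma=0.99, eps=0.1):
    n_actions = len(Q[s_next])
    probs = np.full(n_actions, eps / n_actions)       # exploration mass, spread uniformly
    probs[int(np.argmax(Q[s_next]))] += 1.0 - eps     # remaining mass on the greedy action
    expected_q = float(np.dot(probs, Q[s_next]))
    target = r + (0.0 if done else gamma * expected_q)
    Q[s][a] += alpha * (target - Q[s][a])
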
Example #8
def play_dql(custom_env,
             n_episodes,
             fc_layers_dims,
             optimizer_type,
             alpha,
             double_dql,
             tau,
             lib_type=LIBRARY_TF,
             enable_models_saving=False,
             load_checkpoint=False,
             perform_random_gameplay=True,
             rnd_gameplay_episodes=None,
             plot=True,
             test=False):
    """
    Trains a Deep Q-Learning (DQL) agent on the given environment (discrete action spaces only),
    and optionally tests the trained agent.
    """

    if not custom_env.is_discrete_action_space:
        print('\n', "Environment's Action Space should be discrete!", '\n')
        return

    custom_env.env.seed(28)

    set_device(lib_type, devices_dict=None)

    method_name = 'DQL'
    base_dir = 'tmp/' + custom_env.file_name + '/' + method_name + '/'

    agent = Agent(custom_env,
                  fc_layers_dims,
                  n_episodes,
                  alpha,
                  optimizer_type,
                  double_dql=double_dql,
                  tau=tau,
                  lib_type=lib_type,
                  base_dir=base_dir)

    scores_history = train_agent(custom_env, agent, n_episodes,
                                 enable_models_saving, load_checkpoint,
                                 perform_random_gameplay,
                                 rnd_gameplay_episodes)

    if plot:
        plot_running_average(
            custom_env.name,
            method_name,
            scores_history,
            # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_train',
            directory=agent.chkpt_dir if enable_models_saving else None)

    scores_history_test = None
    if test:
        scores_history_test = run_trained_agent(custom_env, agent,
                                                enable_models_saving)
        if plot:
            plot_running_average(
                custom_env.name,
                method_name,
                scores_history_test,
                # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_test',
                directory=agent.chkpt_dir if enable_models_saving else None)

    return agent, scores_history, scores_history_test
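
# When double_dql=True, the defining change relative to vanilla deep Q-learning is how the
# bootstrap target is built: the online network selects the next action, and the target network
# evaluates it. A minimal PyTorch sketch of that target computation (the network objects and
# tensor shapes are assumptions, not the repo's Agent internals):
import torch

def double_dql_targets_sketch(q_net, target_net, rewards, next_states, dones, gamma=0.99):
    # rewards, dones: shape (batch,); next_states: shape (batch, obs_dim)
    with torch.no_grad():
        next_actions = q_net(next_states).argmax(dim=1, keepdim=True)        # online net selects
        next_q = target_net(next_states).gather(1, next_actions).squeeze(1)  # target net evaluates
    return rewards + gamma * next_q * (1.0 - dones)
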
Example #9
def play_ddpg(custom_env,
              n_episodes,
              fc_layers_dims,
              optimizer_type,
              alpha,
              beta,
              lib_type=LIBRARY_TF,
              enable_models_saving=False,
              load_checkpoint=False,
              plot=True,
              test=False):
    """
    Trains a DDPG agent on the given environment (continuous action spaces, vector observations only),
    and optionally tests the trained agent.
    """

    if custom_env.is_discrete_action_space:
        print('\n', "Environment's Action Space should be continuous!", '\n')
        return

    if custom_env.input_type != INPUT_TYPE_OBSERVATION_VECTOR:
        print(
            '\n',
            'Algorithm currently works only with INPUT_TYPE_OBSERVATION_VECTOR!',
            '\n')
        return

    if lib_type == LIBRARY_KERAS:
        print('\n', "Algorithm currently doesn't work with Keras", '\n')
        return

    tau = 0.001

    custom_env.env.seed(28)

    set_device(lib_type, devices_dict=None)

    method_name = 'DDPG'
    base_dir = 'tmp/' + custom_env.file_name + '/' + method_name + '/'

    agent = Agent(custom_env,
                  fc_layers_dims,
                  tau,
                  optimizer_type,
                  alpha,
                  beta,
                  memory_batch_size=custom_env.memory_batch_size,
                  lib_type=lib_type,
                  base_dir=base_dir)

    scores_history = train_agent(custom_env, agent, n_episodes,
                                 enable_models_saving, load_checkpoint)

    if plot:
        plot_running_average(
            custom_env.name,
            method_name,
            scores_history,
            # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_train',
            directory=agent.chkpt_dir if enable_models_saving else None)

    scores_history_test = None
    if test:
        scores_history_test = run_trained_agent(custom_env, agent,
                                                enable_models_saving)
        if plot:
            plot_running_average(
                custom_env.name,
                method_name,
                scores_history_test,
                # file_name=get_file_name(custom_env.file_name, agent, n_episodes, method_name) + '_test',
                directory=agent.chkpt_dir if enable_models_saving else None)

    return agent, scores_history, scores_history_test
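
# The hard-coded tau = 0.001 above is the Polyak-averaging coefficient DDPG uses to make the
# target networks slowly track the online actor and critic. A minimal PyTorch sketch of that
# soft update, applied to both target networks after each learning step (illustrative, not the
# repo's Agent code):
import torch

@torch.no_grad()
def soft_update_sketch(target_net, online_net, tau=0.001):
    # target <- tau * online + (1 - tau) * target, parameter by parameter
    for t_param, o_param in zip(target_net.parameters(), online_net.parameters()):
        t_param.mul_(1.0 - tau).add_(tau * o_param)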