Example #1
def run_qrm_save_model(alg_name, tester, curriculum, num_times, show_print):
    learning_params = tester.learning_params
    json_saver = Saver(alg_name, tester, curriculum)

    time_init = time.time()
    for n in range(num_times):
        random.seed(n)
        sess = tf.Session()

        curriculum.restart()
        # Creating the experience replay buffer
        prioritized_replay_beta_iters = learning_params.prioritized_replay_beta_iters
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = curriculum.total_steps

        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size, learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            prioritized_replay_beta_iters)

        # Creating policy bank
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params,
                                    tester.get_reward_machines())

        # Task loop
        while not curriculum.stop_learning():
            rm_file = curriculum.get_next_task()
            run_qrm_task(sess, rm_file, policy_bank, tester, curriculum,
                         replay_buffer, beta_schedule, show_print)

        # Save session
        if task_aux.params.game_type == "craftworld":
            save_model_path = ('../model/' + str(task_aux.params.game_type) +
                               '/' + task_aux.game.get_map_id() + '/' +
                               str(alg_name))
        else:
            save_model_path = ('../model/' + str(task_aux.params.game_type) +
                               '/' + str(alg_name))

        print("Saving model to {} ...".format(save_model_path))
        saver = tf.train.Saver()
        saver.save(sess, save_model_path)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        json_saver.save_results()

    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
Example #2
def run_qrm_experiments(alg_name, tester, curriculum, num_times, show_print):
    # Setting up the saver
    saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params

    # Running the tasks 'num_times'
    time_init = time.time()
    for t in range(num_times):
        # Setting the random seed to 't'
        random.seed(t)
        sess = tf.Session()

        # Resetting default values
        curriculum.restart()

        # Creating the experience replay buffer
        prioritized_replay_beta_iters = learning_params.prioritized_replay_beta_iters
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = curriculum.total_steps
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size, learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            prioritized_replay_beta_iters)

        # Creating policy bank
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params,
                                    tester.get_reward_machines())

        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(), "from",
                      curriculum.total_steps)
            rm_file = curriculum.get_next_task()
            # Running 'task_rm_id' for one episode
            run_qrm_task(sess, rm_file, policy_bank, tester, curriculum,
                         replay_buffer, beta_schedule, show_print)
        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
Example #3
def run_hrl_experiments(alg_name, tester, curriculum, num_times, show_print,
                        use_rm):
    """
        NOTE: To implement this baseline, we encode each option as a reward machine with one transition
        - use_rm: Indicates whether to prune options using the reward machine
    """

    # Setting up the saver
    saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params

    # Running the tasks 'num_times'
    time_init = time.time()
    for t in range(num_times):

        # Setting the random seed to 't'
        random.seed(t)
        sess = tf.Session()

        # Resetting default values
        curriculum.restart()

        # Creating the experience replay buffer
        prioritized_replay_beta_iters = learning_params.prioritized_replay_beta_iters
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = curriculum.total_steps
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size, learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            prioritized_replay_beta_iters)

        # Loading options for this experiment
        option_folder = "../experiments/%s/options/" % tester.get_world_name()

        # NOTE: The policy bank also uses this list (in the same order)
        options = []
        option2file = []
        # NOTE: The option id indicates what the option does (e.g. "a&!n")
        for option_file in _get_option_files(option_folder):
            option = RewardMachine(join(option_folder, option_file + ".txt"))
            options.append(option)
            option2file.append(option_file)

        # getting the number of network inputs and outputs
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        # initializing the meta-controllers (one per task)
        meta_controllers = []
        reward_machines = tester.get_reward_machines()
        for i in range(len(reward_machines)):
            rm = reward_machines[i]
            num_states = len(rm.get_states())
            policy_name = "Reward_Machine_%d" % i
            mc = MetaController(sess, policy_name, options, option2file, rm,
                                use_rm, learning_params, num_features,
                                num_states, show_print)
            meta_controllers.append(mc)

        # initializing the bank of policies with one policy per option
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params, options)

        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(), "from",
                      curriculum.total_steps)
            rm_file = curriculum.get_next_task()

            # Running 'rm_file' for one episode
            run_hrl_baseline(sess, rm_file, meta_controllers, options,
                             policy_bank, tester, curriculum, replay_buffer,
                             beta_schedule, show_print)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
Example #4
def run_aqrm_experiments(alg_name, tester, tester_learned, curriculum,
                         num_times, show_print):
    # Setting up the saver
    saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params

    # Running the tasks 'num_times'
    time_init = time.time()
    for t in range(num_times):
        # Setting the random seed to 't'
        random.seed(t)
        sess = tf.Session()

        # Resetting default values
        curriculum.restart()

        # Creating the experience replay buffer
        prioritized_replay_beta_iters = learning_params.prioritized_replay_beta_iters
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = curriculum.total_steps
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size, learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            prioritized_replay_beta_iters)

        # Creating policy bank
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        hypothesis_machine = tester.get_hypothesis_machine()
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params, hypothesis_machine)
        all_traces = Traces()
        # Task loop
        num_episodes = 0
        while not curriculum.stop_learning():
            num_episodes += 1
            if len(all_traces.positive) > 0:
                print(all_traces)
                #pdb.set_trace()
            if show_print:
                print("Current step:", curriculum.get_current_step(), "from",
                      curriculum.total_steps)
            # underlying_rm_file = curriculum.get_next_task()
            rm_file_truth = '../experiments/office/reward_machines/t1.txt'  # ground-truth RM; set this file path at the beginning
            rm_file_learned = '../experiments/office/reward_machines/xyz.txt'  # learned RM; this file should be updated in the learning step, where run_aqrm_task can be repeated
            # Running 'task_rm_id' for one episode
            all_events, found_reward = run_aqrm_task(sess, rm_file_truth,
                                                     rm_file_learned,
                                                     policy_bank, tester,
                                                     tester_learned,
                                                     curriculum, replay_buffer,
                                                     beta_schedule, show_print)

            expected_reward = hypothesis_machine.calculate_reward(all_events)
            #pdb.set_trace()
            if found_reward != expected_reward:
                # the learning should happen here
                print("learning")

            # save the trace; it will be used to create an underlying reward automaton
            all_traces.add_trace(all_events, found_reward)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
Example #5
def run_hrl_save_model(alg_name, tester, curriculum, num_times, show_print,
                       use_rm):
    """
        NOTE: To implement this baseline, we encode each option as a reward machine with one transition
        - use_rm: Indicates whether to prune options using the reward machine
    """

    # Setting up the saver
    json_saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params

    # Running the tasks 'num_times'
    time_init = time.time()
    for t in range(num_times):

        # Setting the random seed to 't'
        random.seed(t)
        sess = tf.Session()

        # Resetting default values
        curriculum.restart()

        # Creating the experience replay buffer
        prioritized_replay_beta_iters = learning_params.prioritized_replay_beta_iters
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = curriculum.total_steps
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size, learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            prioritized_replay_beta_iters)

        options, option2file = get_options_rm(tester)

        # getting the number of network inputs and outputs
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        # initializing the meta-controllers (one per task)
        meta_controllers = []
        reward_machines = tester.get_reward_machines()
        for i in range(len(reward_machines)):
            rm = reward_machines[i]
            num_states = len(rm.get_states())
            policy_name = "Reward_Machine_%d" % i
            mc = MetaController(sess, policy_name, options, option2file, rm,
                                use_rm, learning_params, num_features,
                                num_states, show_print)
            meta_controllers.append(mc)

        # initializing the bank of policies with one policy per option
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params, options)
        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(), "from",
                      curriculum.total_steps)
            rm_file = curriculum.get_next_task()

            # Running 'rm_file' for one episode
            run_hrl_baseline(sess, rm_file, meta_controllers, options,
                             policy_bank, tester, curriculum, replay_buffer,
                             beta_schedule, show_print)

        # Save session
        if task_aux.params.game_type == "craftworld":
            save_model_path = ('../model/' + str(task_aux.params.game_type) +
                               '/' + task_aux.game.get_map_id() + '/' +
                               str(alg_name))
        else:
            save_model_path = ('../model/' + str(task_aux.params.game_type) +
                               '/' + str(alg_name))

        print("Saving model to {} ...".format(save_model_path))
        saver = tf.train.Saver()
        saver.save(sess, save_model_path)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        json_saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
Example #6
def run_options_save_model(alg_name, tester, curriculum, num_times,
                           show_print):
    # Setting up the saver
    json_saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params

    # Running the tasks num_times
    time_init = time.time()
    for t in range(num_times):
        random.seed(t)
        sess = tf.Session()

        curriculum.restart()

        # Creating the experience replay buffer
        prioritized_replay_beta_iters = learning_params.prioritized_replay_beta_iters
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = curriculum.total_steps
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size, learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            prioritized_replay_beta_iters)

        options, option2file = get_options_rm(tester)
        curr_option_id = 0
        # getting the number of network inputs and outputs
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        # initializing the bank of policies with one policy per option
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params, options)

        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(), "from",
                      curriculum.total_steps)

            rm_file = curriculum.get_next_task()
            curr_option_id = (curr_option_id + 1) % len(options)

            # Running 'curr_option' for one episode
            run_options_task(sess, rm_file, curr_option_id, options,
                             option2file, policy_bank, tester, curriculum,
                             replay_buffer, beta_schedule, show_print)

        # Save session
        if task_aux.params.game_type != "officeworld":
            save_model_path = ('../model/' + str(task_aux.params.game_type) +
                               '/' + task_aux.game.get_map_id() + '/' +
                               str(alg_name))
        else:
            save_model_path = ('../model/' + str(task_aux.params.game_type) +
                               '/' + str(alg_name))

        print("Saving model to {} ...".format(save_model_path))
        saver = tf.train.Saver()
        saver.save(sess, save_model_path)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        json_saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")