def run_qrm_save_model(alg_name, tester, curriculum, num_times, show_print):
    """Train QRM for `num_times` independent runs, saving a TF checkpoint per run.

    Each run reseeds `random`, rebuilds the TF graph/session, trains until the
    curriculum says to stop, then writes the model under ../model/<game_type>/
    (craftworld additionally gets a per-map subfolder) and backs up results.
    """
    learning_params = tester.learning_params
    json_saver = Saver(alg_name, tester, curriculum)
    time_init = time.time()

    for run_idx in range(num_times):
        random.seed(run_idx)
        sess = tf.Session()
        curriculum.restart()

        # Experience replay buffer; default the beta annealing horizon to the
        # full training length when it is not configured explicitly.
        beta_iters = learning_params.prioritized_replay_beta_iters
        if beta_iters is None:
            beta_iters = curriculum.total_steps
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size,
            learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            beta_iters)

        # Size the policy bank from an auxiliary game instance.
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params,
                                    tester.get_reward_machines())

        # Task loop: train one episode per curriculum-selected task.
        while not curriculum.stop_learning():
            rm_file = curriculum.get_next_task()
            run_qrm_task(sess, rm_file, policy_bank, tester, curriculum,
                         replay_buffer, beta_schedule, show_print)

        # Save session (craftworld checkpoints are split per map id).
        game_type = str(task_aux.params.game_type)
        if task_aux.params.game_type == "craftworld":
            save_model_path = ('../model/' + game_type + '/'
                               + task_aux.game.get_map_id()
                               + '/' + str(alg_name))
        else:
            save_model_path = '../model/' + game_type + '/' + str(alg_name)
        print("Saving model to {} ...".format(save_model_path))
        saver = tf.train.Saver()
        saver.save(sess, save_model_path)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        json_saver.save_results()

    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
def run_dqn_save_model(alg_name, tester, curriculum, num_times, show_print):
    """Train the DQN baseline for `num_times` runs and checkpoint each run.

    Mirrors run_qrm_save_model but uses the plain PolicyBank and the DQN
    baseline episode runner; no experience-replay buffer is created here.
    """
    json_saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params
    time_init = time.time()

    for seed in range(num_times):
        # Each run is seeded with its index for reproducibility.
        random.seed(seed)
        sess = tf.Session()
        curriculum.restart()

        # Build the policy bank from an auxiliary game instance.
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())
        policy_bank = PolicyBank(sess, num_actions, num_features,
                                 learning_params, curriculum,
                                 tester.get_reward_machines())

        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(),
                      "from", curriculum.total_steps)
            rm_file = curriculum.get_next_task()
            run_dqn_baseline(sess, rm_file, policy_bank, tester, curriculum,
                             show_print)

        # Save session (craftworld checkpoints are split per map id).
        game_type = str(task_aux.params.game_type)
        if task_aux.params.game_type == "craftworld":
            save_model_path = ('../model/' + game_type + '/'
                               + task_aux.game.get_map_id()
                               + '/' + str(alg_name))
        else:
            save_model_path = '../model/' + game_type + '/' + str(alg_name)
        print("Saving model to {} ...".format(save_model_path))
        saver = tf.train.Saver()
        saver.save(sess, save_model_path)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        json_saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
def run_qrm_experiments(alg_name, tester, curriculum, num_times, show_print):
    """Run the QRM algorithm `num_times` times and back up the results.

    Unlike run_qrm_save_model, no model checkpoint is written; only the
    tester's results are saved via the JSON saver.
    """
    saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params
    time_init = time.time()

    for seed in range(num_times):
        # Seed each run with its index for reproducibility.
        random.seed(seed)
        sess = tf.Session()
        curriculum.restart()

        # Experience replay buffer; the beta annealing horizon defaults to
        # the total number of curriculum steps when unset.
        beta_iters = learning_params.prioritized_replay_beta_iters
        if beta_iters is None:
            beta_iters = curriculum.total_steps
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size,
            learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            beta_iters)

        # Size the policy bank from an auxiliary game instance.
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params,
                                    tester.get_reward_machines())

        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(),
                      "from", curriculum.total_steps)
            rm_file = curriculum.get_next_task()
            # Running 'task_rm_id' for one episode
            run_qrm_task(sess, rm_file, policy_bank, tester, curriculum,
                         replay_buffer, beta_schedule, show_print)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
def run_dqn_experiments(alg_name, tester, curriculum, num_times, show_print):
    """Run the DQN baseline `num_times` times and back up the results.

    No model checkpoint is written; only the tester's results are saved.
    """
    saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params
    time_init = time.time()

    for seed in range(num_times):
        # Seed each run with its index for reproducibility.
        random.seed(seed)
        sess = tf.Session()
        curriculum.restart()

        # Build the policy bank from an auxiliary game instance.
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())
        policy_bank = PolicyBank(sess, num_actions, num_features,
                                 learning_params, curriculum,
                                 tester.get_reward_machines())

        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(),
                      "from", curriculum.total_steps)
            rm_file = curriculum.get_next_task()
            run_dqn_baseline(sess, rm_file, policy_bank, tester, curriculum,
                             show_print)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
def run_hrl_experiments(alg_name, tester, curriculum, num_times, show_print,
                        use_rm):
    """Run the HRL baseline `num_times` times and back up the results.

    NOTE: To implement this baseline, we encode each option as a reward
    machine with one transition.
    - use_rm: Indicates whether to prune options using the reward machine
    """
    saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params
    time_init = time.time()

    for seed in range(num_times):
        # Seed each run with its index for reproducibility.
        random.seed(seed)
        sess = tf.Session()
        curriculum.restart()

        # Experience replay buffer; the beta annealing horizon defaults to
        # the total number of curriculum steps when unset.
        beta_iters = learning_params.prioritized_replay_beta_iters
        if beta_iters is None:
            beta_iters = curriculum.total_steps
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size,
            learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            beta_iters)

        # Loading options for this experiment.
        # NOTE: The option id indicates what the option does (e.g. "a&!n");
        # the policy bank uses the `options` list in this same order.
        option_folder = "../experiments/%s/options/" % tester.get_world_name()
        options = []
        option2file = []
        for option_file in _get_option_files(option_folder):
            options.append(
                RewardMachine(join(option_folder, option_file + ".txt")))
            option2file.append(option_file)

        # Getting num inputs and outputs net.
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        # Initializing the meta controllers (one metacontroller per task).
        reward_machines = tester.get_reward_machines()
        meta_controllers = []
        for rm_id, rm in enumerate(reward_machines):
            num_states = len(rm.get_states())
            policy_name = "Reward_Machine_%d" % rm_id
            meta_controllers.append(
                MetaController(sess, policy_name, options, option2file, rm,
                               use_rm, learning_params, num_features,
                               num_states, show_print))

        # Initializing the bank of policies with one policy per option.
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params, options)

        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(),
                      "from", curriculum.total_steps)
            rm_file = curriculum.get_next_task()
            # Running 'rm_file' for one episode
            run_hrl_baseline(sess, rm_file, meta_controllers, options,
                             policy_bank, tester, curriculum, replay_buffer,
                             beta_schedule, show_print)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
def run_aqrm_experiments(alg_name, tester, tester_learned, curriculum,
                         num_times, show_print):
    """Run the AQRM experiments: QRM over a *hypothesis* reward machine.

    After each episode the reward predicted by the hypothesis machine is
    compared against the reward actually found; mismatching episodes are
    the trigger for re-learning the automaton, and every episode trace is
    recorded in `all_traces` for that purpose.

    - tester_learned: tester configured with the learned (hypothesis) machine
    """
    saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params
    time_init = time.time()

    for t in range(num_times):
        # Setting the random seed to 't'
        random.seed(t)
        sess = tf.Session()
        curriculum.restart()

        # Experience replay buffer; the beta annealing horizon defaults to
        # the total number of curriculum steps when unset.
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size,
            learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            curriculum.total_steps
            if learning_params.prioritized_replay_beta_iters is None
            else learning_params.prioritized_replay_beta_iters)

        # Creating policy bank over the hypothesis machine (not the ground
        # truth machines used by the other experiment drivers).
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())
        hypothesis_machine = tester.get_hypothesis_machine()
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params, hypothesis_machine)

        all_traces = Traces()

        # Task loop
        num_episodes = 0
        while not curriculum.stop_learning():
            num_episodes += 1
            if len(all_traces.positive) > 0:
                print(all_traces)  # NOTE(review): debug output; consider
                                   # removing or gating behind show_print
            if show_print:
                print("Current step:", curriculum.get_current_step(),
                      "from", curriculum.total_steps)

            # TODO: these paths are hard-coded to the office domain; the
            # learned file should be rewritten by the automaton-learning step.
            rm_file_truth = '../experiments/office/reward_machines/t1.txt'
            rm_file_learned = '../experiments/office/reward_machines/xyz.txt'

            # Running 'task_rm_id' for one episode
            all_events, found_reward = run_aqrm_task(
                sess, rm_file_truth, rm_file_learned, policy_bank, tester,
                tester_learned, curriculum, replay_buffer, beta_schedule,
                show_print)

            expected_reward = hypothesis_machine.calculate_reward(all_events)
            # A mismatch means the hypothesis machine mis-predicts this trace:
            # the automaton should be re-learned here.
            if found_reward != expected_reward:
                print("learning")

            # Save the trace; it will be used to create an underlying reward
            # automaton.
            all_traces.add_trace(all_events, found_reward)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
def run_hrl_save_model(alg_name, tester, curriculum, num_times, show_print,
                       use_rm):
    """Train the HRL baseline `num_times` times, checkpointing each run.

    NOTE: To implement this baseline, we encode each option as a reward
    machine with one transition.
    - use_rm: Indicates whether to prune options using the reward machine
    """
    json_saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params
    time_init = time.time()

    for seed in range(num_times):
        # Seed each run with its index for reproducibility.
        random.seed(seed)
        sess = tf.Session()
        curriculum.restart()

        # Experience replay buffer; the beta annealing horizon defaults to
        # the total number of curriculum steps when unset.
        beta_iters = learning_params.prioritized_replay_beta_iters
        if beta_iters is None:
            beta_iters = curriculum.total_steps
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size,
            learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            beta_iters)

        options, option2file = get_options_rm(tester)

        # Getting num inputs and outputs net.
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        # Initializing the meta controllers (one metacontroller per task).
        reward_machines = tester.get_reward_machines()
        meta_controllers = []
        for rm_id, rm in enumerate(reward_machines):
            num_states = len(rm.get_states())
            policy_name = "Reward_Machine_%d" % rm_id
            meta_controllers.append(
                MetaController(sess, policy_name, options, option2file, rm,
                               use_rm, learning_params, num_features,
                               num_states, show_print))

        # Initializing the bank of policies with one policy per option.
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params, options)

        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(),
                      "from", curriculum.total_steps)
            rm_file = curriculum.get_next_task()
            # Running 'rm_file' for one episode
            run_hrl_baseline(sess, rm_file, meta_controllers, options,
                             policy_bank, tester, curriculum, replay_buffer,
                             beta_schedule, show_print)

        # Save session (craftworld checkpoints are split per map id).
        game_type = str(task_aux.params.game_type)
        if task_aux.params.game_type == "craftworld":
            save_model_path = ('../model/' + game_type + '/'
                               + task_aux.game.get_map_id()
                               + '/' + str(alg_name))
        else:
            save_model_path = '../model/' + game_type + '/' + str(alg_name)
        print("Saving model to {} ...".format(save_model_path))
        saver = tf.train.Saver()
        saver.save(sess, save_model_path)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        json_saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")
def run_options_save_model(alg_name, tester, curriculum, num_times,
                           show_print):
    """Train the options baseline `num_times` times, checkpointing each run.

    Options are cycled round-robin, one option per episode. After training,
    the model is saved under ../model/<game_type>/ (craftworld additionally
    gets a per-map subfolder, matching the other *_save_model functions).
    """
    json_saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params
    time_init = time.time()

    for t in range(num_times):
        random.seed(t)
        sess = tf.Session()
        curriculum.restart()

        # Experience replay buffer; the beta annealing horizon defaults to
        # the total number of curriculum steps when unset.
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size,
            learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            curriculum.total_steps
            if learning_params.prioritized_replay_beta_iters is None
            else learning_params.prioritized_replay_beta_iters)

        options, option2file = get_options_rm(tester)
        curr_option_id = 0

        # Getting num inputs and outputs net.
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        # Initializing the bank of policies with one policy per option.
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params, options)

        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(),
                      "from", curriculum.total_steps)
            rm_file = curriculum.get_next_task()
            # Running 'curr_option' for one episode.
            # NOTE(review): the id is advanced before use, so the first
            # episode runs option 1 (not 0) — confirm this is intended.
            curr_option_id = (curr_option_id + 1) % len(options)
            run_options_task(sess, rm_file, curr_option_id, options,
                             option2file, policy_bank, tester, curriculum,
                             replay_buffer, beta_schedule, show_print)

        # Save session. Fixed: the original branched on
        # `game_type != "officeworld"`, which would call get_map_id() for any
        # non-office game type; every sibling *_save_model function special-
        # cases craftworld only, so match that convention.
        game_type = str(task_aux.params.game_type)
        if task_aux.params.game_type == "craftworld":
            save_model_path = ('../model/' + game_type + '/'
                               + task_aux.game.get_map_id()
                               + '/' + str(alg_name))
        else:
            save_model_path = '../model/' + game_type + '/' + str(alg_name)
        print("Saving model to {} ...".format(save_model_path))
        saver = tf.train.Saver()
        saver.save(sess, save_model_path)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        json_saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")