def __init__(self, env, rm_files):
    """
    RM environment
    --------------------
    It adds a set of RMs to the environment:
        - Every episode, the agent has to solve a different RM task
        - This code keeps track of the current state on the current RM task
        - The id of the RM state is appended to the observations
        - The reward given to the agent comes from the RM

    Parameters
    --------------------
        - env: original environment. It must implement the following function:
            - get_events(...): Returns the propositions that currently hold on the environment.
        - rm_files: list of strings with paths to the RM files.
    """
    super().__init__(env)

    # Loading the reward machines
    self.rm_files = rm_files
    self.reward_machines = []
    self.num_rm_states = 0
    for rm_file in rm_files:
        rm = RewardMachine(rm_file)
        self.num_rm_states += len(rm.get_states())
        self.reward_machines.append(rm)
    self.num_rms = len(self.reward_machines)

    # The observation space is a dictionary including the env features and a
    # one-hot representation of the state in the reward machine
    self.observation_dict = spaces.Dict({
        'features': env.observation_space,
        'rm-state': spaces.Box(low=0, high=1, shape=(self.num_rm_states,), dtype=np.uint8)
    })
    flatdim = gym.spaces.flatdim(self.observation_dict)
    s_low = float(env.observation_space.low[0])
    s_high = float(env.observation_space.high[0])
    self.observation_space = spaces.Box(low=s_low, high=s_high, shape=(flatdim,), dtype=np.float32)

    # Computing one-hot encodings for the non-terminal RM states
    self.rm_state_features = {}
    for rm_id, rm in enumerate(self.reward_machines):
        for u_id in rm.get_states():
            u_features = np.zeros(self.num_rm_states)
            u_features[len(self.rm_state_features)] = 1
            self.rm_state_features[(rm_id, u_id)] = u_features
    # for terminal RM states, we give as features an array of zeros
    self.rm_done_feat = np.zeros(self.num_rm_states)

    # Selecting the current RM task
    self.current_rm_id = -1
    self.current_rm = None
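# Hypothetical helper, not part of the original class: a minimal sketch of how the
# flattened observation described in the docstring could be assembled, assuming the
# attributes initialized in __init__ above. It concatenates the env features with the
# one-hot encoding of the current RM state (or the all-zeros vector on terminal states).
def get_observation(self, features, rm_id, u_id, done):
    rm_feat = self.rm_done_feat if done else self.rm_state_features[(rm_id, u_id)]
    obs = {'features': features, 'rm-state': rm_feat}
    # gym.spaces.flatten turns the Dict sample into a flat vector matching observation_space
    return gym.spaces.flatten(self.observation_dict, obs)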
def step(self, action):
    # First check that the action is valid: map keyboard keys to environment actions
    str_to_action = {
        "w": Actions.up.value,
        "d": Actions.right.value,
        "s": Actions.down.value,
        "a": Actions.left.value
    }
    # path to the text file that sets up the reward machine
    t = "/home/adiojha629/drone_research_summer2020/officeworld_gym/gym-officeworld/gym_officeworld/envs/reward_machines/t1.txt"
    self.rm = RewardMachine(t)  # create the reward machine
    u1 = self.rm.get_initial_state()
    s1 = self.get_state()
    if action in str_to_action:
        self.execute_action(str_to_action[action])
    events = self.get_true_propositions()  # get the propositions that currently hold in the game
    u2 = self.rm.get_next_state(u1, events)  # get the next RM state
    s2 = self.get_state()
    # use the reward machine to generate the reward for this transition
    r = self.rm.get_reward(u1, u2, s1, action, s2)
    reward, next_state = self.rm.get_rewards_and_next_states(s1, action, s2, events)
    # the episode is over if the game is over or we reached a terminal RM state
    boolean_episode_done = self.env_game_over or self.rm.is_terminal_state(u2)
    additional_information = {}  # gym expects an info dict here
    return next_state, reward, boolean_episode_done, additional_information
def load_options_model_test_composition(alg_name, tester, curriculum, num_times, new_task, show_print):
    learning_params = tester.learning_params

    for n in range(num_times):
        random.seed(n)
        sess = tf.Session()
        curriculum.restart()

        options, option2file = get_options_rm(tester)
        curr_option_id = 0

        # getting num inputs and outputs net
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        # initializing the bank of policies with one policy per option
        policy_bank = PolicyBankDQN(sess, num_actions, num_features, learning_params, options)

        # Load the model
        saver = tf.train.Saver()
        # Get path
        if task_aux.params.game_type != "officeworld":
            save_model_path = '../model/' + str(task_aux.params.game_type) + '/' + task_aux.game.get_map_id()
        else:
            save_model_path = '../model/' + str(task_aux.params.game_type)
        saver.restore(sess, tf.train.latest_checkpoint(save_model_path))

        reward_machines = tester.get_reward_machines()
        print("Loaded {} policies (options)".format(policy_bank.get_number_of_policies()))

        new_task_rm = RewardMachine(new_task.rm_file)
        linearized_plans = new_task.get_linearized_plan()
        print("There are {} possible linearized plans: {}".format(len(linearized_plans), linearized_plans))

        least_cost = float('inf')
        best_policy = []  # linearized plan
        best_reward = 0
        for i, curr_plan in enumerate(linearized_plans):
            cost, r_total = execute_plan_get_cost(curr_plan, tester, curriculum, options, option2file,
                                                  policy_bank, new_task_rm)
            if cost < least_cost:
                least_cost = cost
                best_policy = curr_plan
                best_reward = r_total

        print("Rewards", best_reward)
        print("Steps", least_cost)
        print(best_policy)
def __init__(self, learning_params, testing_params, experiment, result_file=None):
    if result_file is None:
        # in this case, we are running a new experiment
        self.learning_params = learning_params
        self.testing_params = testing_params

        # Reading the experiment file
        self.experiment = experiment
        f = open(experiment)
        lines = [l.rstrip() for l in f]
        f.close()

        # setting the right world environment
        self.game_type = eval(lines[0])
        if self.game_type == "officeworld":
            self.world = TesterOfficeWorld(experiment, learning_params.gamma)
        if self.game_type == "craftworld":
            self.world = TesterCraftWorld(experiment, learning_params.tabular_case, learning_params.gamma)
        if self.game_type == "waterworld":
            self.world = TesterWaterWorld(experiment, learning_params.use_random_maps)

        # Creating the reward machines for each task
        self.reward_machines = []
        self.file_to_reward_machine = {}
        rm_files = self.world.get_reward_machine_files()
        for i in range(len(rm_files)):
            rm_file = rm_files[i]
            self.file_to_reward_machine[rm_file] = i
            self.reward_machines.append(RewardMachine(rm_file))

        # I store the results here
        self.results = {}
        self.steps = []
        aux_tasks = self.get_task_specifications()
        for i in range(len(aux_tasks)):
            t_str = str(aux_tasks[i])
            self.results[t_str] = {}
    else:
        # In this case, we load the results that were precomputed in a previous run
        data = read_json(result_file)
        self.game_type = data['game_type']
        if self.game_type == "craftworld":
            self.world = TesterCraftWorld(None, None, None, data['world'])
        if self.game_type == "waterworld":
            self.world = TesterWaterWorld(None, None, data['world'])
        if self.game_type == "officeworld":
            self.world = TesterOfficeWorld(None, None, data['world'])
        self.results = data['results']
        self.steps = data['steps']

        # Note: json turns the integer keys of 'results' into strings,
        # so I'm changing the 'steps' to strings as well
        for i in range(len(self.steps)):
            self.steps[i] = str(self.steps[i])
def get_options_rm(tester):
    # Loading options for this experiment
    option_folder = "../experiments/%s/options/" % tester.get_world_name()

    options = []  # NOTE: The policy bank also uses this list (in the same order)
    option2file = []
    for option_file in _get_option_files(option_folder):
        # NOTE: The option id indicates what the option does (e.g. "a&!n")
        option = RewardMachine(join(option_folder, option_file + ".txt"))
        options.append(option)
        option2file.append(option_file)

    return options, option2file
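# A minimal sketch of what _get_option_files is assumed to do here (the real helper lives
# elsewhere in the repo): return the option file basenames, without the ".txt" extension,
# found in option_folder, so that the option id (e.g. "a&!n") doubles as the file name
# used by RewardMachine(join(option_folder, option_file + ".txt")) above.
import os

def _get_option_files(option_folder):
    return sorted(f[:-4] for f in os.listdir(option_folder) if f.endswith(".txt"))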
def play(params, task, max_time):
    from reward_machines.reward_machine import RewardMachine

    # commands
    str_to_action = {
        "w": Actions.up.value,
        "d": Actions.right.value,
        "s": Actions.down.value,
        "a": Actions.left.value
    }

    # play the game!
    game = CraftWorld(params)
    rm = RewardMachine(task)
    s1 = game.get_state()
    u1 = rm.get_initial_state()
    reward = 0  # last reward obtained (returned at the end)
    for t in range(max_time):
        # Showing game
        game.show_map()
        print("Events:", game.get_true_propositions())
        print("Features:", game.get_features())
        print("Features.shape:", game.get_features().shape)
        print("Features.manhattan_distance:", game._get_features_manhattan_distance())
        acts = game.get_actions()
        # Getting action
        print("\nAction? ", end="")
        a = input()
        print()
        # Executing action
        if a in str_to_action and str_to_action[a] in acts:
            game.execute_action(str_to_action[a])

            s2 = game.get_state()
            events = game.get_true_propositions()
            u2 = rm.get_next_state(u1, events)
            reward = rm.get_reward(u1, u2, s1, a, s2)

            if game.env_game_over or rm.is_terminal_state(u2):
                # Game Over
                print("Game Over")
                break

            s1, u1 = s2, u2
        else:
            print("Forbidden action")
    game.show_map()
    return reward
def get_qrm_generalization_performance(alg_name, tester, curriculum, num_times, new_tasks, show_print):
    """
    Testing all the tasks in new_tasks and returning the success rate and cumulative reward
    """
    sess = tf.Session()
    curriculum.restart()

    # Initialize a policy_bank graph to be loaded with the saved model
    task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
    num_features = len(task_aux.get_features())
    num_actions = len(task_aux.get_actions())
    policy_bank = PolicyBankDQN(sess, num_actions, num_features, tester.learning_params,
                                tester.get_reward_machines())

    # Load the model
    saver = tf.train.Saver()
    # Get path
    if task_aux.params.game_type == "craftworld":
        save_model_path = '../model/' + str(task_aux.params.game_type) + '/' + task_aux.game.get_map_id()
    else:
        save_model_path = '../model/' + str(task_aux.params.game_type)
    saver.restore(sess, tf.train.latest_checkpoint(save_model_path))

    reward_machines = tester.get_reward_machines()
    print("Loaded {} policies (RMs)".format(len(reward_machines)))

    success_count = 0
    all_task_rewards = []
    for new_task in new_tasks:
        # partially ordered RM of the new task
        new_task_rm = RewardMachine(new_task.rm_file)
        linearized_plans = new_task.get_linearized_plan()
        print("There are {} possible linearized plans: {}".format(len(linearized_plans), linearized_plans))

        least_cost = float('inf')
        best_policy = []  # list of (rm_id, state_id) corresponding to each action
        for i, curr_plan in enumerate(linearized_plans):
            # Get the least-cost path for the current linearized plan
            cost, switching_seq = dfs_search_policy(curr_plan, tester, curriculum, new_task_rm,
                                                    reward_machines, policy_bank, bound=least_cost)
            if cost < least_cost:
                print(cost, switching_seq)
                least_cost = cost
                best_policy = switching_seq
                # finding the optimal plan takes too long, so end early once a solution is found
                break

        # Couldn't solve the task
        if least_cost == np.inf:
            print("Failed to execute this task: {}".format(new_task))
            r_total = 0.0
            all_task_rewards.append(r_total)
            continue

        # Execute the best policy
        print("Executing Best Policy...{} ({} steps)".format(best_policy, least_cost))
        task = Game(tester.get_task_params(curriculum.get_current_task()))
        new_task_u1 = new_task_rm.get_initial_state()
        s1, s1_features = task.get_state_and_features()
        r_total = 0
        curr_policy = None
        for t in range(int(least_cost)):
            if show_print:
                task.render()
            if curr_policy is None:
                curr_policy = best_policy.pop(0)
                curr_policy_rm = reward_machines[curr_policy[0]]
            a = policy_bank.get_best_action(curr_policy[0], curr_policy[1],
                                            s1_features.reshape((1, num_features)), add_noise=False)
            task.execute_action(a)
            s2, s2_features = task.get_state_and_features()

            new_task_u2 = new_task_rm.get_next_state(new_task_u1, task.get_true_propositions())
            curr_policy_u2 = curr_policy_rm.get_next_state(curr_policy[1], task.get_true_propositions())
            desired_next_state = curr_policy_rm.get_next_state(curr_policy[1], curr_policy[2])
            if curr_policy_u2 == desired_next_state:
                logger.info("EXECUTED ACTION {}, SWITCHING POLICIES".format(curr_policy[2]))
                curr_policy = None

            r = new_task_rm.get_reward(new_task_u1, new_task_u2, s1, a, s2)
            r_total += r * tester.learning_params.gamma ** t

            s1, s1_features = s2, s2_features
            new_task_u1 = new_task_u2
        if show_print:
            task.render()

        print("Rewards:", r_total)
        all_task_rewards.append(r_total)
        if r_total > 0:
            success_count += 1

    success_rate = float(success_count) / len(new_tasks)
    acc_reward = sum(all_task_rewards)
    print(all_task_rewards)
    return success_rate, acc_reward
def play():
    from reward_machines.reward_machine import RewardMachine

    # commands
    str_to_action = {
        "w": Actions.up.value,
        "d": Actions.right.value,
        "s": Actions.down.value,
        "a": Actions.left.value
    }
    params = OfficeWorldParams()

    # play the game!
    tasks = ["../../experiments/office/reward_machines/t%d.txt" % i for i in [1, 2, 3, 4]]
    reward_machines = []
    for t in tasks:
        reward_machines.append(RewardMachine(t))
    for i in range(len(tasks)):
        print("Running", tasks[i])

        game = OfficeWorld(params)   # setting the environment
        rm = reward_machines[i]      # setting the reward machine
        s1 = game.get_state()
        u1 = rm.get_initial_state()
        while True:
            # Showing game
            game.show()
            print("Events:", game.get_true_propositions())
            # print(game.getLTLGoal())
            # Getting action
            print("u:", u1)
            print("\nAction? ", end="")
            a = input()
            print()
            # Executing action
            if a in str_to_action:
                game.execute_action(str_to_action[a])

                # Getting new state and truth valuation
                s2 = game.get_state()
                events = game.get_true_propositions()
                u2 = rm.get_next_state(u1, events)
                r = rm.get_reward(u1, u2, s1, a, s2)

                # Getting rewards and next states for each reward machine
                rewards, next_states = [], []
                for j in range(len(reward_machines)):
                    j_rewards, j_next_states = reward_machines[j].get_rewards_and_next_states(s1, a, s2, events)
                    rewards.append(j_rewards)
                    next_states.append(j_next_states)

                print("---------------------")
                print("Rewards:", rewards)
                print("Next States:", next_states)
                print("Reward:", r)
                print("---------------------")

                if game.env_game_over or rm.is_terminal_state(u2):
                    # Game Over
                    break

                s1 = s2
                u1 = u2
            else:
                print("Forbidden action")
        game.show()
        print("Events:", game.get_true_propositions())
def play():
    import pygame, time
    from reward_machines.reward_machine import RewardMachine
    from tester.tester import Tester
    from tester.tester_params import TestingParameters
    from qrm.learning_params import LearningParameters

    # hack: moving one directory up (to keep relative references to ./src)
    import os
    os.chdir("../")

    tester = Tester(LearningParameters(), TestingParameters(), "../experiments/water/tests/water_7.txt")
    if tester is None:
        task = "../experiments/water/reward_machines/t1.txt"
        state_file = "../experiments/water/maps/world_0.pkl"
        max_x = 400
        max_y = 400
        b_num_per_color = 2
        b_radius = 15
        use_velocities = True
        ball_disappear = False
        params = WaterWorldParams(state_file, b_radius=b_radius, max_x=max_x, max_y=max_y,
                                  b_num_per_color=b_num_per_color, use_velocities=use_velocities,
                                  ball_disappear=ball_disappear)
    else:
        task = tester.get_task_rms()[-2]
        params = tester.get_task_params(task).game_params
    max_x, max_y = params.max_x, params.max_y

    game = WaterWorld(params)
    rm = RewardMachine(task)
    s1 = game.get_state()
    u1 = rm.get_initial_state()

    print("actions", game.get_actions())

    pygame.init()
    black = (0, 0, 0)
    white = (255, 255, 255)
    colors = get_colors()

    gameDisplay = pygame.display.set_mode((max_x, max_y))
    pygame.display.set_caption('Water world :)')
    clock = pygame.time.Clock()
    crashed = False

    t_previous = time.time()
    actions = set()
    while not crashed:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                crashed = True
            if event.type == pygame.KEYUP:
                if Actions.left in actions and event.key == pygame.K_LEFT:
                    actions.remove(Actions.left)
                if Actions.right in actions and event.key == pygame.K_RIGHT:
                    actions.remove(Actions.right)
                if Actions.up in actions and event.key == pygame.K_UP:
                    actions.remove(Actions.up)
                if Actions.down in actions and event.key == pygame.K_DOWN:
                    actions.remove(Actions.down)
            if event.type == pygame.KEYDOWN:
                if event.key == pygame.K_LEFT:
                    actions.add(Actions.left)
                if event.key == pygame.K_RIGHT:
                    actions.add(Actions.right)
                if event.key == pygame.K_UP:
                    actions.add(Actions.up)
                if event.key == pygame.K_DOWN:
                    actions.add(Actions.down)

        t_current = time.time()
        t_delta = (t_current - t_previous)

        # Getting the action
        if len(actions) == 0:
            a = Actions.none
        else:
            a = random.choice(list(actions))

        # Executing the action
        game.execute_action(a.value, t_delta)

        s2 = game.get_state()
        events = game.get_true_propositions()
        u2 = rm.get_next_state(u1, events)
        reward = rm.get_reward(u1, u2, s1, a, s2)

        # printing image
        gameDisplay.fill(white)
        for b in game.balls:
            draw_ball(b, colors, 0, gameDisplay, pygame, max_y)
        draw_ball(game.agent, colors, 3, gameDisplay, pygame, max_y)
        pygame.display.update()
        clock.tick(20)

        # print info related to the task
        if reward > 0:
            print("REWARD!! ----------------!------------!")
        if rm.is_terminal_state(u2):
            print("Machine state:", u2, "(terminal)")
        else:
            print("Machine state:", u2)

        t_previous = t_current
        s1, u1 = s2, u2
    pygame.quit()
def run_lrm(env_params, lp, rl):
    """
    This code learns a reward machine from experience and uses DQN to learn an optimal policy for that RM:
        - 'env_params' is the environment parameters
        - 'lp' is the set of learning parameters
        - 'rl' is the RL approach used to learn the policy ("dqn" or "qrm")
    Returns the training rewards
    """
    # Initializing parameters and the game
    env = Game(env_params)
    rm = RewardMachine(lp.rm_u_max, lp.rm_preprocess, lp.rm_tabu_size, lp.rm_workers,
                       lp.rm_lr_steps, env.get_perfect_rm(), lp.use_perfect_rm)
    actions = env.get_actions()
    policy = None
    train_rewards = []
    rm_scores = []
    reward_total = 0
    last_reward = 0
    step = 0

    # Collecting random traces for learning the reward machine
    print("Collecting random traces...")
    while step < lp.rm_init_steps:
        # running an episode using a random policy
        env.restart()
        trace = [(env.get_events(), 0.0)]
        for _ in range(lp.episode_horizon):
            # executing a random action
            a = random.choice(actions)
            reward, done = env.execute_action(a)
            o2_events = env.get_events()
            reward_total += reward
            trace.append((o2_events, reward))
            step += 1
            # Testing
            if step % lp.test_freq == 0:
                print("Step: %d\tTrain: %0.1f" % (step, reward_total - last_reward))
                train_rewards.append((step, reward_total - last_reward))
                last_reward = reward_total
            # checking if the episode finishes
            if done or lp.rm_init_steps <= step:
                if done:
                    rm.add_terminal_observations(o2_events)
                break
        # adding this trace to the set of traces that we use to learn the rm
        rm.add_trace(trace)

    # Learning the reward machine using the collected traces
    print("Learning a reward machine...")
    _, info = rm.learn_the_reward_machine()
    rm_scores.append((step, ) + info)

    # Start learning a policy for the current rm
    finish_learning = False
    while step < lp.train_steps and not finish_learning:
        env.restart()
        o1_events = env.get_events()
        o1_features = env.get_features()
        u1 = rm.get_initial_state()
        trace = [(o1_events, 0.0)]
        add_trace = False

        for _ in range(lp.episode_horizon):
            # reinitializing the policy if the rm changed
            if policy is None:
                print("Learning a policy for the current RM...")
                if rl == "dqn":
                    policy = DQN(lp, len(o1_features), len(actions), rm)
                elif rl == "qrm":
                    policy = QRM(lp, len(o1_features), len(actions), rm)
                else:
                    assert False, "RL approach is not supported yet"

            # selecting an action using epsilon greedy
            a = policy.get_best_action(o1_features, u1, lp.epsilon)

            # executing the selected action
            reward, done = env.execute_action(a)
            o2_events = env.get_events()
            o2_features = env.get_features()
            u2 = rm.get_next_state(u1, o2_events)

            # updating the number of steps and total reward
            trace.append((o2_events, reward))
            reward_total += reward
            step += 1

            # updating the current RM if needed
            rm.update_rewards(u1, o2_events, reward)
            if done:
                rm.add_terminal_observations(o2_events)
            if rm.is_observation_impossible(u1, o1_events, o2_events):
                # if o2 is impossible according to the current RM,
                # then the RM has a bug and must be relearned
                add_trace = True

            # Saving this transition
            policy.add_experience(o1_events, o1_features, u1, a, reward, o2_events, o2_features, u2, float(done))

            # Learning and updating the target networks (if needed)
            policy.learn_if_needed()

            # Testing
            if step % lp.test_freq == 0:
                print("Step: %d\tTrain: %0.1f" % (step, reward_total - last_reward))
                train_rewards.append((step, reward_total - last_reward))
                last_reward = reward_total
                # finishing the experiment if the max number of learning steps was reached
                if policy._get_step() > lp.max_learning_steps:
                    finish_learning = True

            # checking if the episode finishes or the agent reaches the maximum number of training steps
            if done or lp.train_steps <= step or finish_learning:
                break

            # Moving to the next state
            o1_events, o1_features, u1 = o2_events, o2_features, u2

        # If the trace isn't correctly predicted by the reward machine,
        # we add the trace and relearn the machine
        if add_trace and step < lp.train_steps and not finish_learning:
            print("Relearning the reward machine...")
            rm.add_trace(trace)
            same_rm, info = rm.learn_the_reward_machine()
            rm_scores.append((step, ) + info)
            if not same_rm:
                # if the RM changed, we have to relearn all the q-values...
                policy.close()
                policy = None
            else:
                print("the new RM is not better than the current RM!!")
                # input()

    if policy is not None:
        policy.close()
        policy = None

    # return the training rewards
    return train_rewards, rm_scores, rm.get_info()
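# Illustrative sketch only (not from the original code): the learning-parameter fields that
# run_lrm reads from 'lp', collected in a plain namespace. The field names come from the
# function body above; the values and inline descriptions are placeholder assumptions.
from types import SimpleNamespace

lp_sketch = SimpleNamespace(
    rm_u_max=10,                 # maximum number of RM states to consider (assumed meaning)
    rm_preprocess=True,          # whether to preprocess traces before RM learning (assumed)
    rm_tabu_size=10000,          # tabu list size for the RM local search (assumed)
    rm_workers=8,                # parallel workers for RM learning (assumed)
    rm_lr_steps=100,             # local-search steps per RM-learning call (assumed)
    use_perfect_rm=False,        # learn the RM instead of using env.get_perfect_rm()
    rm_init_steps=200000,        # random steps used to collect the initial traces
    episode_horizon=5000,        # maximum steps per episode
    train_steps=2000000,         # total environment steps
    max_learning_steps=2000000,  # cap on policy update steps (checked via policy._get_step())
    test_freq=10000,             # reporting frequency, in steps
    epsilon=0.1,                 # epsilon-greedy exploration
)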
def rm_net_to_reward_machine(rm_net, world, strict=False):
    rm = RewardMachine()
    node2id = dict()
    for i, node in enumerate(rm_net.nodes()):
        rm.add_state(i)
        node2id[node] = i

    for node in rm_net.nodes():
        # no parent, initial state
        if len(list(rm_net.predecessors(node))) == 0:
            rm.set_initial_state(node2id[node])

        selfloop = ['!{}'.format(e) for e in get_all_events(world)] if strict else []
        for child in rm_net.successors(node):
            action = rm_net.get_edge_data(node, child)['attr']
            event_prop = action_to_prop(str(action), world)
            if event_prop in selfloop:
                selfloop.pop(selfloop.index(event_prop))
            else:
                if not strict:
                    selfloop.append('!' + str(event_prop))

            reward = 0
            if len(list(rm_net.successors(child))) == 0:
                # child is terminal, gets reward 1
                reward = 1
            rm.add_transition(node2id[node], node2id[child], event_prop, ConstantRewardFunction(reward))

        # add self loop
        if len(list(rm_net.successors(node))) == 0:
            # no children, terminal state
            rm.set_terminal_state(node2id[node])
        else:
            rm.add_transition(node2id[node], node2id[node], '&'.join(selfloop), ConstantRewardFunction(0))
    return rm
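# A minimal sketch of the rm_net format the function above expects, assuming a networkx
# DiGraph whose edges carry the plan action under the 'attr' key. The node names and edge
# actions here are made up for illustration, and 'world' is whatever object the helpers
# action_to_prop/get_all_events expect, so the conversion call is left commented out.
import networkx as nx

rm_net = nx.DiGraph()
rm_net.add_edge("root", "got_coffee", attr="get-coffee")        # hypothetical action label
rm_net.add_edge("got_coffee", "delivered", attr="deliver-coffee")
# rm = rm_net_to_reward_machine(rm_net, world)  # root -> initial state, leaf edge -> reward 1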
def get_hrl_generalization_performance(alg_name, tester, curriculum, num_times, new_tasks, show_print, use_rm):
    learning_params = tester.learning_params

    sess = tf.Session()
    curriculum.restart()

    options, option2file = get_options_rm(tester)

    # getting num inputs and outputs net
    task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
    num_features = len(task_aux.get_features())
    num_actions = len(task_aux.get_actions())

    # initializing the meta controllers (one metacontroller per task)
    meta_controllers = []
    reward_machines = tester.get_reward_machines()
    for i in range(len(reward_machines)):
        rm = reward_machines[i]
        num_states = len(rm.get_states())
        policy_name = "Reward_Machine_%d" % i
        mc = MetaController(sess, policy_name, options, option2file, rm, use_rm, learning_params,
                            num_features, num_states, show_print)
        meta_controllers.append(mc)

    # initializing the bank of policies with one policy per option
    policy_bank = PolicyBankDQN(sess, num_actions, num_features, learning_params, options)

    # Load the model
    saver = tf.train.Saver()
    # Get path
    if task_aux.params.game_type == "craftworld":
        save_model_path = '../model/' + str(task_aux.params.game_type) + '/' + task_aux.game.get_map_id()
    else:
        save_model_path = '../model/' + str(task_aux.params.game_type)
    saver.restore(sess, tf.train.latest_checkpoint(save_model_path))

    reward_machines = tester.get_reward_machines()
    print("Loaded {} policies (options)".format(policy_bank.get_number_of_policies()))

    success_count = 0
    all_task_rewards = []
    for new_task in new_tasks:
        new_task_rm = RewardMachine(new_task.rm_file)
        linearized_plans = new_task.get_linearized_plan()
        print("There are {} possible linearized plans: {}".format(len(linearized_plans), linearized_plans))

        least_cost = float('inf')
        best_policy = []  # linearized plan
        best_reward = 0
        for i, curr_plan in enumerate(linearized_plans):
            cost, r_total = execute_plan_get_cost(curr_plan, tester, curriculum, options, option2file,
                                                  policy_bank, new_task_rm)
            if cost < least_cost:
                print("Step:", cost)
                least_cost = cost
                best_policy = curr_plan
                best_reward = r_total
                if r_total > 0:
                    success_count += 1
                all_task_rewards.append(r_total)
                # end early if successfully finished task
                break

        if least_cost == np.inf:
            print("Failed to execute this task: {}".format(new_task))
            continue

    success_rate = float(success_count) / len(new_tasks)
    acc_reward = sum(all_task_rewards)
    print(all_task_rewards)
    return success_rate, acc_reward
def play():
    from tester.tester import Tester
    from tester.tester_params import TestingParameters
    from qrm.learning_params import LearningParameters
    from reward_machines.reward_machine import RewardMachine

    import os
    os.chdir("../")
    tester = Tester(LearningParameters(), TestingParameters(), "../experiments/mouse/tests/mouse_0.txt")

    task = tester.get_task_rms()[1]
    params = tester.get_task_params(task).game_params
    max_x = params.max_x
    max_y = params.max_y

    game = MouseWorld(params)
    rm = RewardMachine(task)
    s1 = game.get_state()
    u1 = rm.get_initial_state()

    pygame.init()
    gameDisplay = pygame.display.set_mode((max_x, max_y))
    pygame.display.set_caption('Fake Keyboard')
    clock = pygame.time.Clock()
    crashed = False

    t_previous = time.time()
    actions = set()
    while not crashed:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                crashed = True
            if event.type == pygame.KEYUP:
                if Actions.left in actions and event.key == pygame.K_LEFT:
                    actions.remove(Actions.left)
                if Actions.right in actions and event.key == pygame.K_RIGHT:
                    actions.remove(Actions.right)
                if Actions.up in actions and event.key == pygame.K_UP:
                    actions.remove(Actions.up)
                if Actions.down in actions and event.key == pygame.K_DOWN:
                    actions.remove(Actions.down)
                if Actions.jump in actions and event.key == pygame.K_SPACE:
                    actions.remove(Actions.jump)
            if event.type == pygame.KEYDOWN:
                if event.key == pygame.K_LEFT:
                    actions.add(Actions.left)
                if event.key == pygame.K_RIGHT:
                    actions.add(Actions.right)
                if event.key == pygame.K_UP:
                    actions.add(Actions.up)
                if event.key == pygame.K_DOWN:
                    actions.add(Actions.down)
                if event.key == pygame.K_SPACE:
                    actions.add(Actions.jump)

        t_current = time.time()
        t_delta = (t_current - t_previous)

        if len(actions) == 0:
            a = Actions.none
        else:
            a = random.choice(list(actions))

        # Executing the action
        game.execute_action(a.value, t_delta)

        s2 = game.get_state()
        events = game.get_true_propositions()
        u2 = rm.get_next_state(u1, events)
        reward = rm.get_reward(u1, u2, s1, a, s2)

        if reward > 0:
            print("REWARD ", reward)
        if rm.is_terminal_state(u2):
            print("Machine state:", u2, "(terminal)")
        else:
            print("Machine state:", u2)

        # Printing Image
        gameDisplay.fill(Colors.WHITE.value)
        for k in game.keyboard_keys:
            k.draw_on_display(gameDisplay)
        game.agent.draw_on_display(gameDisplay)
        game.draw_current_text_on_display(gameDisplay)

        pygame.display.update()
        clock.tick(20)

        t_previous = t_current
        s1, u1 = s2, u2
    pygame.quit()
def update_hypothesis_machine(self):
    self.hypothesis_machine = RewardMachine(self.hypothesis_machine_file)
def load_model_and_test_composition(alg_name, tester, curriculum, num_times, new_task, show_print):
    """
    Testing a single task (see run_new_task.py)
    TODO: refactor with get_qrm_generalization_performance
    """
    for n in range(num_times):
        random.seed(n)
        sess = tf.Session()
        curriculum.restart()

        # Initialize a policy_bank graph to be loaded with saved model
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())
        policy_bank = PolicyBankDQN(sess, num_actions, num_features, tester.learning_params,
                                    tester.get_reward_machines())

        # Load the model
        saver = tf.train.Saver()
        # Get path
        if task_aux.params.game_type == "craftworld":
            save_model_path = '../model/' + str(task_aux.params.game_type) + '/' + task_aux.game.get_map_id()
        else:
            save_model_path = '../model/' + str(task_aux.params.game_type)
        saver.restore(sess, tf.train.latest_checkpoint(save_model_path))

        reward_machines = tester.get_reward_machines()
        print("Loaded {} policies (RMs)".format(len(reward_machines)))

        # partial-ordered RM of new task
        new_task_rm = RewardMachine(new_task.rm_file)
        linearized_plans = new_task.get_linearized_plan()
        print("There are {} possible linearized plans: {}".format(len(linearized_plans), linearized_plans))

        least_cost = float('inf')
        best_policy = []  # list of (rm_id, state_id) corresponding to each action
        for i, curr_plan in enumerate(linearized_plans):
            # Get the least cost path for the current linearized plan
            # cost, switching_seq = search_policy(curr_plan, tester, curriculum, new_task_rm, reward_machines,
            #                                     policy_bank, bound=least_cost)
            cost, switching_seq = dfs_search_policy(curr_plan, tester, curriculum, new_task_rm,
                                                    reward_machines, policy_bank, bound=least_cost)
            if cost < least_cost:
                print(cost, switching_seq)
                least_cost = cost
                best_policy = switching_seq

        # Execute the best policy
        print("Executing Best Policy...{} ({} steps)".format(best_policy, least_cost))
        task = Game(tester.get_task_params(curriculum.get_current_task()))
        new_task_u1 = new_task_rm.get_initial_state()
        s1, s1_features = task.get_state_and_features()
        r_total = 0
        curr_policy = None
        for t in range(int(least_cost)):
            if show_print:
                task.render()
            if curr_policy is None:
                curr_policy = best_policy.pop(0)
                curr_policy_rm = reward_machines[curr_policy[0]]
            a = policy_bank.get_best_action(curr_policy[0], curr_policy[1],
                                            s1_features.reshape((1, num_features)), add_noise=False)
            if show_print:
                print("Action:", Actions(a))
            task.execute_action(a)
            s2, s2_features = task.get_state_and_features()

            new_task_u2 = new_task_rm.get_next_state(new_task_u1, task.get_true_propositions())
            curr_policy_u2 = curr_policy_rm.get_next_state(curr_policy[1], task.get_true_propositions())
            desired_next_state = curr_policy_rm.get_next_state(curr_policy[1], curr_policy[2])
            if curr_policy_u2 == desired_next_state:
                logger.info("EXECUTED ACTION {}, SWITCHING POLICIES".format(curr_policy[2]))
                curr_policy = None

            r = new_task_rm.get_reward(new_task_u1, new_task_u2, s1, a, s2)
            r_total += r * tester.learning_params.gamma ** t

            s1, s1_features = s2, s2_features
            new_task_u1 = new_task_u2
        if show_print:
            task.render()

        print("Rewards:", r_total)
        return r_total
def compute_rm_from_graph(lm_graph, merge_init_nodes=True):
    """
    Method 1
    - Each non-init landmark corresponds to an RM (with a terminal state)
    - Each edge in an RM corresponds to the actions needed to take (ideally only one action for the nearest landmark)
    - The RM only reflects the necessary orderings, not the full partial order

    :param lm_graph: LandmarkGraph
    :param merge_init_nodes: bool
    :return: set of RewardMachine
    """
    if merge_init_nodes:
        lm_graph.merge_init_nodes()

    # For each landmark node that is not the initial state, create a RM for it
    reward_machines = set()
    for n_id, n in lm_graph.nodes.items():
        if not n.in_init():
            # initialize empty RewardMachine
            new_rm = RewardMachine()

            # populate the RewardMachine from the bottom up
            openlist = list([n])
            while len(openlist) != 0:
                curr_node = openlist.pop(0)
                # add the current node as a state (keyed by its own id, not the target landmark's)
                new_rm.add_state_with_landmarks(curr_node.id, copy.copy(curr_node))
                # look at parent landmarks that must be achieved before the current landmark
                for p_id in curr_node.parents:
                    # add a transition from the parent to the current state
                    reward = 0
                    if curr_node == n:
                        reward = 1
                        new_rm.set_terminal_state(curr_node.id)
                    new_rm.add_transition(p_id, curr_node.id, 'TODO', ConstantRewardFunction(reward))
                    openlist.append(lm_graph.nodes[p_id])
                if len(curr_node.parents) == 0:
                    # this is the initial state
                    new_rm.set_initial_state(curr_node.id)
                if len(curr_node.children) == 0:
                    # this is the terminal state
                    new_rm.set_terminal_state(curr_node.id)

            new_rm.get_txt_representation()
            reward_machines.add(new_rm)
    return reward_machines
def run_hrl_experiments(alg_name, tester, curriculum, num_times, show_print, use_rm):
    """
    NOTE: To implement this baseline, we encode each option as a reward machine with one transition
        - use_rm: Indicates whether to prune options using the reward machine
    """
    # Setting up the saver
    saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params

    # Running the tasks 'num_times'
    time_init = time.time()
    for t in range(num_times):
        # Setting the random seed to 't'
        random.seed(t)
        sess = tf.Session()

        # Resetting default values
        curriculum.restart()

        # Creating the experience replay buffer
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size,
            learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0,
            curriculum.total_steps if learning_params.prioritized_replay_beta_iters is None
            else learning_params.prioritized_replay_beta_iters)

        # Loading options for this experiment
        option_folder = "../experiments/%s/options/" % tester.get_world_name()

        options = []  # NOTE: The policy bank also uses this list (in the same order)
        option2file = []
        for option_file in _get_option_files(option_folder):
            # NOTE: The option id indicates what the option does (e.g. "a&!n")
            option = RewardMachine(join(option_folder, option_file + ".txt"))
            options.append(option)
            option2file.append(option_file)

        # getting num inputs and outputs net
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        # initializing the meta controllers (one metacontroller per task)
        meta_controllers = []
        reward_machines = tester.get_reward_machines()
        for i in range(len(reward_machines)):
            rm = reward_machines[i]
            num_states = len(rm.get_states())
            policy_name = "Reward_Machine_%d" % i
            mc = MetaController(sess, policy_name, options, option2file, rm, use_rm, learning_params,
                                num_features, num_states, show_print)
            meta_controllers.append(mc)

        # initializing the bank of policies with one policy per option
        policy_bank = PolicyBankDQN(sess, num_actions, num_features, learning_params, options)

        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(), "from", curriculum.total_steps)
            rm_file = curriculum.get_next_task()
            # Running 'rm_file' for one episode
            run_hrl_baseline(sess, rm_file, meta_controllers, options, policy_bank, tester, curriculum,
                             replay_buffer, beta_schedule, show_print)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")