Example #1
    def __init__(self, env, rm_files):
        """
        RM environment
        --------------------
        It adds a set of RMs to the environment:
            - Every episode, the agent has to solve a different RM task
            - This code keeps track of the current state on the current RM task
            - The id of the RM state is appended to the observations
            - The reward given to the agent comes from the RM

        Parameters
        --------------------
            - env: original environment. It must implement the following function:
                - get_events(...): Returns the propositions that currently hold in the environment.
            - rm_files: list of strings with paths to the RM files.
        """
        super().__init__(env)

        # Loading the reward machines
        self.rm_files = rm_files
        self.reward_machines = []
        self.num_rm_states = 0
        for rm_file in rm_files:
            rm = RewardMachine(rm_file)
            self.num_rm_states += len(rm.get_states())
            self.reward_machines.append(rm)
        self.num_rms = len(self.reward_machines)

        # The observation space is a dictionary including the env features and a one-hot representation of the state in the reward machine
        self.observation_dict = spaces.Dict({
            'features':
            env.observation_space,
            'rm-state':
            spaces.Box(low=0,
                       high=1,
                       shape=(self.num_rm_states, ),
                       dtype=np.uint8)
        })
        flatdim = gym.spaces.flatdim(self.observation_dict)
        s_low = float(env.observation_space.low[0])
        s_high = float(env.observation_space.high[0])
        self.observation_space = spaces.Box(low=s_low,
                                            high=s_high,
                                            shape=(flatdim, ),
                                            dtype=np.float32)

        # Computing one-hot encodings for the non-terminal RM states
        self.rm_state_features = {}
        for rm_id, rm in enumerate(self.reward_machines):
            for u_id in rm.get_states():
                u_features = np.zeros(self.num_rm_states)
                u_features[len(self.rm_state_features)] = 1
                self.rm_state_features[(rm_id, u_id)] = u_features
        self.rm_done_feat = np.zeros(
            self.num_rm_states
        )  # for terminal RM states, we give as features an array of zeros

        # Selecting the current RM task
        self.current_rm_id = -1
        self.current_rm = None
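The wrapper above flattens a Dict observation, environment features followed by a one-hot encoding of the current RM state, into a single Box vector. A minimal NumPy-only sketch of that flattening (the sizes, 4 features and 3 RM states, are made up purely for illustration):

import numpy as np

# hypothetical sizes, chosen only for illustration
num_env_features = 4
num_rm_states = 3

env_features = np.random.rand(num_env_features).astype(np.float32)

# one-hot encoding of the current RM state (here, state 1 of 3)
rm_state_feat = np.zeros(num_rm_states, dtype=np.float32)
rm_state_feat[1] = 1.0

# the flattened observation the wrapper exposes: features followed by the RM one-hot
obs = np.concatenate([env_features, rm_state_feat])
assert obs.shape == (num_env_features + num_rm_states,)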
Example #2
    def step(self, action):
        # Map keyboard keys to actions; only these keys are valid.
        str_to_action = {
            "w": Actions.up.value,
            "d": Actions.right.value,
            "s": Actions.down.value,
            "a": Actions.left.value
        }
        t = "/home/adiojha629/drone_research_summer2020/officeworld_gym/gym-officeworld/gym_officeworld/envs/reward_machines/t1.txt"  # text file that sets up the reward machine
        self.rm = RewardMachine(t)  # create the reward machine (note: it is rebuilt on every step, so the RM state is not carried across steps)
        u1 = self.rm.get_initial_state()
        s1 = self.get_state()
        # defaults, in case the action key is invalid
        reward, next_state, boolean_episode_done = 0, s1, False
        if action in str_to_action:
            self.execute_action(str_to_action[action])
            events = self.get_true_propositions()  # propositions that currently hold in the game
            u2 = self.rm.get_next_state(u1, events)  # get the next RM state
            s2 = self.get_state()
            r = self.rm.get_reward(
                u1, u2, s1, action,
                s2)  # use the reward machine to generate the reward
            reward, next_state = self.rm.get_rewards_and_next_states(
                s1, action, s2, events)

            boolean_episode_done = self.env_game_over or self.rm.is_terminal_state(
                u2
            )  # the episode ends when the game is over or the RM reaches a terminal state
        additional_information = []

        return [
            next_state, reward, boolean_episode_done, additional_information
        ]
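Assuming the step method above belongs to a gym-style OfficeWorld environment class (the class name is not shown in this snippet, so OfficeWorldEnv below is a hypothetical placeholder), a manual interaction loop against its [next_state, reward, done, info] return value might look like this sketch:

# sketch only: OfficeWorldEnv is a hypothetical name for the class that owns step()
env = OfficeWorldEnv()
done = False
while not done:
    key = input("Action (w/a/s/d)? ")
    next_state, reward, done, info = env.step(key)
    print("reward:", reward, "done:", done)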
Example #3
def load_options_model_test_composition(alg_name, tester, curriculum,
                                        num_times, new_task, show_print):
    learning_params = tester.learning_params

    for n in range(num_times):
        random.seed(n)
        sess = tf.Session()
        curriculum.restart()

        options, option2file = get_options_rm(tester)
        curr_option_id = 0
        # getting the number of network inputs and outputs
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        # initializing the bank of policies with one policy per option
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params, options)

        # Load the model
        saver = tf.train.Saver()

        # Get path
        if task_aux.params.game_type != "officeworld":
            save_model_path = '../model/' + str(
                task_aux.params.game_type) + '/' + task_aux.game.get_map_id()
        else:
            save_model_path = '../model/' + str(task_aux.params.game_type)
        saver.restore(sess, tf.train.latest_checkpoint(save_model_path))

        reward_machines = tester.get_reward_machines()
        print("Loaded {} policies (options)".format(
            policy_bank.get_number_of_policies()))

        new_task_rm = RewardMachine(new_task.rm_file)
        linearized_plans = new_task.get_linearized_plan()
        print("There are {} possible linearized plans: {}".format(
            len(linearized_plans), linearized_plans))

        least_cost = float('inf')
        best_policy = []  # linearized plan
        best_reward = 0
        for i, curr_plan in enumerate(linearized_plans):
            cost, r_total = execute_plan_get_cost(curr_plan, tester,
                                                  curriculum, options,
                                                  option2file, policy_bank,
                                                  new_task_rm)
            if cost < least_cost:
                least_cost = cost
                best_policy = curr_plan
                best_reward = r_total

        print("Rewards", best_reward)
        print("Steps", least_cost)
        print(best_policy)
Example #4
    def __init__(self, learning_params, testing_params, experiment, result_file=None):
        if result_file is None: # in this case, we are running a new experiment
            self.learning_params = learning_params
            self.testing_params = testing_params
            # Reading the file
            self.experiment = experiment
            f = open(experiment)
            lines = [l.rstrip() for l in f]
            f.close()

            # setting the right world environment
            self.game_type = eval(lines[0])
            if self.game_type == "officeworld":
                self.world = TesterOfficeWorld(experiment, learning_params.gamma)
            if self.game_type == "craftworld":
                self.world = TesterCraftWorld(experiment, learning_params.tabular_case, learning_params.gamma)
            if self.game_type == "waterworld":
                self.world = TesterWaterWorld(experiment, learning_params.use_random_maps)

            # Creating the reward machines for each task
            self.reward_machines = []
            self.file_to_reward_machine = {}
            rm_files = self.world.get_reward_machine_files()
            for i in range(len(rm_files)):
                rm_file = rm_files[i]
                self.file_to_reward_machine[rm_file] = i
                self.reward_machines.append(RewardMachine(rm_file))

            # I store the results here
            self.results = {}
            self.steps = []
            aux_tasks = self.get_task_specifications()
            for i in range(len(aux_tasks)):
                t_str = str(aux_tasks[i])
                self.results[t_str] = {}

        else:
            # In this case, we load the results that were precomputed in a previous run
            data = read_json(result_file)
            self.game_type = data['game_type']
            if self.game_type == "craftworld":
                self.world = TesterCraftWorld(None, None, None, data['world'])
            if self.game_type == "waterworld":
                self.world = TesterWaterWorld(None, None, data['world'])
            if self.game_type == "officeworld":
                self.world = TesterOfficeWorld(None, None, data['world'])

            self.results = data['results']
            self.steps   = data['steps']            
            # obs: json transforms the integer keys from 'results' into strings,
            # so I'm changing the 'steps' to strings too
            for i in range(len(self.steps)):
                self.steps[i] = str(self.steps[i])
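The comment about JSON and integer keys at the end of the loader refers to standard json behavior: object keys are always serialized as strings, so integer keys come back as strings after a save/load round trip. A tiny standalone illustration:

import json

results = {1: "first step", 2: "second step"}
round_trip = json.loads(json.dumps(results))
print(round_trip)         # {'1': 'first step', '2': 'second step'}
print(1 in round_trip)    # False -- the keys are now strings
print('1' in round_trip)  # True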
Example #5
def get_options_rm(tester):
    # Loading options for this experiment
    option_folder = "../experiments/%s/options/" % tester.get_world_name()

    options = [
    ]  # NOTE: The policy bank also uses this list (in the same order)
    option2file = []
    for option_file in _get_option_files(
            option_folder
    ):  # NOTE: The option id indicates what the option does (e.g. "a&!n")
        option = RewardMachine(join(option_folder, option_file + ".txt"))
        options.append(option)
        option2file.append(option_file)

    return options, option2file
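_get_option_files is not shown in this snippet; a plausible stand-in, purely an assumption about its behavior based on how it is used above, would list the RM option files in the folder and return their names without the .txt extension:

import os

def _get_option_files(option_folder):
    # assumed behavior: return the base names of every .txt file in the folder
    return sorted(f[:-len(".txt")] for f in os.listdir(option_folder)
                  if f.endswith(".txt"))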
Example #6
def play(params, task, max_time):
    from reward_machines.reward_machine import RewardMachine

    # commands
    str_to_action = {
        "w": Actions.up.value,
        "d": Actions.right.value,
        "s": Actions.down.value,
        "a": Actions.left.value
    }
    # play the game!
    game = CraftWorld(params)
    rm = RewardMachine(task)
    s1 = game.get_state()
    u1 = rm.get_initial_state()
    reward = 0  # default, in case no valid action is ever executed before the loop ends
    for t in range(max_time):
        # Showing game
        game.show_map()
        print("Events:", game.get_true_propositions())
        print("Features:", game.get_features())
        print("Features.shape:", game.get_features().shape)
        print("Features.manhattan_distance:",
              game._get_features_manhattan_distance())
        acts = game.get_actions()
        # Getting action
        print("\nAction? ", end="")
        a = input()
        print()
        # Executing action
        if a in str_to_action and str_to_action[a] in acts:
            game.execute_action(str_to_action[a])

            s2 = game.get_state()
            events = game.get_true_propositions()
            u2 = rm.get_next_state(u1, events)
            reward = rm.get_reward(u1, u2, s1, a, s2)

            if game.env_game_over or rm.is_terminal_state(u2):  # Game Over
                print("Game Over")
                break

            s1, u1 = s2, u2
        else:
            print("Forbidden action")
    game.show_map()
    return reward
Example #7
def get_qrm_generalization_performance(alg_name, tester, curriculum, num_times,
                                       new_tasks, show_print):
    """
    Testing all the tasks in new_tasks and return the success rate and cumulative reward
    """

    sess = tf.Session()
    curriculum.restart()
    # Initialize a policy_bank graph to be loaded with saved model
    task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
    num_features = len(task_aux.get_features())
    num_actions = len(task_aux.get_actions())
    policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                tester.learning_params,
                                tester.get_reward_machines())

    # Load the model
    saver = tf.train.Saver()

    # Get path
    if task_aux.params.game_type == "craftworld":
        save_model_path = '../model/' + str(
            task_aux.params.game_type) + '/' + task_aux.game.get_map_id()
    else:
        save_model_path = '../model/' + str(task_aux.params.game_type)

    saver.restore(sess, tf.train.latest_checkpoint(save_model_path))

    reward_machines = tester.get_reward_machines()
    print("Loaded {} policies (RMs)".format(len(reward_machines)))

    success_count = 0
    all_task_rewards = []

    for new_task in new_tasks:
        # partially-ordered RM of the new task
        new_task_rm = RewardMachine(new_task.rm_file)
        linearized_plans = new_task.get_linearized_plan()
        print("There are {} possible linearized plans: {}".format(
            len(linearized_plans), linearized_plans))
        least_cost = float('inf')
        best_policy = [
        ]  # list of (rm_id, state_id) corresponding to each action

        for i, curr_plan in enumerate(linearized_plans):
            # Get the least cost path for the current linearized plan
            cost, switching_seq = dfs_search_policy(curr_plan,
                                                    tester,
                                                    curriculum,
                                                    new_task_rm,
                                                    reward_machines,
                                                    policy_bank,
                                                    bound=least_cost)
            if cost < least_cost:
                print(cost, switching_seq)
                least_cost = cost
                best_policy = switching_seq
                # finding the optimal plan takes too long, so end early once a solution is found
                break

        # Couldn't solve the task
        if least_cost == np.inf:
            print("Failed to execute this task: {}".format(new_task))
            r_total = 0.0
            all_task_rewards.append(r_total)
            continue

        # Execute the best policy
        print("Executing Best Policy...{} ({} steps)".format(
            best_policy, least_cost))
        task = Game(tester.get_task_params(curriculum.get_current_task()))
        new_task_u1 = new_task_rm.get_initial_state()
        s1, s1_features = task.get_state_and_features()
        r_total = 0
        curr_policy = None

        for t in range(int(least_cost)):
            if show_print:
                task.render()
            if curr_policy is None:
                curr_policy = best_policy.pop(0)
            curr_policy_rm = reward_machines[curr_policy[0]]

            a = policy_bank.get_best_action(curr_policy[0],
                                            curr_policy[1],
                                            s1_features.reshape(
                                                (1, num_features)),
                                            add_noise=False)
            task.execute_action(a)

            s2, s2_features = task.get_state_and_features()
            new_task_u2 = new_task_rm.get_next_state(
                new_task_u1, task.get_true_propositions())

            curr_policy_u2 = curr_policy_rm.get_next_state(
                curr_policy[1], task.get_true_propositions())
            desired_next_state = curr_policy_rm.get_next_state(
                curr_policy[1], curr_policy[2])
            if curr_policy_u2 == desired_next_state:
                logger.info("EXECUTED ACTION {}, SWITCHING POLICIES".format(
                    curr_policy[2]))
                curr_policy = None

            r = new_task_rm.get_reward(new_task_u1, new_task_u2, s1, a, s2)
            r_total += r * tester.learning_params.gamma**t

            s1, s1_features = s2, s2_features
            new_task_u1 = new_task_u2
        if show_print:
            task.render()
        print("Rewards:", r_total)

        all_task_rewards.append(r_total)
        if r_total > 0:
            success_count += 1

    success_rate = float(success_count) / len(new_tasks)
    acc_reward = sum(all_task_rewards)
    print(all_task_rewards)
    return success_rate, acc_reward
Example #8
def play():
    from reward_machines.reward_machine import RewardMachine

    # commands
    str_to_action = {
        "w": Actions.up.value,
        "d": Actions.right.value,
        "s": Actions.down.value,
        "a": Actions.left.value
    }
    params = OfficeWorldParams()

    # play the game!
    tasks = [
        "../../experiments/office/reward_machines/t%d.txt" % i
        for i in [1, 2, 3, 4]
    ]
    reward_machines = []
    for t in tasks:
        reward_machines.append(RewardMachine(t))
    for i in range(len(tasks)):
        print("Running", tasks[i])

        game = OfficeWorld(params)  # setting the environment
        rm = reward_machines[i]  # setting the reward machine
        s1 = game.get_state()
        u1 = rm.get_initial_state()
        while True:
            # Showing game
            game.show()
            print("Events:", game.get_true_propositions())
            # print(game.getLTLGoal())
            # Getting action
            print("u:", u1)
            print("\nAction? ", end="")
            a = input()
            print()
            # Executing action
            if a in str_to_action:
                game.execute_action(str_to_action[a])

                # Getting new state and truth valuation
                s2 = game.get_state()
                events = game.get_true_propositions()
                u2 = rm.get_next_state(u1, events)
                r = rm.get_reward(u1, u2, s1, a, s2)

                # Getting rewards and next states for each reward machine
                rewards, next_states = [], []
                for j in range(len(reward_machines)):
                    j_rewards, j_next_states = reward_machines[
                        j].get_rewards_and_next_states(s1, a, s2, events)
                    rewards.append(j_rewards)
                    next_states.append(j_next_states)

                print("---------------------")
                print("Rewards:", rewards)
                print("Next States:", next_states)
                print("Reward:", r)
                print("---------------------")

                if game.env_game_over or rm.is_terminal_state(u2):  # Game Over
                    break

                s1 = s2
                u1 = u2
            else:
                print("Forbidden action")
        game.show()
        print("Events:", game.get_true_propositions())
Example #9
def play():
    import pygame, time
    from reward_machines.reward_machine import RewardMachine

    from tester.tester import Tester
    from tester.tester_params import TestingParameters    
    from qrm.learning_params import LearningParameters

    # hack: moving one directory up (to keep relative references to ./src)
    import os
    os.chdir("../")

    tester = Tester(LearningParameters(), TestingParameters(), "../experiments/water/tests/water_7.txt")
    if tester is None:
        task = "../experiments/water/reward_machines/t1.txt"
        state_file = "../experiments/water/maps/world_0.pkl"
        max_x = 400
        max_y = 400
        b_num_per_color = 2
        b_radius = 15
        use_velocities = True
        ball_disappear = False

        params = WaterWorldParams(state_file, b_radius=b_radius, max_x=max_x, max_y=max_y, 
                                  b_num_per_color=b_num_per_color, use_velocities = use_velocities, 
                                  ball_disappear=ball_disappear)
    else:
        task   = tester.get_task_rms()[-2]
        params = tester.get_task_params(task).game_params

    max_x, max_y = params.max_x, params.max_y

    game = WaterWorld(params)    
    rm = RewardMachine(task) 
    s1 = game.get_state()
    u1 = rm.get_initial_state()

    print("actions", game.get_actions())

    pygame.init()
    
    black = (0,0,0)
    white = (255,255,255)
    colors = get_colors()
    
    gameDisplay = pygame.display.set_mode((max_x, max_y))
    pygame.display.set_caption('Water world :)')
    clock = pygame.time.Clock()
    crashed = False

    t_previous = time.time()
    actions = set()
    while not crashed:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                crashed = True
            if event.type == pygame.KEYUP:
                if Actions.left in actions and event.key == pygame.K_LEFT:
                    actions.remove(Actions.left)
                if Actions.right in actions and event.key == pygame.K_RIGHT:
                    actions.remove(Actions.right)
                if Actions.up in actions and event.key == pygame.K_UP:
                    actions.remove(Actions.up)
                if Actions.down in actions and event.key == pygame.K_DOWN:
                    actions.remove(Actions.down)
            if event.type == pygame.KEYDOWN:
                if event.key == pygame.K_LEFT:
                    actions.add(Actions.left)
                if event.key == pygame.K_RIGHT:
                    actions.add(Actions.right)
                if event.key == pygame.K_UP:
                    actions.add(Actions.up)
                if event.key == pygame.K_DOWN:
                    actions.add(Actions.down)
            

        t_current = time.time()
        t_delta = (t_current - t_previous)

        # Getting the action
        if len(actions) == 0: a = Actions.none
        else: a = random.choice(list(actions))

        # Executing the action
        game.execute_action(a.value, t_delta)

        s2 = game.get_state()
        events = game.get_true_propositions()
        u2 = rm.get_next_state(u1, events)
        reward = rm.get_reward(u1,u2,s1,a,s2)

        # printing image
        gameDisplay.fill(white)
        for b in game.balls:
            draw_ball(b, colors, 0, gameDisplay, pygame, max_y)
        draw_ball(game.agent, colors, 3, gameDisplay, pygame, max_y)
        pygame.display.update()
        clock.tick(20)

        # print info related to the task
        if reward > 0: print("REWARD!! ----------------!------------!")
        if rm.is_terminal_state(u2): 
            print("Machine state:", u2, "(terminal)")
        else:
            print("Machine state:", u2)

        t_previous = t_current
        s1, u1 = s2, u2

    pygame.quit()
Example #10
def run_lrm(env_params, lp, rl):
    """
    This code learns a reward machine from experience and uses DQN (or QRM) to learn an optimal policy for that RM:
        - 'env_params' is the environment parameters
        - 'lp' is the set of learning parameters
        - 'rl' selects the RL approach ("dqn" or "qrm")
    Returns the training rewards, the RM learning scores, and info about the learned RM
    """
    # Initializing parameters and the game
    env = Game(env_params)
    rm = RewardMachine(lp.rm_u_max, lp.rm_preprocess,
                       lp.rm_tabu_size, lp.rm_workers, lp.rm_lr_steps,
                       env.get_perfect_rm(), lp.use_perfect_rm)
    actions = env.get_actions()
    policy = None
    train_rewards = []
    rm_scores = []
    reward_total = 0
    last_reward = 0
    step = 0

    # Collecting random traces for learning the reward machine
    print("Collecting random traces...")
    while step < lp.rm_init_steps:
        # running an episode using a random policy
        env.restart()
        trace = [(env.get_events(), 0.0)]
        for _ in range(lp.episode_horizon):
            # executing a random action
            a = random.choice(actions)
            reward, done = env.execute_action(a)
            o2_events = env.get_events()
            reward_total += reward
            trace.append((o2_events, reward))
            step += 1
            # Testing
            if step % lp.test_freq == 0:
                print("Step: %d\tTrain: %0.1f" %
                      (step, reward_total - last_reward))
                train_rewards.append((step, reward_total - last_reward))
                last_reward = reward_total
            # checking if the episode finishes
            if done or lp.rm_init_steps <= step:
                if done: rm.add_terminal_observations(o2_events)
                break
        # adding this trace to the set of traces that we use to learn the rm
        rm.add_trace(trace)

    # Learning the reward machine using the collected traces
    print("Learning a reward machines...")
    _, info = rm.learn_the_reward_machine()
    rm_scores.append((step, ) + info)

    # Start learning a policy for the current rm
    finish_learning = False
    while step < lp.train_steps and not finish_learning:
        env.restart()
        o1_events = env.get_events()
        o1_features = env.get_features()
        u1 = rm.get_initial_state()
        trace = [(o1_events, 0.0)]
        add_trace = False

        for _ in range(lp.episode_horizon):

            # reinitializing the policy if the rm changed
            if policy is None:
                print("Learning a policy for the current RM...")
                if rl == "dqn":
                    policy = DQN(lp, len(o1_features), len(actions), rm)
                elif rl == "qrm":
                    policy = QRM(lp, len(o1_features), len(actions), rm)
                else:
                    assert False, "RL approach is not supported yet"

            # selecting an action using epsilon greedy
            a = policy.get_best_action(o1_features, u1, lp.epsilon)

            # executing the selected action
            reward, done = env.execute_action(a)
            o2_events = env.get_events()
            o2_features = env.get_features()
            u2 = rm.get_next_state(u1, o2_events)

            # updating the number of steps and total reward
            trace.append((o2_events, reward))
            reward_total += reward
            step += 1

            # updating the current RM if needed
            rm.update_rewards(u1, o2_events, reward)
            if done: rm.add_terminal_observations(o2_events)
            if rm.is_observation_impossible(u1, o1_events, o2_events):
                # if o2 is impossible according to the current RM,
                # then the RM has a bug and must be relearned
                add_trace = True

            # Saving this transition
            policy.add_experience(o1_events, o1_features, u1, a, reward,
                                  o2_events, o2_features, u2, float(done))

            # Learning and updating the target networks (if needed)
            policy.learn_if_needed()

            # Testing
            if step % lp.test_freq == 0:
                print("Step: %d\tTrain: %0.1f" %
                      (step, reward_total - last_reward))
                train_rewards.append((step, reward_total - last_reward))
                last_reward = reward_total
                # finishing the experiment if the max number of learning steps was reached
                if policy._get_step() > lp.max_learning_steps:
                    finish_learning = True

            # checking if the episode finishes or the agent reaches the maximum number of training steps
            if done or lp.train_steps <= step or finish_learning:
                break

            # Moving to the next state
            o1_events, o1_features, u1 = o2_events, o2_features, u2

        # If the trace isn't correctly predicted by the reward machine,
        # we add the trace and relearn the machine
        if add_trace and step < lp.train_steps and not finish_learning:
            print("Relearning the reward machine...")
            rm.add_trace(trace)
            same_rm, info = rm.learn_the_reward_machine()
            rm_scores.append((step, ) + info)
            if not same_rm:
                # if the RM changed, we have to relearn all the q-values...
                policy.close()
                policy = None
            else:
                print("the new RM is not better than the current RM!!")
                #input()

    if policy is not None:
        policy.close()
        policy = None

    # return the training rewards, the RM scores, and info about the learned RM
    return train_rewards, rm_scores, rm.get_info()
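For reference, the traces handed to rm.add_trace above are plain lists of (events, reward) pairs, one per time step, starting with the events observed in the initial state. A hand-built example of that shape (the event strings themselves are made up):

# a trace is a list of (events, reward) pairs collected along one episode
trace = [
    ("", 0.0),    # initial observation, no reward yet
    ("a", 0.0),   # proposition 'a' became true
    ("ab", 1.0),  # propositions 'a' and 'b' hold and a reward was received
]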
Example #11
def rm_net_to_reward_machine(rm_net, world, strict=False):
    rm = RewardMachine()
    node2id = dict()
    for i, node in enumerate(rm_net.nodes()):
        rm.add_state(i)
        node2id[node] = i

    for node in rm_net.nodes():
        # no parent, initial state
        if len(list(rm_net.predecessors(node))) == 0:
            rm.set_initial_state(node2id[node])

        selfloop = ['!{}'.format(e)
                    for e in get_all_events(world)] if strict else []
        for child in rm_net.successors(node):
            action = rm_net.get_edge_data(node, child)['attr']
            event_prop = action_to_prop(str(action), world)
            if event_prop in selfloop:
                selfloop.pop(selfloop.index(event_prop))
            else:
                if not strict:
                    selfloop.append('!' + str(event_prop))
            reward = 0
            if len(list(rm_net.successors(child))) == 0:
                # child is terminal, get reward 1
                reward = 1
            rm.add_transition(node2id[node], node2id[child], event_prop,
                              ConstantRewardFunction(reward))

        # add self loop
        if len(list(rm_net.successors(node))) == 0:
            # no children, terminal state
            rm.set_terminal_state(node2id[node])
        else:
            rm.add_transition(node2id[node], node2id[node], '&'.join(selfloop),
                              ConstantRewardFunction(0))

    return rm
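The rm_net argument is used like a networkx-style directed graph whose edges carry the action to execute under the 'attr' key. Assuming it really is a networkx.DiGraph (this snippet does not confirm the exact type), a graph of the expected shape can be built like this; the node names and action labels are made up:

import networkx as nx

# a two-step chain n0 -> n1 -> n2; each edge stores its action under 'attr',
# which rm_net_to_reward_machine reads via get_edge_data(node, child)['attr']
rm_net = nx.DiGraph()
rm_net.add_edge("n0", "n1", attr="get_wood")
rm_net.add_edge("n1", "n2", attr="use_toolshed")

# rm = rm_net_to_reward_machine(rm_net, world)  # 'world' comes from the repository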
Example #12
def get_hrl_generalization_performance(alg_name, tester, curriculum, num_times,
                                       new_tasks, show_print, use_rm):
    learning_params = tester.learning_params

    sess = tf.Session()
    curriculum.restart()

    options, option2file = get_options_rm(tester)

    # getting the number of network inputs and outputs
    task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
    num_features = len(task_aux.get_features())
    num_actions = len(task_aux.get_actions())

    # initializing the meta controllers (one metacontroller per task)
    meta_controllers = []
    reward_machines = tester.get_reward_machines()
    for i in range(len(reward_machines)):
        rm = reward_machines[i]
        num_states = len(rm.get_states())
        policy_name = "Reward_Machine_%d" % i
        mc = MetaController(sess, policy_name, options, option2file, rm,
                            use_rm, learning_params, num_features, num_states,
                            show_print)
        meta_controllers.append(mc)

    # initializing the bank of policies with one policy per option
    policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                learning_params, options)

    # Load the model
    saver = tf.train.Saver()
    # Get path
    if task_aux.params.game_type == "craftworld":
        save_model_path = '../model/' + str(
            task_aux.params.game_type) + '/' + task_aux.game.get_map_id()
    else:
        save_model_path = '../model/' + str(task_aux.params.game_type)
    saver.restore(sess, tf.train.latest_checkpoint(save_model_path))

    reward_machines = tester.get_reward_machines()
    print("Loaded {} policies (options)".format(
        policy_bank.get_number_of_policies()))

    success_count = 0
    all_task_rewards = []

    for new_task in new_tasks:
        new_task_rm = RewardMachine(new_task.rm_file)
        linearized_plans = new_task.get_linearized_plan()
        print("There are {} possible linearized plans: {}".format(
            len(linearized_plans), linearized_plans))

        least_cost = float('inf')
        best_policy = []  # linearized plan
        best_reward = 0
        for i, curr_plan in enumerate(linearized_plans):
            cost, r_total = execute_plan_get_cost(curr_plan, tester,
                                                  curriculum, options,
                                                  option2file, policy_bank,
                                                  new_task_rm)
            if cost < least_cost:
                print("Step:", cost)
                least_cost = cost
                best_policy = curr_plan
                best_reward = r_total
                if r_total > 0:
                    success_count += 1
                    all_task_rewards.append(r_total)
                # end early if successfully finished task
                break

        if least_cost == np.inf:
            print("Failed to execute this task: {}".format(new_task))
            continue

    success_rate = float(success_count) / len(new_tasks)
    acc_reward = sum(all_task_rewards)
    print(all_task_rewards)
    return success_rate, acc_reward
Example #13
def play():
    from tester.tester import Tester
    from tester.tester_params import TestingParameters
    from qrm.learning_params import LearningParameters
    from reward_machines.reward_machine import RewardMachine

    import os
    os.chdir("../")
    tester = Tester(LearningParameters(), TestingParameters(),
                    "../experiments/mouse/tests/mouse_0.txt")

    task = tester.get_task_rms()[1]
    params = tester.get_task_params(task).game_params
    max_x = params.max_x
    max_y = params.max_y
    game = MouseWorld(params)
    rm = RewardMachine(task)
    s1 = game.get_state()
    u1 = rm.get_initial_state()

    pygame.init()
    gameDisplay = pygame.display.set_mode((max_x, max_y))
    pygame.display.set_caption('Fake Keyboard')
    clock = pygame.time.Clock()
    crashed = False

    t_previous = time.time()
    actions = set()
    while not crashed:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                crashed = True

            if event.type == pygame.KEYUP:
                if Actions.left in actions and event.key == pygame.K_LEFT:
                    actions.remove(Actions.left)
                if Actions.right in actions and event.key == pygame.K_RIGHT:
                    actions.remove(Actions.right)
                if Actions.up in actions and event.key == pygame.K_UP:
                    actions.remove(Actions.up)
                if Actions.down in actions and event.key == pygame.K_DOWN:
                    actions.remove(Actions.down)
                if Actions.jump in actions and event.key == pygame.K_SPACE:
                    actions.remove(Actions.jump)
            if event.type == pygame.KEYDOWN:
                if event.key == pygame.K_LEFT:
                    actions.add(Actions.left)
                if event.key == pygame.K_RIGHT:
                    actions.add(Actions.right)
                if event.key == pygame.K_UP:
                    actions.add(Actions.up)
                if event.key == pygame.K_DOWN:
                    actions.add(Actions.down)
                if event.key == pygame.K_SPACE:
                    actions.add(Actions.jump)

        t_current = time.time()
        t_delta = (t_current - t_previous)

        if len(actions) == 0:
            a = Actions.none
        else:
            a = random.choice(list(actions))

        # Executing the action
        game.execute_action(a.value, t_delta)

        s2 = game.get_state()
        events = game.get_true_propositions()
        u2 = rm.get_next_state(u1, events)
        reward = rm.get_reward(u1, u2, s1, a, s2)

        if reward > 0:
            print("REWARD ", reward)
        if rm.is_terminal_state(u2):
            print("Machine state:", u2, "(terminal)")
        else:
            print("Machine state:", u2)

        # Printing Image
        gameDisplay.fill(Colors.WHITE.value)
        for k in game.keyboard_keys:
            k.draw_on_display(gameDisplay)
        game.agent.draw_on_display(gameDisplay)
        game.draw_current_text_on_display(gameDisplay)

        pygame.display.update()
        clock.tick(20)

        t_previous = t_current
        s1, u1 = s2, u2

    pygame.quit()
Example #14
    def update_hypothesis_machine(self):
        self.hypothesis_machine = RewardMachine(self.hypothesis_machine_file)
Example #15
def load_model_and_test_composition(alg_name, tester, curriculum, num_times,
                                    new_task, show_print):
    """
    Testing a single task (see run_new_task.py)
    TODO: refactor with get_qrm_generalization_performance
    """
    for n in range(num_times):
        random.seed(n)
        sess = tf.Session()

        curriculum.restart()

        # Initialize a policy_bank graph to be loaded with saved model
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    tester.learning_params,
                                    tester.get_reward_machines())

        # Load the model
        saver = tf.train.Saver()

        # Get path
        if task_aux.params.game_type == "craftworld":
            save_model_path = '../model/' + str(
                task_aux.params.game_type) + '/' + task_aux.game.get_map_id()
        else:
            save_model_path = '../model/' + str(task_aux.params.game_type)

        saver.restore(sess, tf.train.latest_checkpoint(save_model_path))

        reward_machines = tester.get_reward_machines()
        print("Loaded {} policies (RMs)".format(len(reward_machines)))

        # partially-ordered RM of the new task
        new_task_rm = RewardMachine(new_task.rm_file)
        linearized_plans = new_task.get_linearized_plan()
        print("There are {} possible linearized plans: {}".format(
            len(linearized_plans), linearized_plans))
        least_cost = float('inf')
        best_policy = [
        ]  # list of (rm_id, state_id) corresponding to each action

        for i, curr_plan in enumerate(linearized_plans):
            # Get the least cost path for the current linearized plan
            # cost, switching_seq = search_policy(curr_plan, tester, curriculum, new_task_rm, reward_machines,
            #                                     policy_bank, bound=least_cost)
            cost, switching_seq = dfs_search_policy(curr_plan,
                                                    tester,
                                                    curriculum,
                                                    new_task_rm,
                                                    reward_machines,
                                                    policy_bank,
                                                    bound=least_cost)
            if cost < least_cost:
                print(cost, switching_seq)
                least_cost = cost
                best_policy = switching_seq

        # Execute the best policy
        print("Executing Best Policy...{} ({} steps)".format(
            best_policy, least_cost))
        task = Game(tester.get_task_params(curriculum.get_current_task()))
        new_task_u1 = new_task_rm.get_initial_state()
        s1, s1_features = task.get_state_and_features()
        r_total = 0
        curr_policy = None
        for t in range(int(least_cost)):
            if show_print:
                task.render()
            if curr_policy is None:
                curr_policy = best_policy.pop(0)
            curr_policy_rm = reward_machines[curr_policy[0]]

            a = policy_bank.get_best_action(curr_policy[0],
                                            curr_policy[1],
                                            s1_features.reshape(
                                                (1, num_features)),
                                            add_noise=False)
            if show_print: print("Action:", Actions(a))
            task.execute_action(a)

            s2, s2_features = task.get_state_and_features()
            new_task_u2 = new_task_rm.get_next_state(
                new_task_u1, task.get_true_propositions())

            curr_policy_u2 = curr_policy_rm.get_next_state(
                curr_policy[1], task.get_true_propositions())
            desired_next_state = curr_policy_rm.get_next_state(
                curr_policy[1], curr_policy[2])
            if curr_policy_u2 == desired_next_state:
                logger.info("EXECUTED ACTION {}, SWITCHING POLICIES".format(
                    curr_policy[2]))
                curr_policy = None

            r = new_task_rm.get_reward(new_task_u1, new_task_u2, s1, a, s2)
            r_total += r * tester.learning_params.gamma**t

            s1, s1_features = s2, s2_features
            new_task_u1 = new_task_u2
        if show_print:
            task.render()
        print("Rewards:", r_total)

        return r_total
Example #16
def compute_rm_from_graph(lm_graph, merge_init_nodes=True):
    """
    Method 1
    - Each non-init landmark corresponds to an RM (with a terminal state)
    - Each edge in an RM corresponds to the actions needed to reach it (ideally only one action for the nearest landmark)
    - The RM only reflects the necessary orderings; it is not partially ordered

    :param lm_graph: LandmarkGraph
    :param merge_init_nodes: bool
    :return: set of RewardMachine
    """
    if merge_init_nodes:
        lm_graph.merge_init_nodes()

    # For each landmark node that is not the initial state, create a RM for it
    reward_machines = set()
    for n_id, n in lm_graph.nodes.items():
        if not n.in_init():
            # initialize empty RewardMachine
            new_rm = RewardMachine()
            # populate the RewardMachine from bottom up
            openlist = list([n])
            while len(openlist) != 0:
                curr_node = openlist.pop(0)
                # add current state
                new_rm.add_state_with_landmarks(n_id, copy.copy(curr_node))

                # look at parent landmarks that must be achieved before current landmark,
                for p_id in curr_node.parents:
                    # add a transition from parent to current
                    reward = 0
                    if curr_node == n:
                        reward = 1
                        new_rm.set_terminal_state(curr_node.id)

                    new_rm.add_transition(p_id, n_id, 'TODO',
                                          ConstantRewardFunction(reward))
                    openlist.append(lm_graph.nodes[p_id])

                if len(curr_node.parents) == 0:
                    # this is the initial state
                    new_rm.set_initial_state(curr_node.id)

                if len(curr_node.children) == 0:
                    # this is the terminal state
                    new_rm.set_terminal_state(curr_node.id)

            new_rm.get_txt_representation()
            reward_machines.add(new_rm)

    return reward_machines
Example #17
def run_hrl_experiments(alg_name, tester, curriculum, num_times, show_print,
                        use_rm):
    """
        NOTE: To implement this baseline, we encode each option as a reward machine with one transition
        - use_rm: Indicates whether to prune options using the reward machine
    """

    # Setting up the saver
    saver = Saver(alg_name, tester, curriculum)
    learning_params = tester.learning_params

    # Running the tasks 'num_times'
    time_init = time.time()
    for t in range(num_times):

        # Setting the random seed to 't'
        random.seed(t)
        sess = tf.Session()

        # Resetting default values
        curriculum.restart()

        # Creating the experience replay buffer
        replay_buffer, beta_schedule = create_experience_replay_buffer(
            learning_params.buffer_size, learning_params.prioritized_replay,
            learning_params.prioritized_replay_alpha,
            learning_params.prioritized_replay_beta0, curriculum.total_steps
            if learning_params.prioritized_replay_beta_iters is None else
            learning_params.prioritized_replay_beta_iters)

        # Loading options for this experiment
        option_folder = "../experiments/%s/options/" % tester.get_world_name()

        options = [
        ]  # NOTE: The policy bank also uses this list (in the same order)
        option2file = []
        for option_file in _get_option_files(
                option_folder
        ):  # NOTE: The option id indicates what the option does (e.g. "a&!n")
            option = RewardMachine(join(option_folder, option_file + ".txt"))
            options.append(option)
            option2file.append(option_file)

        # getting the number of network inputs and outputs
        task_aux = Game(tester.get_task_params(curriculum.get_current_task()))
        num_features = len(task_aux.get_features())
        num_actions = len(task_aux.get_actions())

        # initializing the meta controllers (one metacontroller per task)
        meta_controllers = []
        reward_machines = tester.get_reward_machines()
        for i in range(len(reward_machines)):
            rm = reward_machines[i]
            num_states = len(rm.get_states())
            policy_name = "Reward_Machine_%d" % i
            mc = MetaController(sess, policy_name, options, option2file, rm,
                                use_rm, learning_params, num_features,
                                num_states, show_print)
            meta_controllers.append(mc)

        # initializing the bank of policies with one policy per option
        policy_bank = PolicyBankDQN(sess, num_actions, num_features,
                                    learning_params, options)

        # Task loop
        while not curriculum.stop_learning():
            if show_print:
                print("Current step:", curriculum.get_current_step(), "from",
                      curriculum.total_steps)
            rm_file = curriculum.get_next_task()

            # Running 'rm_file' for one episode
            run_hrl_baseline(sess, rm_file, meta_controllers, options,
                             policy_bank, tester, curriculum, replay_buffer,
                             beta_schedule, show_print)

        tf.reset_default_graph()
        sess.close()

        # Backing up the results
        saver.save_results()

    # Showing results
    tester.show_results()
    print("Time:", "%0.2f" % ((time.time() - time_init) / 60), "mins")