Example #1
    def test_add_new_walls(self):
        wall_list = induction.get_all_walls(env)
        agent_position = env.unwrapped.observer.get_observation()["position"]
        helper.silentremove(BASE_DIR, "background_test2.lp")
        backgroundfile = os.path.join(BASE_DIR, "background_test2.lp")

        is_new_wall = abduction.add_new_walls(agent_position, wall_list,
                                              backgroundfile)
        self.assertTrue(is_new_wall)
        is_new_wall2 = abduction.add_new_walls(agent_position, wall_list,
                                               backgroundfile)
        self.assertFalse(is_new_wall2)
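Taken together, the two assertions say that add_new_walls writes the walls around the agent into the background file on the first call (returning True) and, on the second call with the same walls, finds them already recorded (returning False).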
Example #2
    def test_send_state_transition_pos(self):
        previous_state = [1, 1]
        next_state = [2, 1]
        action = "right"
        wall_list = induction.get_all_walls(env)
        lasfile = os.path.join(BASE_DIR, "las_test2.las")
        helper.silentremove(BASE_DIR, "las_test2.las")
        background = os.path.join(BASE_DIR, "background_test4.lp")
        helper.silentremove(BASE_DIR, "background_test4.lp")
        induction.send_state_transition_pos(previous_state, next_state, action,
                                            wall_list, lasfile, background)
        size_las = os.stat(lasfile).st_size
        size_background = os.stat(background).st_size
        self.assertGreater(size_las, 0)
        self.assertEqual(size_background, 0)
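The two size checks assert that send_state_transition_pos writes the positive example into the .las file (it ends up non-empty) while the background file it creates alongside stays empty.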
Example #3
    def test_add_surrounding_walls(self):
        walls = induction.get_all_walls(env)
        surrounding = induction.add_surrounding_walls(1, 1, walls)
        self.assertEqual(surrounding,
                         "wall((1, 2)). wall((0, 1)). wall((1, 0)). ")
Example #4
    def test_get_all_walls(self):
        walls = induction.get_all_walls(env)
        self.assertEqual(len(walls), 12)
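The hard-coded count makes this as much a sanity check on the module-level env as on get_all_walls: the test level is expected to contain exactly 12 wall cells.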
Example #5
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.01,
               epsilon=0.1, epsilon_decay=1.0):
    """Q-learning with a linear value-function approximator.

    Args:
        env: the game environment to train on
        num_episodes: number of episodes to run
        discount_factor: discount for future rewards (gamma)
        alpha: TD learning rate
        epsilon: probability of taking a random action (used by the
            commented-out epsilon-greedy policy below)
        epsilon_decay: per-episode decay factor for epsilon
    """
    height = env.unwrapped.game.height
    width = env.unwrapped.game.width

    wall_list = induction.get_all_walls(env)
    
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    
    # 4 actions + 2 for X and Y + 4 surroundings
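    # Weight layout: weights[0:4] are per-action biases, weights[4:6]
    # weight the normalised (x, y) position, and weights[6:10] weight the
    # up/down/right/left wall indicators.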
    weights = np.random.rand(10)

    for i_episode in range(num_episodes):
        print("------------------------------")

        # The policy we're following
        # policy = make_epsilon_greedy_policy(
        #     epsilon * epsilon_decay**i_episode, env.action_space.n)
        
        # Print out which episode we're on and the reward for the last
        # episode, useful for debugging
        last_reward = stats.episode_rewards[i_episode - 1]
        print("Episode {}/{} (last episode reward: {})".format(
            i_episode + 1, num_episodes, last_reward))
        sys.stdout.flush()
        
        # Reset the env and pick the first action
        previous_state = env.reset()
        
        action_probs = np.ones(4, dtype=float)
        for t in range(TIME_RANGE):
            env.render()
            # time.sleep(0.1)
            # Take a step
            # action_probs = policy(state_int, i_episode)
            
            up_wall, down_wall, right_wall, left_wall = \
                helper.check_surrounding_walls(int(previous_state[0]),
                                               int(previous_state[1]),
                                               wall_list)

            normalised_x = int(previous_state[0]) / int(width)
            normalised_y = int(previous_state[1]) / int(height)

            # Linear score for each action: per-action bias plus the shared
            # state features
            for i in range(4):
                action_probs[i] = (weights[i]
                                   + normalised_x * weights[4]
                                   + normalised_y * weights[5]
                                   + int(up_wall) * weights[6]
                                   + int(down_wall) * weights[7]
                                   + int(right_wall) * weights[8]
                                   + int(left_wall) * weights[9])
            action = np.argmax(action_probs)
            print("action ", action)
            # action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            # action = env.action_space.sample()

            # 0: UP
            # 1: DOWN
            # 2: LEFT
            # 3: RIGHT

            next_state, reward, done, _ = env.step(action)
            if done:
                reward = 100
            else:
                reward = reward - 1
            
            # Update stats
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            
            # TD update with a linear approximator: v(s, a) = w . phi(s, a),
            # so the gradient of v with respect to the weights is the
            # feature vector phi itself
            features = np.zeros(10)
            features[action] = 1.0
            features[4] = normalised_x
            features[5] = normalised_y
            features[6:10] = [int(up_wall), int(down_wall),
                              int(right_wall), int(left_wall)]
            v_now = np.dot(weights, features)

            normalised_next_x = int(next_state[0]) / int(width)
            normalised_next_y = int(next_state[1]) / int(height)
            up_wall_next, down_wall_next, right_wall_next, left_wall_next = \
                helper.check_surrounding_walls(int(next_state[0]),
                                               int(next_state[1]), wall_list)
            features_next = np.zeros(10)
            features_next[action] = 1.0
            features_next[4] = normalised_next_x
            features_next[5] = normalised_next_y
            features_next[6:10] = [int(up_wall_next), int(down_wall_next),
                                   int(right_wall_next), int(left_wall_next)]
            v_next = np.dot(weights, features_next)

            # Semi-gradient TD(0) step: move the weights along the feature
            # vector, scaled by the TD error
            td_error = reward + discount_factor * v_next - v_now
            weights = weights + alpha * td_error * features
            print("weights", weights)
            if math.isnan(weights[0]):
                import ipdb; ipdb.set_trace()
            previous_state = next_state
            
            if done:
                break

        # run_experiment(env,state_int, Q, stats_test, i_episode, width, TIME_RANGE)

    return weights, stats
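
A minimal sketch of how this function might be driven; make_env stands in for whatever constructs the game environment in the surrounding project and is an assumption here, not part of the example above:

# Hypothetical driver; make_env is an assumed stand-in for the
# project's environment constructor
env = make_env()
weights, stats = q_learning(env, num_episodes=100, alpha=0.01)
print("reward in final episode:", stats.episode_rewards[-1])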
Example #6
def k_learning(env,
               num_episodes,
               h,
               goal,
               epsilon=0.1,
               record_prefix=None,
               is_link=False):
    # Get cell range for the game
    height = env.unwrapped.game.height
    width = env.unwrapped.game.width
    cell_range = "\ncell((0..{}, 0..{})).\n".format(width - 1, height - 1)

    # Log everything and keep the record here
    log_dir = None
    if record_prefix:
        log_dir = os.path.join(cf.BASE_DIR, "log")
        log_dir = helper.gen_log_dir(log_dir, record_prefix)

    # The first abduction needs lots of basic information
    first_abduction = False

    keep_link = None

    # Clean up all the files first
    helper.silentremove(cf.BASE_DIR, cf.GROUNDING)
    helper.silentremove(cf.BASE_DIR, cf.LASFILE)
    helper.silentremove(cf.BASE_DIR, cf.CLINGOFILE)
    helper.silentremove(cf.BASE_DIR, cf.LAS_CACHE, cf.LAS_CACHE_PATH)
    helper.create_file(cf.BASE_DIR, cf.LAS_CACHE, cf.LAS_CACHE_PATH)
    cf.ALREADY_LINK = False
    # Copy the positive examples that were used in TL before
    tl_file = os.path.join(cf.BASE_DIR, "tl_pos.las")
    helper.copy_file(tl_file, cf.LASFILE)
    # Add mode bias and adjacent definition for ILASP
    induction.copy_las_base(height, width, cf.LASFILE, is_link)

    # record the current hypothesis
    hypothesis = h
    abduction.make_lp_base(cell_range)

    wall_list = induction.get_all_walls(env)

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  episode_runtime=np.zeros(num_episodes))

    stats_ilasp = plotting.TimeStats(ILASP_runtime=np.zeros((num_episodes,
                                                             cf.TIME_RANGE)))

    stats_test = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                       episode_rewards=np.zeros(num_episodes),
                                       episode_runtime=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        print("==============NEW EPISODE======================")
        print("i_episode ", i_episode)
        start_total_runtime = time.time()

        previous_state = env.reset()
        agent_position = env.unwrapped.observer.get_observation()["position"]
        env.render()
        previous_state_at = py_asp.state_at(previous_state[0],
                                            previous_state[1], 0)

        t = 0
        # Once the agent reaches the goal, the algorithm kicks in
        # Decaying epsilon-greedy params (decay currently disabled)
        # new_epsilon = epsilon*(1/(i_episode+1)**cf.DECAY_PARAM)
        new_epsilon = epsilon
        print("new_epsilon ", new_epsilon)

        while t < cf.TIME_RANGE:
            if not first_abduction:
                # Convert syntax of H for ASP solver
                hypothesis_asp = py_asp.convert_las_asp(hypothesis)
                abduction.add_hypothesis(hypothesis_asp)
                abduction.add_start_state(agent_position)
                abduction.add_goal_state(goal)
                first_abduction = True

            # Update the starting position for Clingo
            agent_position = \
                env.unwrapped.observer.get_observation()["position"]
            abduction.update_agent_position(agent_position, t)
            abduction.update_time_range(agent_position, t)

            # Run clingo to get a plan
            answer_sets = abduction.run_clingo(cf.CLINGOFILE)
            states_plan, actions_array = abduction.sort_planning(answer_sets)

            # Record clingo
            if record_prefix:
                inputfile = os.path.join(cf.BASE_DIR, cf.CLINGOFILE)
                helper.log_asp(inputfile, answer_sets, log_dir, i_episode, t)

            # Execute the planning
            for action_index, action in enumerate(actions_array):
                print("---------Planning phase---------------------")

                # Flip a coin. If threshold < epsilon, explore randomly
                threshold = random.uniform(0, 1)
                if threshold < new_epsilon:
                    action_int = randint(0, 3)
                    if cf.IS_PRINT:
                        print("Taking a pure random action...",
                              helper.convert_action(action_int))
                else:
                    # Following the plan
                    action_int = helper.get_action(action[1])
                    if cf.IS_PRINT:
                        print("Following the plan...",
                              helper.convert_action(action_int))
                action_string = helper.convert_action(action_int)
                next_state, reward, done, _ = env.step(action_int)
                next_state_at = py_asp.state_at(next_state[0], next_state[1],
                                                t + 1)

                if done:
                    reward = reward + 10
                else:
                    reward = reward - 1

                # Meanwhile, accumulate all background knowledge
                abduction.add_new_walls(previous_state, wall_list,
                                        cf.CLINGOFILE)

                # Build the ASP representation of this state transition
                pos1, pos2, link = induction.generate_pos(
                    hypothesis, previous_state, next_state, action_string,
                    wall_list, cell_range)

                if link is not None:
                    keep_link = link
                # Update H if either positive example is not covered
                if (not induction.check_ILASP_cover(hypothesis, pos1, height,
                                                    width, keep_link)
                        or not induction.check_ILASP_cover(hypothesis, pos2,
                                                           height, width,
                                                           keep_link)):
                    start_time = time.time()
                    hypothesis = induction.run_ILASP(cf.LASFILE, cf.CACHE_DIR)
                    ilasp_runtime = (time.time() - start_time)
                    stats_ilasp.ILASP_runtime[i_episode, t] += ilasp_runtime
                    # Convert syntax of H for ASP solver
                    hypothesis_asp = py_asp.convert_las_asp(hypothesis)
                    abduction.update_h(hypothesis_asp)
                    if record_prefix:
                        inputfile = os.path.join(cf.BASE_DIR, cf.LASFILE)
                        helper.log_las(inputfile, hypothesis, log_dir,
                                       i_episode, t)

                previous_state = next_state
                previous_state_at = next_state_at

                # Update stats
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = action_index

                env.render()
                # time.sleep(0.1)
                t = t + 1

                if done or (threshold < new_epsilon):
                    break

            # If Clingo produced no plan, still advance time so the
            # while loop terminates
            if not actions_array:
                t = t + 1

            if done:
                break

        stats.episode_runtime[i_episode] += (time.time() - start_total_runtime)
        run_experiment(env, i_episode, stats_test, width, cf.TIME_RANGE)

    return stats, stats_test, stats_ilasp
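
A minimal sketch of how k_learning might be invoked; make_env, the initial hypothesis string, and the goal coordinates are assumptions for illustration, not part of the example above:

# Hypothetical driver; make_env, h0 and goal are assumed stand-ins
env = make_env()
h0 = ""          # assumed initial hypothesis, empty to start
goal = (5, 5)    # assumed goal cell the abduction should plan towards
stats, stats_test, stats_ilasp = k_learning(env, num_episodes=50, h=h0,
                                            goal=goal, epsilon=0.1,
                                            record_prefix="run1")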