Example #1
def get_reward(
        shapes,
        params,
        env,
        ep_max_step,
        rand_param_obj,
        seed_and_id=None,
):
    # perturb parameters using seed
    if seed_and_id is not None:
        seed, k_id = seed_and_id
        np.random.seed(seed)
        # params += sign(k_id) * SIGMA * np.random.randn(params.size)
        params += sign(k_id) * SIGMA * rand_param_obj.params[k_id]
        # logger.debug(rand_param_obj.params)
    p = params_reshape(shapes, params)
    # run episode
    # s = env.reset()
    my_env = MazeEnv(env.maze)
    my_env.maze.door = (env.maze.door[0], env.maze.door[1])
    start = my_env.reset()
    # start = my_env.get_state()
    # logger.debug(["kid ",seed_and_id[1] if seed_and_id is not None else None,"door",my_env.maze.door,"env.door",env.maze.door,"start",start])
    ep_r = 0.
    for g in range(ep_max_step):
        s = my_env.get_state()
        a = get_action(p, s)
        s, r, done = my_env.step(a)
        ep_r += r
        if done:
            break
    return ep_r
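
The `sign(k_id)` call above implements mirrored sampling: each sampled noise vector is applied once with a positive and once with a negative sign, as the commented-out line hints. A minimal sketch of such a helper, assuming the usual convention that even and odd kid indices form the two halves of a mirrored pair (the exact parity convention is an assumption):

def sign(k_id):
    # Mirrored sampling: each noise vector is reused by a pair of kids,
    # one perturbed positively and one negatively.
    return -1.0 if k_id % 2 == 0 else 1.0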
Example #2
def check_if_solved(Q):
    env = MazeEnv(1)
    for i in range(14):  # the optimal solution takes 14 steps
        state = get_state_idx(env.state, env)
        action = get_eps_action(state, Q, env, 0)
        reward, done = env.step(action)
    return done
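
`get_state_idx` and `get_eps_action` are shared helpers in these tabular examples. A minimal sketch of what they typically look like, assuming `env.state` is an (x, y) grid position, Q is indexed as Q[state, action], and the flattening is row-major; the indexing order is an assumption, not taken from this listing:

import numpy as np

def get_state_idx(state, env):
    # Flatten an (x, y) grid position into a single row index of Q.
    x, y = state
    return y * env.width + x

def get_eps_action(state, Q, env, eps):
    # Epsilon-greedy: explore with probability eps, otherwise act greedily
    # (ties broken at random).
    if np.random.rand() < eps:
        return np.random.randint(len(env.actions))
    return int(np.random.choice(np.flatnonzero(Q[state] == Q[state].max())))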
Example #3
def plot_fig():
    env = MazeEnv(1)
    for n in [0, 5, 50]:
        env.np_random = np.random.RandomState(1234)  # set common seed
        plot_steps(n, env)
    plt.legend()
    plt.xlabel("Episodes")
    plt.ylabel("Steps")
    plt.show()
Example #4
def __init__(self, man_maze_path, work_maze_path, gx=None, gy=None):
    # creates the 2x2 maze from a saved file.
    twoXtwo = MazeEnv(maze_file=man_maze_path, gx=gx, gy=gy)
    # creates the goal mapping
    [gx, gy] = twoXtwo.maze_view.goal
    gx, gy = goal_match(gx, gy)
    # creates the 4x4 maze from a saved file. YOU WILL NEED TO CHANGE THIS LINK!!!
    fourXfour = MazeEnv(maze_file=work_maze_path, gx=gx, gy=gy)
    # set class variables
    self.mazes = [twoXtwo, fourXfour]
    self.action_space = 4
Example #5
def get_updates():
    # initialization
    env = MazeEnv(1)
    Q = np.zeros((env.width * env.height, len(env.actions)))
    model = {}
    pq = pQueue()
    steps = 0
    updates = 0

    while True:
        if updates % 100 == 0:
            if check_if_solved(Q):
                return updates

        steps += 1

        # step and record result
        state = get_state_idx(env.state, env)
        action = get_eps_action(state, Q, env, eps)
        reward, done = env.step(action)
        next_state = -1 if done else get_state_idx(env.state, env)
        model[(state, action)] = (reward, next_state)

        if done:
            steps = 0

        # calculate priority and put in queue
        target = reward
        if not done:
            max_action = get_eps_action(next_state, Q, env, 0)
            target += gamma * Q[next_state, max_action]
        priority = abs(target - Q[state, action])
        if priority > delta:
            pq.enqueue(priority, state, action)

        # sweep through update queue
        for i in range(n):
            if len(pq.queue) == 0:  # stop if the queue is empty
                break

            # one-step Q-learning update from the most urgent (s, a)
            _, s, a = pq.dequeue()
            r, sp = model[(s, a)]
            target = r
            if sp != -1:
                max_a = get_eps_action(sp, Q, env, 0)
                target += gamma * Q[sp, max_a]
            Q[s, a] += alpha * (target - Q[s, a])

            # increment count of updates
            updates += 1

            # add predecessors to queue
            add_pred(model, s, pq, Q, env)
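
The prioritized-sweeping loop relies on a small `pQueue` helper that always pops the (state, action) pair with the largest priority. A minimal sketch of one way to write it with heapq, matching the `enqueue`/`dequeue`/`queue` usage above; whether the original merges duplicate entries is unknown, so that detail is omitted:

import heapq

class pQueue:
    def __init__(self):
        self.queue = []  # heap of (-priority, state, action)

    def enqueue(self, priority, state, action):
        # heapq is a min-heap, so store negated priorities to pop the largest first.
        heapq.heappush(self.queue, (-priority, state, action))

    def dequeue(self):
        neg_priority, state, action = heapq.heappop(self.queue)
        return -neg_priority, state, action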
Example #6
def plot_reward(plus):
    reward_per_step = np.zeros(steps)
    env = MazeEnv(3)
    step_gen = dyna_q(n, env, plus, kappa=kappa, env_switch_to=(4, 1000))
    for step in range(steps):
        _, _, reward, _, _ = next(step_gen)
        reward_per_step[step] = reward
    return np.cumsum(reward_per_step)
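
The `plus` and `kappa` arguments select the Dyna-Q+ variant, which adds an exploration bonus for state-action pairs that have not been tried for a while. A sketch of how that bonus is usually applied during planning; the `tau` bookkeeping (steps since the pair was last visited) is an assumption about the internals of `dyna_q`, not code from this listing:

import numpy as np

def planning_reward(r, tau, kappa, plus):
    # Dyna-Q+: modelled reward plus kappa * sqrt(time since last visit),
    # which encourages re-exploring transitions that may have changed.
    return r + kappa * np.sqrt(tau) if plus else r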
Example #7
def train(matrix,
          epsilonDecay,
          max_episodes=2000,
          gamma=0.99,
          alpha=0.1,
          **kwargs):

    env = MazeEnv(matrix)

    q_table = {}

    rows, cols = env.maze.board.shape
    for i in range(rows):
        for j in range(cols):
            q_table[(i, j)] = np.array([-50, -50, -50, -50], dtype=float)

    epsilon = 1.0

    rew_n = []

    q_table_history = []
    epsilon_history = []
    rew_history = []

    for episode in range(max_episodes):

        rew = run_episode(env,
                          q_table,
                          epsilon,
                          gamma=gamma,
                          alpha=alpha,
                          test=False)

        rew_n.append(rew)

        epsilon *= epsilonDecay

        if episode % 20 == 0:
            print(
                f'episode: {episode} average rewards: {np.average(rew_n[-100:])}'
            )
            q_table_history.append(copy.deepcopy(q_table))
            epsilon_history.append(epsilon)
            rew_history.append(np.average(rew_n[-100:]))
            if abs(rew_history[-1] -
                   np.average(rew_history[-4:-1])) < 1 and episode > 100:
                break

    return q_table_history, epsilon_history, rew_n
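
`run_episode` does the epsilon-greedy interaction and the tabular Q-learning update for this trainer. A minimal sketch under the assumption that `env.reset()` returns an (i, j) position usable as a `q_table` key and `env.step(action)` returns `(next_state, reward, done)`; those API details are assumptions:

import numpy as np

def run_episode(env, q_table, epsilon, gamma=0.99, alpha=0.1, test=False):
    state = env.reset()
    total_reward = 0.0
    done = False
    while not done:
        # Epsilon-greedy over the four actions.
        if not test and np.random.rand() < epsilon:
            action = np.random.randint(4)
        else:
            action = int(np.argmax(q_table[state]))
        next_state, reward, done = env.step(action)
        total_reward += reward
        if not test:
            # One-step Q-learning update.
            target = reward if done else reward + gamma * np.max(q_table[next_state])
            q_table[state][action] += alpha * (target - q_table[state][action])
        state = next_state
    return total_reward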
Example #8
def get_dyna_updates():
    # initialization
    env = MazeEnv(1)
    Q = np.zeros((env.width * env.height, len(env.actions)))
    model = {}
    steps = 0
    updates = 0

    step_gen = dyna_q(n, env)
    while True:
        # step
        _, _, _, _, done = next(step_gen)
        steps += 1
        updates += 1

        # update counts
        if done:
            updates += 1
            if steps == 14:
                return updates
            steps = 0
        else:
            updates += n
Example #9
    # use utility instead of raw reward to update the parameters (rank transformation)
    # base = N_KID * 2  # *2 for mirrored sampling
    # rank = np.arange(1, base + 1)
    # util_ = np.maximum(0, np.log(base / 2 + 1) - np.log(rank))
    # utility = util_ / util_.sum() - 1 / base
    utility = gen_utility(N_KID)

    # training
    net_shapes, net_params = build_net()
    if os.path.exists(PARAMS_FILE_NAME):
        net_params = np.load(PARAMS_FILE_NAME)
    # env = gym.make(CONFIG['game']).unwrapped
    maze = None
    if os.path.exists("maze.json"):
        maze = Maze.reload("maze.json")
    env = MazeEnv(maze=maze)
    env.maze.save()

    optimizer = SGD(net_params, LR)
    optimizer.load()  # try to load previously saved optimizer parameters, if any
    pool = mp.Pool(processes=N_CORE)
    mar = None  # moving average reward
    param_rand_obj = ParamsRand((N_KID * 2, net_params.size))
    for g in range(N_GENERATION):
        t0 = time.time()
        net_params = dropout(net_params, 0.3)  # dropout
        net_params, kid_rewards = train(net_shapes, net_params, optimizer,
                                        utility, pool, param_rand_obj)

        # np.savetxt("params.csv", net_params, fmt="%0.3f", delimiter=',', newline='\n')
        np.save(PARAMS_FILE_NAME, net_params)
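
`gen_utility` computes the rank-transformation utilities spelled out in the commented-out lines at the top of this snippet; a minimal sketch that simply wraps that formula (only the wrapping is assumed):

import numpy as np

def gen_utility(n_kid):
    # Rank-based fitness shaping: higher-ranked kids get larger utilities,
    # and the utilities sum to zero across the 2 * n_kid population.
    base = n_kid * 2  # *2 for mirrored sampling
    rank = np.arange(1, base + 1)
    util_ = np.maximum(0, np.log(base / 2 + 1) - np.log(rank))
    return util_ / util_.sum() - 1 / base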
Example #10
                        state_values[i, j] = np.max(value_list)

            max_delta_value = abs(old_state_values - state_values).max()
            if max_delta_value < 1e-4:
                break

        # Output a deterministic policy
        for i in range(self.env.GRID_SIZE):
            for j in range(self.env.GRID_SIZE):
                if [i, j] not in self.env.terminate_space:
                    value_list = []
                    for action in range(self.env.nA):
                        [next_i, next_j], reward, done = self.env.step([i, j],
                                                                       action)
                        value = 0.1 * (reward + self.discount *
                                       state_values[next_i, next_j])
                        value_list.append(value)
                    self.pi[i, j] = np.argmax(value_list)
        print(self.pi)

    def initialization(self):
        state_values = np.zeros((self.env.GRID_SIZE, self.env.GRID_SIZE))
        self.pi = np.zeros((self.env.GRID_SIZE, self.env.GRID_SIZE),
                           dtype=int)  # Set initial policy to moving left
        return state_values


if __name__ == '__main__':
    env = MazeEnv()
    value_iteration = ValueIteration(env)
    value_iteration.run()
Example #11
from maze_env import MazeEnv
from dynamic_programming import policy_iteration, value_iteration, truncated_policy_iteration
from util import print_policy, print_path
import time

if __name__ == '__main__':
    env = MazeEnv(10, 10, 0.3)

    with open('mazefile', 'w') as f:
        f.write(str(env.maze))

    # obtain the optimal policy and optimal state-value function
    print('\n')
    LINELEN = 100
    print('\t\tValue Iteration')
    print('-' * LINELEN)

    start_time = time.time()
    policy_pi, V_pi = value_iteration(env, max_iter=100)
    end_time = time.time()

    # print the optimal policy
    print("Optimal Policy:", '\n')
    print_policy(policy_pi, env.nrow, env.ncol, env.maze)
    print_path(policy_pi, env.nrow, env.ncol, env.maze)
    print("Runtime: " + str(end_time - start_time))
    print('-' * LINELEN)

    print('\n')

    print('\t\tPolicy Iteration')
Example #12
    score_func = ranked_avg_knn_scores
    if parameters.score_func == "avg_knn_scores":
        score_func = avg_knn_scores

    knn = batch_knn
    if parameters.knn == "batch_count_scaled_knn":
        knn = batch_count_scaled_knn

    parameters.score_func = score_func
    parameters.knn = knn

    internal_dim = getattr(parameters, 'internal_dim', None)

    env = MazeEnv(rng=rng,
                  size_maze=parameters.size_maze,
                  intern_dim=internal_dim)

    dataset = None
    # reload dataset here if need be

    if hasattr(parameters, 'dataset_fname'
               ) and parameters.dataset_fname is not None and continue_running:
        from deer.helper.data import DataSet

        dataset = DataSet.load(parameters.dataset_fname)
        parameters.dataset_fname = None
        env = dataset._environment

    plotter = Plotter(parameters.experiment_dir,
                      env_name=h,
Example #13
                print("Episode %d timed out at %d with total reward = %f." %
                      (episode, t, total_reward))

        # It's considered done when it's solved over 120 times consecutively
        if num_streaks > STREAK_TO_END:
            break


# Initialize the "maze" environment
maze_path = None  # ToDo: replace this with the path to a maze file
maze_size = (4, 4)  # ToDo: instead of specifying a maze path, you can specify a maze size (x, y) and a random maze will be generated
gx = None  # ToDo: None places the goal at a random location; set the x coordinate of your desired goal to fix it
gy = None  # ToDo: None places the goal at a random location; set the y coordinate of your desired goal to fix it
env = MazeEnv(maze_file=maze_path, maze_size=maze_size, gx=gx, gy=gy)
'''
Defining the environment related constants
'''
# Number of discrete states (bucket) per state dimension
MAZE_SIZE = tuple((env.observation_space.high +
                   np.ones(env.observation_space.shape)).astype(int))
NUM_BUCKETS = MAZE_SIZE  # one bucket per grid

# Number of discrete actions
NUM_ACTIONS = env.action_space.n  # ["N", "S", "E", "W"]
# Bounds for each discrete state
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))
'''
Learning related constants
'''
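
With `NUM_BUCKETS` and `STATE_BOUNDS` defined above, the learning loop normally maps each observation to a discrete bucket index before touching the Q-table. A sketch of that mapping, assuming the conventional `state_to_bucket` helper used with this kind of gym maze setup (the helper name and clipping details are assumptions):

def state_to_bucket(state):
    # Clip each observation dimension to its bounds and scale it to a
    # bucket index in [0, NUM_BUCKETS[i] - 1].
    bucket_indices = []
    for i in range(len(state)):
        low, high = STATE_BOUNDS[i]
        if state[i] <= low:
            idx = 0
        elif state[i] >= high:
            idx = NUM_BUCKETS[i] - 1
        else:
            ratio = (state[i] - low) / (high - low)
            idx = int(round(ratio * (NUM_BUCKETS[i] - 1)))
        bucket_indices.append(idx)
    return tuple(bucket_indices)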
Example #14
import gym
import numpy as np
from maze_env import MazeEnv
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv

env = MazeEnv(grid_size=10)
env = Monitor(env, filename=None, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

# Train the agent
model = ACKTR('MlpPolicy', env, verbose=1)


def evaluate(model, num_episodes=100):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_episodes: (int) number of episodes to evaluate it
    :return: (float) Mean reward for the last num_episodes
    """
    # This function will only work for a single Environment
    env = model.get_env()
    all_episode_rewards = []
    for i in range(num_episodes):
        episode_rewards = []
        done = False
        obs = env.reset()
        while not done:
            # _states are only useful when using LSTM policies
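
The evaluation loop is cut off here; with stable_baselines the remainder typically continues as in the sketch below. The `model.predict` and vectorized `env.step` calls are the standard stable_baselines API, while the exact reward bookkeeping is an assumption:

            action, _states = model.predict(obs)
            # with DummyVecEnv, obs/reward/done come back as length-1 arrays
            obs, reward, done, info = env.step(action)
            episode_rewards.append(reward)
        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)
    return mean_episode_reward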
Example #15
def set_maze(config):
    # NOTE: `maze` is expected to be defined at module level here; the
    # `config` argument is not used by this snippet as written.
    mazename = maze
    global env
    env = MazeEnv(mazename)