def get_reward(
    shapes,
    params,
    env,
    ep_max_step,
    rand_param_obj,
    seed_and_id=None,
):
    # perturb parameters using seed
    if seed_and_id is not None:
        seed, k_id = seed_and_id
        np.random.seed(seed)
        # params += sign(k_id) * SIGMA * np.random.randn(params.size)
        params += sign(k_id) * SIGMA * rand_param_obj.params[k_id]
        # logger.debug(rand_param_obj.params)
    p = params_reshape(shapes, params)

    # run episode
    # s = env.reset()
    my_env = MazeEnv(env.maze)
    my_env.maze.door = (env.maze.door[0], env.maze.door[1])
    start = my_env.reset()
    # start = my_env.get_state()
    # logger.debug(["kid ", seed_and_id[1] if seed_and_id is not None else None, "door", my_env.maze.door, "env.door", env.maze.door, "start", start])
    ep_r = 0.
    for g in range(ep_max_step):
        s = my_env.get_state()
        a = get_action(p, s)
        s, r, done = my_env.step(a)
        ep_r += r
        if done:
            break
    return ep_r

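# A minimal sketch of the sign() helper assumed by get_reward() above for mirrored
# sampling. This is an assumption, not the original implementation: even kid ids are
# taken to apply the perturbation with a negative sign and odd kid ids with a positive
# sign, so each sampled noise vector is evaluated in both directions.
def sign(k_id):
    return -1.0 if k_id % 2 == 0 else 1.0
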
def check_if_solved(Q):
    env = MazeEnv(1)
    for i in range(14):  # optimal solution
        state = get_state_idx(env.state, env)
        action = get_eps_action(state, Q, env, 0)
        reward, done = env.step(action)
        if done:
            return True
    return False

def plot_fig():
    env = MazeEnv(1)
    for n in [0, 5, 50]:
        env.np_random = np.random.RandomState(1234)  # set common seed
        plot_steps(n, env)
    plt.legend()
    plt.xlabel("Episodes")
    plt.ylabel("Steps")
    plt.show()

def __init__(self, man_maze_path, work_maze_path, gx=None, gy=None):
    # creates the 2x2 maze from a saved file.
    twoXtwo = MazeEnv(maze_file=man_maze_path, gx=gx, gy=gy)

    # creates the goal mapping
    [gx, gy] = twoXtwo.maze_view.goal
    gx, gy = goal_match(gx, gy)

    # creates the 4x4 maze from a saved file. YOU WILL NEED TO CHANGE THIS LINK!!!
    fourXfour = MazeEnv(maze_file=work_maze_path, gx=gx, gy=gy)

    # set class variables
    self.mazes = [twoXtwo, fourXfour]
    self.action_space = 4

def get_updates():
    # initialization
    env = MazeEnv(1)
    Q = np.zeros((env.width * env.height, len(env.actions)))
    model = {}
    pq = pQueue()
    steps = 0
    updates = 0

    while True:
        if updates % 100 == 0:
            if check_if_solved(Q):
                return updates

        steps += 1

        # step and record result
        state = get_state_idx(env.state, env)
        action = get_eps_action(state, Q, env, eps)
        reward, done = env.step(action)
        next_state = -1 if done else get_state_idx(env.state, env)
        model[(state, action)] = (reward, next_state)
        if done:
            steps = 0

        # calculate priority and put in queue
        target = reward
        if not done:
            max_action = get_eps_action(next_state, Q, env, 0)
            target += gamma * Q[next_state, max_action]
        priority = abs(target - Q[state, action])
        if priority > delta:
            pq.enqueue(priority, state, action)

        # sweep through update queue
        for i in range(n):
            if len(pq.queue) == 0:  # stop if the queue is empty
                break

            # one step qlearn from most urgent (s, a)
            _, s, a = pq.dequeue()
            r, sp = model[(s, a)]
            target = r
            if sp != -1:
                max_a = get_eps_action(sp, Q, env, 0)
                target += gamma * Q[sp, max_a]
            Q[s, a] += alpha * (target - Q[s, a])

            # increment count of updates
            updates += 1

            # add predecessors to queue
            add_pred(model, s, pq, Q, env)

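# A minimal sketch of the pQueue helper assumed by get_updates() above: a max-priority
# queue over (state, action) pairs, built on heapq by storing negated priorities. Only
# the interface used above (enqueue/dequeue/queue) is inferred; the real implementation
# may also deduplicate or re-prioritise entries that are already queued.
import heapq


class pQueue:
    def __init__(self):
        self.queue = []

    def enqueue(self, priority, state, action):
        # heapq is a min-heap, so negate the priority to pop the largest first
        heapq.heappush(self.queue, (-priority, state, action))

    def dequeue(self):
        neg_priority, state, action = heapq.heappop(self.queue)
        return -neg_priority, state, action
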
def plot_reward(plus):
    reward_per_step = np.zeros(steps)
    env = MazeEnv(3)
    step_gen = dyna_q(n, env, plus, kappa=kappa, env_switch_to=(4, 1000))
    for step in range(steps):
        _, _, reward, _, _ = next(step_gen)
        reward_per_step[step] = reward
    return np.cumsum(reward_per_step)

def train(matrix, epsilonDecay, max_episodes=2000, gamma=0.99, alpha=0.1, **kwargs):
    env = MazeEnv(matrix)
    q_table = {}
    rows, cols = env.maze.board.shape
    for i in range(rows):
        for j in range(cols):
            q_table[(i, j)] = np.array([-50, -50, -50, -50], np.float64)
    epsilon = 1.0
    rew_n = []
    q_table_history = []
    epsilon_history = []
    rew_history = []
    for episode in range(max_episodes):
        rew = run_episode(env, q_table, epsilon, gamma=gamma, alpha=alpha, test=False)
        rew_n.append(rew)
        epsilon *= epsilonDecay
        if episode % 20 == 0:
            print(f'episode: {episode} average rewards: {np.average(rew_n[-100:])}')
            q_table_history.append(copy.deepcopy(q_table))
            epsilon_history.append(epsilon)
            rew_history.append(np.average(rew_n[-100:]))
            if abs(rew_history[-1] - np.average(rew_history[-4:-1])) < 1 and episode > 100:
                break
    return q_table_history, epsilon_history, rew_n

def get_dyna_updates():
    # initialization
    env = MazeEnv(1)
    Q = np.zeros((env.width * env.height, len(env.actions)))
    model = {}
    steps = 0
    updates = 0
    step_gen = dyna_q(n, env)

    while True:
        # step
        _, _, _, _, done = next(step_gen)
        steps += 1
        updates += 1

        # update counts
        if done:
            updates += 1
            if steps == 14:
                return updates
            steps = 0
        else:
            updates += n

# use utility instead of raw reward to update parameters (rank transformation)
# base = N_KID * 2  # *2 for mirrored sampling
# rank = np.arange(1, base + 1)
# util_ = np.maximum(0, np.log(base / 2 + 1) - np.log(rank))
# utility = util_ / util_.sum() - 1 / base
utility = gen_utility(N_KID)

# training
net_shapes, net_params = build_net()
if os.path.exists(PARAMS_FILE_NAME):
    net_params = np.load(PARAMS_FILE_NAME)

# env = gym.make(CONFIG['game']).unwrapped
maze = None
if os.path.exists("maze.json"):
    maze = Maze.reload("maze.json")
env = MazeEnv(maze=maze)
env.maze.save()

optimizer = SGD(net_params, LR)
optimizer.load()  # try to load previously saved optimizer state
pool = mp.Pool(processes=N_CORE)
mar = None  # moving average reward
param_rand_obj = ParamsRand((N_KID * 2, net_params.size))

for g in range(N_GENERATION):
    t0 = time.time()
    net_params = dropout(net_params, 0.3)  # dropout
    net_params, kid_rewards = train(net_shapes, net_params, optimizer, utility, pool, param_rand_obj)
    # np.savetxt("params.csv", net_params, fmt="%0.3f", delimiter=',', newline='\n')
    np.save(PARAMS_FILE_NAME, net_params)

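# A minimal sketch of what gen_utility(N_KID) presumably returns, reconstructed from
# the commented-out rank-transformation formula above. This is an assumption that the
# helper simply wraps that code (and that numpy is already imported as np, as in the
# surrounding script); the real helper may differ.
def gen_utility(n_kid):
    base = n_kid * 2  # *2 for mirrored sampling
    rank = np.arange(1, base + 1)
    util_ = np.maximum(0, np.log(base / 2 + 1) - np.log(rank))
    return util_ / util_.sum() - 1 / base
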
                    state_values[i, j] = np.max(value_list)

            max_delta_value = abs(old_state_values - state_values).max()
            if max_delta_value < 1e-4:
                break

        # Output a deterministic policy
        for i in range(self.env.GRID_SIZE):
            for j in range(self.env.GRID_SIZE):
                if [i, j] not in self.env.terminate_space:
                    value_list = []
                    for action in range(self.env.nA):
                        [next_i, next_j], reward, done = self.env.step([i, j], action)
                        value = 0.1 * (reward + self.discount * state_values[next_i, next_j])
                        value_list.append(value)
                    self.pi[i, j] = np.argmax(value_list)
        print(self.pi)

    def initialization(self):
        state_values = np.zeros((self.env.GRID_SIZE, self.env.GRID_SIZE))
        self.pi = np.zeros((self.env.GRID_SIZE, self.env.GRID_SIZE), dtype=int)  # Set initial policy to moving left
        return state_values


if __name__ == '__main__':
    env = MazeEnv()
    value_iteration = ValueIteration(env)
    value_iteration.run()

from maze_env import MazeEnv
from dynamic_programming import policy_iteration, value_iteration, truncated_policy_iteration
from util import print_policy, print_path
import time

if __name__ == '__main__':
    env = MazeEnv(10, 10, 0.3)

    with open('mazefile', 'w') as f:
        f.write(str(env.maze))

    # obtain the optimal policy and optimal state-value function
    print('\n')
    LINELEN = 100
    print('\t\tValue Iteration')
    print('-' * LINELEN)
    start_time = time.time()
    policy_pi, V_pi = value_iteration(env, max_iter=100)
    end_time = time.time()

    # print the optimal policy
    print("Optimal Policy:", '\n')
    print_policy(policy_pi, env.nrow, env.ncol, env.maze)
    print_path(policy_pi, env.nrow, env.ncol, env.maze)
    print("Runtime: " + str(end_time - start_time))
    print('-' * LINELEN)

    print('\n')
    print('\t\tPolicy Iteration')

score_func = ranked_avg_knn_scores
if parameters.score_func == "avg_knn_scores":
    score_func = avg_knn_scores

knn = batch_knn
if parameters.knn == "batch_count_scaled_knn":
    knn = batch_count_scaled_knn

parameters.score_func = score_func
parameters.knn = knn

internal_dim = None if not hasattr(parameters, 'internal_dim') else parameters.internal_dim

env = MazeEnv(rng=rng,
              size_maze=parameters.size_maze,
              intern_dim=internal_dim)

dataset = None
# reload dataset here if need be
if hasattr(parameters, 'dataset_fname') and parameters.dataset_fname is not None and continue_running:
    from deer.helper.data import DataSet
    dataset = DataSet.load(parameters.dataset_fname)
    parameters.dataset_fname = None
    env = dataset._environment

plotter = Plotter(parameters.experiment_dir,
                  env_name=h,

print("Episode %d timed out at %d with total reward = %f." % (episode, t, total_reward)) # It's considered done when it's solved over 120 times consecutively if num_streaks > STREAK_TO_END: break # Initialize the "maze" environment maze_path = None #ToDo: replace this with the path to a maze file maze_size = ( 4, 4 ) #ToDo: instead of specifying a maze path, you can specify a maze size (x,y) and a random maze will be generated gx = None #ToDo: None makes the goal in a random place. Change this to the x coordinate of your desired goal in the maze to have it fixed gy = None #ToDo: None makes the goal in a random place. Change this to the y coordinate of your desired goal in the maze to have it fixed env = MazeEnv(maze_file=maze_path, maze_size=maze_size, gx=gx, gy=gy) ''' Defining the environment related constants ''' # Number of discrete states (bucket) per state dimension MAZE_SIZE = tuple((env.observation_space.high + np.ones(env.observation_space.shape)).astype(int)) NUM_BUCKETS = MAZE_SIZE # one bucket per grid # Number of discrete actions NUM_ACTIONS = env.action_space.n # ["N", "S", "E", "W"] # Bounds for each discrete state STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high)) ''' Learning related constants '''
import gym
import numpy as np

from maze_env import MazeEnv

from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv

env = MazeEnv(grid_size=10)
env = Monitor(env, filename=None, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

# Train the agent
model = ACKTR('MlpPolicy', env, verbose=1)


def evaluate(model, num_episodes=100):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_episodes: (int) number of episodes to evaluate it
    :return: (float) Mean reward for the last num_episodes
    """
    # This function will only work for a single Environment
    env = model.get_env()
    all_episode_rewards = []
    for i in range(num_episodes):
        episode_rewards = []
        done = False
        obs = env.reset()
        while not done:
            # _states are only useful when using LSTM policies

def set_maze(config):
    mazename = maze
    global env
    env = MazeEnv(mazename)