def main(start, ends, goal, n_height, n_width, obstacles, weight_file):
    # build the environment
    env = maze_grid(start, ends, n_height, n_width, obstacles)  # maze
    env.render()
    # build the DQN model and load the trained weights
    input_dim = env.observation_space.n
    hidden_dim = 256
    output_dim = env.action_space.n
    model = dqn.dqn_model(input_dim, hidden_dim, output_dim)
    model.load_weights(weight_file)
    print('Goal', goal)
    # show the demo: greedily follow the learned policy
    states = list()
    env.reset(states, show=goal)
    s0 = np.reshape(states, [1, input_dim])
    total_reward = 0
    is_done = False
    steps = 0
    while not is_done:
        a0 = int(np.argmax(model.predict(s0)[0]))  # greedy action
        s1, r1, is_done, info = env.step(a0)
        env.render()
        time.sleep(0.3)
        s0 = np.reshape(s1, [1, input_dim])
        total_reward += r1
        steps += 1
    print('The reward: %d, the agent path distance is %d steps.'
          % (total_reward, steps))
    time.sleep(1)
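# Usage sketch for the demo function above. This is a minimal, illustrative
# call: the grid size, cells, and weight-file name are placeholder values,
# not settings taken from the repository.
if __name__ == '__main__':
    demo_start = (0, 0)                      # hypothetical start cell
    demo_ends = [(8, 8)]                     # hypothetical goal list
    main(demo_start, demo_ends, goal=demo_ends[0],
         n_height=10, n_width=10,
         obstacles=[(4, 4), (4, 5)],         # hypothetical walls
         weight_file='dqn_maze_demo.h5')     # hypothetical weight file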
def train_agent(start, ends, n_height, n_width, obstacles, model_file=None):
    # build the two environments, one per obstacle layout
    env_1 = maze_grid(start, ends, n_height, n_width, obstacles[0])  # maze
    env_2 = maze_grid(start, ends, n_height, n_width, obstacles[1])
    # q-learning agent
    agent = QAgent(
        envs=[env_1, env_2],
        memory_capacity=100 * env_1.maze_size,  # experience memory
        hidden_dim=256,
        model_file=model_file)
    print("Learning...")
    agent.learning(max_episodes=500, batch_size=512, gamma=0.9, min_epsilon=0.01)
def main(starts, end, n_height, n_width, obstacles, weight_file):
    # --- env
    start = starts[0]  # choose the start
    env = maze_grid(starts, end, n_height, n_width, obstacles)  # maze
    env.render()
    # --- model: goal branch takes (3, 3, 1), agent branch takes (5, 5, 3)
    model = dqn.dqn_model((3, 3, 1), (5, 5, 3))
    model.load_weights(weight_file)  # load the trained weights
    # --- initial agent states: a queue of the 3 most recent observations
    agent_init_states = np.ones((5, 5, 1))
    agent_3_states = deque(maxlen=3)
    for _ in range(3):
        agent_3_states.append(agent_init_states)
    goal_states, agent_states = env.reset(start, test=True)  # initial states
    # --- observe the new states
    goal_states = np.reshape(goal_states, [3, 3, 1])
    goal_states = goal_states[np.newaxis, :]
    agent_states = np.reshape(agent_states, [5, 5, 1])
    agent_3_states.append(agent_states)
    # stack the frames: three [5, 5, 1] --> [1, 5, 5, 3]
    agent_s0 = np.concatenate(
        (agent_3_states[0], agent_3_states[1], agent_3_states[2]), axis=2)
    agent_s0 = agent_s0[np.newaxis, :]
    # --- show the demo
    total_reward = 0
    is_done = False
    steps = 0
    while True:
        s0 = [goal_states, agent_s0]
        a0 = int(np.argmax(model.predict(s0)[0]))  # greedy action
        s1, r1, is_done, info = env.step(a0)
        env.render()
        steps += 1
        total_reward += r1
        if is_done:
            print('Arrived at the goal ...')
            break
        time.sleep(0.3)
        # update the stacked states with the newest observation
        s1 = np.reshape(s1, [5, 5, 1])
        agent_3_states.append(s1)
        agent_s1 = np.concatenate(
            (agent_3_states[0], agent_3_states[1], agent_3_states[2]), axis=2)
        agent_s0 = agent_s1[np.newaxis, :]
    print('The reward: %d, the agent path distance is %d steps.'
          % (total_reward, steps))
    time.sleep(1)
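# The frame-stacking step above can be isolated for clarity. A minimal
# NumPy-only sketch of the same "three [5, 5, 1] frames --> [1, 5, 5, 3]"
# transform, independent of the environment and model:
from collections import deque

import numpy as np

frames = deque(maxlen=3)  # the 3 most recent observations
for _ in range(3):
    frames.append(np.ones((5, 5, 1)))
# concatenate along the channel axis, then add a batch dimension
stacked = np.concatenate((frames[0], frames[1], frames[2]), axis=2)
batched = stacked[np.newaxis, :]
assert batched.shape == (1, 5, 5, 3)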
def train_agent(start, ends, n_height, n_width, obstacles, model_file=None):
    # build the environment
    env = maze_grid(start, ends, n_height, n_width, obstacles)  # maze
    env.render()
    # q-learning agent
    agent = QAgent(
        env,
        memory_capacity=100 * env.maze_size,  # experience memory
        hidden_dim=256,
        model_file=model_file)
    print("Learning...")
    agent.learning(max_episodes=3000, batch_size=512, gamma=0.9, min_epsilon=0.01)
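# Usage sketch for the training function above. The start and goal cells
# are borrowed from a commented-out instance in the evaluation script
# further below; the grid size and obstacle list are illustrative
# assumptions, not the repository's actual configuration.
if __name__ == '__main__':
    train_agent(start=(2, 9), ends=[(5, 4)],
                n_height=10, n_width=10,
                obstacles=[(4, 2), (4, 3), (4, 4)],  # assumed subset of walls
                model_file=None)                     # None: train from scratch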
def train_agent(start, ends, obstacles, load_model=False):
    env = maze_grid(start, ends, obstacles)  # maze
    env.render()
    agent = QAgent(
        env,
        memory_capacity=100 * env.maze_size,  # experience memory
        hidden_dim=100,
        model_file=load_model)
    env.reset()
    print("Learning...")
    agent.learning(max_episodes=300, batch_size=512, gamma=0.9, min_epsilon=0.01)
def main(start, ends, n_height, n_width, obstacles):
    # build the environment
    env = maze_grid(start, ends, n_height, n_width, obstacles)  # maze
    env.render()
    # build the DQN model and load the trained weights
    input_dim = env.observation_space.n
    hidden_dim = 256
    output_dim = env.action_space.n
    model = dqn.dqn_model(input_dim, hidden_dim, output_dim)
    weight_file = 'single_agent_2_goals.h5'
    model.load_weights(weight_file)
    reward_record = dict()
    for i, goal in enumerate(ends):
        each_goal_record = list()
        print('Goal', goal)
        # run the demo 10 times per goal
        for _ in range(10):
            states = list()
            env.reset(states, show=goal)
            env.render()
            s0 = np.reshape(states, [1, input_dim])
            total_reward = 0
            is_done = False
            steps = 0
            while not is_done:
                a0 = int(np.argmax(model.predict(s0)[0]))  # greedy action
                s1, r1, is_done, info = env.step(a0)
                env.render()
                time.sleep(0.1)
                s0 = np.reshape(s1, [1, input_dim])
                total_reward += r1
                steps += 1
            print('The reward: %d, the agent path distance is %d steps.'
                  % (total_reward, steps))
            each_goal_record.append(total_reward)
            time.sleep(0.5)
        print('10-run mean reward: %f rewards: %s'
              % (np.mean(each_goal_record), each_goal_record))
        reward_record[i + 1] = each_goal_record
    print(reward_record)
def train_agent(starts, end, n_height, n_width, obstacles, model_file=None):
    """Build the environment and train the agents.

    @param starts: list
    @param end: tuple
    @param n_height: int
    @param n_width: int
    @param obstacles: list
    @param model_file: str, path to the model weights
    """
    env = maze_grid(starts, end, n_height, n_width, obstacles)  # maze
    env.render()
    # q-learning agent
    agents = QAgent(
        env,
        memory_capacity=50 * env.maze_size,  # experience memory
        model_file=model_file)
    logging.info("Learning...")
    agents.learning(max_episodes=1000, batch_size=512, gamma=0.95, min_epsilon=0.01)
# choose one of the instances below
# start = (2, 9)
# ends = [(5, 4)]  # multiple ends
# instance 1
# obstacles = [(4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (1, 7), (2, 7), (3, 7), (4, 7), (6, 7), (7, 7), (8, 7)]
# instance 2
# obstacles = [(4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7), (8, 7)]
# difficult instance
start = (5, 9)
ends = [(4, 5)]
obstacles = [(2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7),
             (3, 5), (6, 5), (2, 3), (3, 3), (6, 3), (7, 3)]

env = maze_grid(start, ends, obstacles)
model = load_model('./model3/dqn_maze_300.h5')

score_list = list()
episode = 10  # run 10 times
for i in range(episode):
    state = env.reset()
    score = 0
    while True:
        env.render()
        time.sleep(0.05)
        action = np.argmax(model.predict(np.array([state]))[0])  # greedy action
        state, reward, done, _ = env.step(action)
        score += reward
        if done:  # episode finished: record the score
            score_list.append(score)
            print('episode: %d, score: %d' % (i, score))
            break
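# A short summary after the evaluation loop may be useful; this sketch
# assumes score_list has been filled by the loop above.
print('mean score over %d episodes: %.2f' % (episode, np.mean(score_list)))
print('all scores:', score_list)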