# Shared imports for the training-loop snippets below.
import collections
import random
import shutil
from pathlib import Path

import gym
import numpy as np
import tensorflow as tf


def qtrain(model, grid, **opt):
    global epsilon
    n_epoch = opt.get('epochs', 1001)
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)
    actions = ['Left', 'Up', 'Right', 'Down']
    qgrid = Qgrid(grid)
    experience = Experience(model, max_memory=max_memory)
    n_free_cells = len(qgrid.free_cells)
    hsize = qgrid.grid.size // 2  # history window size
    win_rate = 0.0
    imctr = 1
    all_state = []

    for epoch in range(n_epoch):
        states = []
        all_action = []
        loss = 0.0
        rat_cell = random.choice(qgrid.free_cells)
        rat_cell = (2, 3)  # fixed start cell overrides the random choice above
        qgrid.reset(rat_cell)
        game_over = False
        envstate = qgrid.observe()
        n_episodes = 0

        while not game_over:
            valid_actions = qgrid.valid_actions()
            if not valid_actions:
                break
            prev_envstate = envstate
            states.append((qgrid.state[0], qgrid.state[1]))

            # Epsilon-greedy action selection
            if np.random.rand() < epsilon:
                action = random.choice(valid_actions)
            else:
                action = np.argmax(experience.predict(prev_envstate))
            all_action.append(actions[action])

            # Apply action, get reward and new envstate
            envstate, reward, game_status = qgrid.act(action)
            if game_status == 'win':
                game_over = True
            elif game_status == 'lose':
                game_over = True
            else:
                game_over = False

            # Store transition and train on a replayed minibatch
            episode = [prev_envstate, action, reward, envstate, game_over]
            experience.remember(episode)
            n_episodes += 1
            inputs, targets = experience.get_data(data_size=data_size)
            model.fit(inputs, targets, epochs=8, batch_size=16, verbose=0)
            loss = model.evaluate(inputs, targets, verbose=0)

        print("epoch", epoch, 'actions', all_action)
        all_state.append(states)

    qgrid.draw(all_state)
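# qtrain() above drives model.fit(...) / model.evaluate(...) with Keras-style
# arguments but never shows the model itself. Below is a minimal sketch of a
# compatible network, assuming the observation is the flattened grid and the
# output is one Q-value per action ('Left', 'Up', 'Right', 'Down'). Layer sizes
# and the learning rate are illustrative, not taken from the original code.
def build_q_model(grid_size, num_actions=4, lr=0.001):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(grid_size, input_shape=(grid_size,), activation='relu'),
        tf.keras.layers.Dense(grid_size, activation='relu'),
        tf.keras.layers.Dense(num_actions),  # linear Q-value head
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='mse')
    return model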
def play_episode(self):
    episode_reward = 0
    local_steps = 0
    done = False
    state = self.env.reset()

    while not done:
        # Sample an action from the current stochastic policy.
        action, _ = self.policy.sample_action(np.atleast_2d(state))
        action = action.numpy()[0]
        try:
            next_state, reward, done, _ = self.env.step(action)
        except Exception:
            print("DEBUG", action)
            raise  # re-raise: next_state is undefined if the step failed
        # reward = np.clip(reward, -5, 5)

        exp = Experience(state, action, reward, next_state, done)
        self.replay_buffer.push(exp)

        state = next_state
        episode_reward += reward
        local_steps += 1
        self.global_steps += 1

        if (len(self.replay_buffer) >= self.MIN_EXPERIENCES
                and self.global_steps % self.UPDATE_PERIOD == 0):
            self.update_networks()

    return episode_reward, local_steps, tf.exp(self.log_alpha)
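# play_episode() above (and learn() below) assumes an Experience record and a
# replay buffer supporting push() and len(). A minimal sketch of those pieces
# follows; the field order matches the Experience(...) calls above, and uniform
# sampling is an assumption rather than the agent's actual sampling scheme.
Experience = collections.namedtuple(
    "Experience", ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:

    def __init__(self, max_len=100000):
        self.buffer = collections.deque(maxlen=max_len)

    def push(self, exp):
        self.buffer.append(exp)

    def __len__(self):
        return len(self.buffer)

    def sample(self, batch_size):
        # Uniformly sample a minibatch of stored transitions.
        indices = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        return [self.buffer[i] for i in indices]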
def qtrain(model, maze, **opt):
    global epsilon
    n_epoch = opt.get('epochs', 1001)
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)
    qmaze = Qmaze(maze)

    # Initialize experience replay object
    experience = Experience(model, max_memory=max_memory)

    win_history = []  # history of win/lose game
    n_free_cells = len(qmaze.free_cells)
    hsize = qmaze.maze.size // 2  # history window size
    win_rate = 0.0
    imctr = 1

    for epoch in range(n_epoch):
        states = []
        loss = 0.0
        rat_cell = random.choice(qmaze.free_cells)
        if epoch % 10 == 0:
            rat_cell = (2, 3)
        qmaze.reset(rat_cell)
        game_over = False

        # get initial envstate (1d flattened canvas)
        envstate = qmaze.observe()

        n_episodes = 0
        while not game_over:
            valid_actions = qmaze.valid_actions()
            if not valid_actions:
                break
            prev_envstate = envstate
            states.append((qmaze.state[0], qmaze.state[1]))

            # Get next action
            if np.random.rand() < epsilon:
                action = random.choice(valid_actions)
            else:
                action = np.argmax(experience.predict(prev_envstate))

            # Apply action, get reward and new envstate
            envstate, reward, game_status = qmaze.act(action)
            if game_status == 'win':
                win_history.append(1)
                game_over = True
            elif game_status == 'lose':
                win_history.append(0)
                game_over = True
            else:
                game_over = False

            # Store episode (experience)
            episode = [prev_envstate, action, reward, envstate, game_over]
            experience.remember(episode)
            n_episodes += 1

            # Train neural network model
            inputs, targets = experience.get_data(data_size=data_size)
            loss = model.fit(inputs, targets, 1)
            # value = model.predict(inputs)
            # loss = rm.mean_squared_error(value, targets)
            # loss.grad().update(rm.Adam(0.001))

        if len(win_history) > hsize:
            win_rate = sum(win_history[-hsize:]) / hsize

        template = ("Epoch: {:03d}/{:d} | Loss: {:.4f} | Episodes: {:d} "
                    "| Win count: {:d} | Win rate: {:.3f}")
        print(template.format(epoch, n_epoch - 1, loss, n_episodes,
                              sum(win_history), win_rate))
        print(qmaze.draw(states))
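# The maze qtrain() variants rely on an experience-replay helper exposing
# remember(), predict() and get_data(). A minimal sketch is given below; it is
# named MazeExperience here only to avoid clashing with the Experience
# namedtuple sketched earlier, and the discount factor and Q-target
# construction are illustrative assumptions for a Keras-style model whose
# envstate is a 1xN flattened canvas.
class MazeExperience:

    def __init__(self, model, max_memory=100, discount=0.95):
        self.model = model
        self.max_memory = max_memory
        self.discount = discount
        self.memory = []
        self.num_actions = model.output_shape[-1]  # assumes a Keras-style model

    def remember(self, episode):
        # episode = [prev_envstate, action, reward, envstate, game_over]
        self.memory.append(episode)
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def predict(self, envstate):
        return self.model.predict(envstate, verbose=0)[0]

    def get_data(self, data_size=10):
        env_size = self.memory[0][0].shape[1]  # envstate assumed to be 1xN
        mem_size = len(self.memory)
        data_size = min(mem_size, data_size)
        inputs = np.zeros((data_size, env_size))
        targets = np.zeros((data_size, self.num_actions))
        for i, j in enumerate(np.random.choice(range(mem_size), data_size, replace=False)):
            prev_envstate, action, reward, envstate, game_over = self.memory[j]
            inputs[i] = prev_envstate
            targets[i] = self.predict(prev_envstate)
            if game_over:
                targets[i, action] = reward
            else:
                # Bellman backup: reward plus discounted max future Q-value.
                targets[i, action] = reward + self.discount * np.max(self.predict(envstate))
        return inputs, targets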
def learn(self, n_episodes, logdir="log"):
    logdir = Path(__file__).parent / logdir
    if logdir.exists():
        shutil.rmtree(logdir)
    self.summary_writer = tf.summary.create_file_writer(str(logdir))

    for episode in range(1, n_episodes + 1):
        env = gym.make(self.env_name)
        frames = collections.deque(maxlen=4)
        frame = frame_preprocess(env.reset())
        for _ in range(self.n_frames):
            frames.append(frame)

        episode_rewards = 0
        episode_steps = 0
        done = False
        lives = 5
        while not done:
            self.steps += 1
            episode_steps += 1

            state = np.stack(frames, axis=2)[np.newaxis, ...]
            action = self.fqf_network.sample_action(state, epsilon=self.epsilon)
            next_frame, reward, done, info = env.step(action)
            episode_rewards += reward
            frames.append(frame_preprocess(next_frame))
            next_state = np.stack(frames, axis=2)[np.newaxis, ...]

            if done:
                exp = Experience(state, action, reward, next_state, done)
                self.replay_buffer.push(exp)
                break
            else:
                if info["ale.lives"] != lives:
                    #: life loss as episode ends
                    lives = info["ale.lives"]
                    exp = Experience(state, action, reward, next_state, True)
                else:
                    exp = Experience(state, action, reward, next_state, done)
                self.replay_buffer.push(exp)

            if (len(self.replay_buffer) > 50000) and (self.steps % self.update_period == 0):
                loss, loss_fp, entropy = self.update_network()
                with self.summary_writer.as_default():
                    tf.summary.scalar("loss", loss, step=self.steps)
                    tf.summary.scalar("loss_fp", loss_fp, step=self.steps)
                    tf.summary.scalar("entropy", entropy, step=self.steps)
                    tf.summary.scalar("epsilon", self.epsilon, step=self.steps)
                    tf.summary.scalar("buffer_size", len(self.replay_buffer), step=self.steps)
                    tf.summary.scalar("train_score", episode_rewards, step=self.steps)
                    tf.summary.scalar("train_steps", episode_steps, step=self.steps)

            #: Target update
            if self.steps % self.target_update_period == 0:
                self.target_fqf_network.set_weights(
                    self.fqf_network.get_weights())

        print(f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}")

        if episode % 20 == 0:
            test_scores, test_steps = self.test_play(n_testplay=1)
            with self.summary_writer.as_default():
                tf.summary.scalar("test_score", test_scores[0], step=self.steps)
                tf.summary.scalar("test_step", test_steps[0], step=self.steps)

        if episode % 500 == 0:
            self.fqf_network.save_weights("checkpoints/fqfnet")
            print("Model Saved")
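# learn() above calls frame_preprocess(), whose definition is not shown. The
# sketch below applies a common Atari preprocessing (grayscale + resize to
# 84x84, values scaled to [0, 1]); the exact steps used by the original agent
# may differ.
from PIL import Image


def frame_preprocess(frame):
    image = Image.fromarray(frame).convert("L")   # to grayscale
    image = image.resize((84, 84))                # downsample to 84x84
    return np.array(image, dtype=np.float32) / 255.0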
def qtrain(model, maze, **opt):
    global epsilon
    n_epoch = opt.get('epochs', 1001)
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)
    qmaze = Qmaze(maze)
    experience = Experience(model, max_memory=max_memory)
    win_history = []
    n_free_cells = len(qmaze.free_cells)
    hsize = qmaze.maze.size // 2
    win_rate = 0.0
    imctr = 1

    for epoch in range(n_epoch):
        states = []
        loss = 0.0
        rat_cell = random.choice(qmaze.free_cells)
        if epoch % 10 == 0:
            rat_cell = (2, 3)
        qmaze.reset(rat_cell)
        game_over = False
        envstate = qmaze.observe()
        n_episodes = 0

        while not game_over:
            valid_actions = qmaze.valid_actions()
            if not valid_actions:
                break
            prev_envstate = envstate
            states.append((qmaze.state[0], qmaze.state[1]))

            # Epsilon-greedy action selection
            if np.random.rand() < epsilon:
                action = random.choice(valid_actions)
            else:
                action = np.argmax(experience.predict(prev_envstate))

            envstate, reward, game_status = qmaze.act(action)
            if game_status == 'win':
                win_history.append(1)
                game_over = True
            elif game_status == 'lose':
                win_history.append(0)
                game_over = True
            else:
                game_over = False

            episode = [prev_envstate, action, reward, envstate, game_over]
            experience.remember(episode)
            n_episodes += 1

            inputs, targets = experience.get_data(data_size=data_size)
            loss = model.fit(inputs, targets, 1)

        if len(win_history) > hsize:
            win_rate = sum(win_history[-hsize:]) / hsize

        template = ("Epoch: {:03d}/{:d} | Loss: {:.4f} | Episodes: {:d} "
                    "| Win count: {:d} | Win rate: {:.3f}")
        print(template.format(epoch, n_epoch - 1, loss, n_episodes,
                              sum(win_history), win_rate))
        print(qmaze.draw(states))