class TrainDQN:
    def __init__(self,
                 env,
                 sess,
                 learning_rate=1e-3,
                 seed=1234,
                 gamma=0.99,
                 max_eps=1.0,
                 min_eps=0.1,
                 render=False,
                 print_freq=20,
                 load_path=None,
                 save_path=None,
                 batch_size=32,
                 log_dir='logs/train',
                 max_steps=100000,
                 buffer_capacity=None,
                 max_episode_len=2000,
                 eps_decay_rate=-0.0001,
                 target_update_freq=1000,
                 ):
        """Trains an agent in an OpenAI Gym-like environment with deep Q-learning.

        Args:
            env: gym.Env where our agent resides
            sess: tf.Session used to run the networks
            learning_rate: Learning rate for the Q network optimizer
            seed: Random seed for reproducibility
            gamma: Discount factor
            max_eps: Starting exploration factor
            min_eps: Exploration factor to decay towards
            render: True to render the environment, else False
            print_freq: Displays logging information every 'print_freq' episodes
            load_path: (str) Path to load an existing model from
            save_path: (str) Path to save the model to during training
            batch_size: Number of transitions sampled from the replay buffer per update
            log_dir: Directory for TensorBoard summaries
            max_steps: Maximum number of times to sample the environment
            buffer_capacity: How many (state, action, reward, next state, done) tuples
                the replay buffer should store
            max_episode_len: Maximum number of timesteps in an episode
            eps_decay_rate: Lambda parameter in the exponential decay for epsilon
            target_update_freq: Number of updates between copies of the Q network
                weights into the target network
        """
        np.random.seed(seed)
        self.sess = sess
        self.env = env
        self.input_dim = env.observation_space.shape[0]
        self.output_dim = env.action_space.n
        self.max_steps = max_steps
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.max_episode_len = max_episode_len
        self.render = render
        self.print_freq = print_freq
        self.rewards = []
        self.metrics = []
        self.save_path = save_path
        self.load_path = load_path
        self.batch_size = batch_size
        self.num_updates = 0
        self.gamma = gamma
        self.buffer = ReplayBuffer(capacity=max_steps // 2 if buffer_capacity is None else buffer_capacity)
        self.target_update_freq = target_update_freq
        self.learning_rate = learning_rate

        with tf.variable_scope('q_network'):
            self.q_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        with tf.variable_scope('target_network'):
            self.target_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        # Ops that copy the online Q network weights into the target network
        self.update_target_network = [old.assign(new) for (new, old) in
                                      zip(tf.trainable_variables('q_network'),
                                          tf.trainable_variables('target_network'))]

        if self.load_path is not None:
            self.load()
        self.add_summaries(log_dir)

    def add_summaries(self, log_dir):
        tf.summary.scalar('Loss', self.q_network.loss)
        tf.summary.scalar('Mean Estimated Value', tf.reduce_mean(self.q_network.output_pred))
        # Merge all the summaries and write them out to log_dir
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(log_dir, self.sess.graph)

    def learn(self):
        """Learns via Deep Q-Networks (DQN)."""
        obs = self.env.reset()
        mean_reward = None
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        for t in range(self.max_steps):
            # Epsilon decay schedule from
            # https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(self.eps_decay_rate * t)

            if self.render:
                self.env.render()

            # Take an exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(obs)

            # Execute the action in the emulator and observe the reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # Store transition (s_t, a_t, r_t, s_t+1) in the replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))

            # Perform a learning step
            self.update()

            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                # print("Episode Length:", ep_len)
                # print(f"Episode {ep} Reward: {total_reward}")
                # print(f"Random Action Percent: {rand_actions / ep_len}")
                ep += 1
                ep_len = 0
                rand_actions = 0
                self.rewards.append(total_reward)
                total_reward = 0
                obs = self.env.reset()

                if ep % self.print_freq == 0 and ep > 0:
                    new_mean_reward = np.mean(self.rewards[-self.print_freq - 1:])
                    print("-------------------------------------------------------")
                    print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                    print(f"Exploration fraction: {eps}")
                    print(f"Total Episodes: {ep}")
                    print(f"Total timesteps: {t}")
                    print("-------------------------------------------------------")

                    # Add reward summary
                    summary = tf.Summary()
                    summary.value.add(tag=f'Mean {self.print_freq} Episode Reward',
                                      simple_value=new_mean_reward)
                    summary.value.add(tag='Epsilon', simple_value=eps)
                    self.train_writer.add_summary(summary, self.num_updates)

                    # Model saving inspired by the OpenAI Baselines implementation
                    if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                        print(f"Saving model due to mean reward increase: {mean_reward} -> {new_mean_reward}")
                        print(f'Location: {self.save_path}')
                        # save_path = f"{self.save_path}_model"
                        self.save()
                        mean_reward = new_mean_reward

    def act(self, observation):
        """Takes an action given the observation.

        Args:
            observation: observation from the environment

        Returns:
            integer index of the selected action
        """
        pred = self.sess.run([self.q_network.output_pred],
                             feed_dict={self.q_network.input_ph:
                                        np.reshape(observation, (1, self.input_dim))})
        return np.argmax(pred)

    def update(self):
        """Applies gradients to the Q network computed from a minibatch of self.batch_size."""
        if self.batch_size <= self.buffer.size():
            self.num_updates += 1

            # Copy the Q network parameters into the target network
            if self.num_updates % self.target_update_freq == 0:
                self.sess.run(self.update_target_network)
                print('Updated Target Network')

            # Sample a random minibatch of transitions from the replay buffer
            sample = self.buffer.sample(self.batch_size)
            states, action, reward, next_states, done = sample

            # Calculate discounted predictions for the subsequent states using the target network
            next_state_pred = self.gamma * self.sess.run(
                self.target_network.output_pred,
                feed_dict={self.target_network.input_ph: next_states})

            # Adjust the targets for non-terminal states
            reward = reward.reshape(len(reward), 1)
            targets = reward
            loc = np.argwhere(done != True).flatten()
            if len(loc) > 0:
                max_q = np.amax(next_state_pred, axis=1)
                targets[loc] = np.add(
                    targets[loc],
                    max_q[loc].reshape(max_q[loc].shape[0], 1),
                    casting='unsafe')

            # Train the Q network on the batch
            _, loss = self.sess.run([self.q_network.opt, self.q_network.loss],
                                    feed_dict={self.q_network.input_ph: states,
                                               self.q_network.target_ph: targets.flatten(),
                                               self.q_network.action_indices_ph: action})

    def save(self):
        """Saves the Q network."""
        self.q_network.saver.save(self.sess, self.save_path)

    def load(self):
        """Loads the Q network."""
        self.q_network.saver.restore(self.sess, self.save_path)

    def plot_rewards(self, path=None):
        """Plots rewards per episode.

        Args:
            path: Location to save the rewards plot. If None, the image will be
                displayed with plt.show()
        """
        plt.plot(self.rewards)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        if path is None:
            plt.show()
        else:
            plt.savefig(path)
        plt.close('all')
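
# Hypothetical usage sketch (not part of the original module): shows one way TrainDQN
# could be wired up against a Gym control task. The environment name, hyperparameters,
# and the explicit variable initialization are assumptions, not the author's recorded
# settings; it also assumes the classic Gym step API returning (obs, reward, done, info)
# and a TensorFlow 1.x session.
def _example_train_cartpole():
    import gym
    import tensorflow as tf

    env = gym.make('CartPole-v1')
    with tf.Session() as sess:
        # The constructor builds the Q and target networks, so variables must be
        # initialized before learn() is called.
        agent = TrainDQN(env, sess,
                         max_steps=50000,
                         save_path=None,  # set a path to enable checkpointing
                         log_dir='logs/cartpole')
        sess.run(tf.global_variables_initializer())
        agent.learn()
        agent.plot_rewards('rewards.png')
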
class NeuralNetworkAgent(Agent):
    def __init__(self, api, network_class, sess, save_path, history_size=15,
                 restore_path=None, verbose=False, train=False, test=False):
        super(NeuralNetworkAgent, self).__init__(api, verbose=verbose)
        # currently 7500 w/ 1000

        # Network
        self.network = network_class(sess, save_path, restore_path=restore_path,
                                     hist_size=history_size)
        self.replay_buffer = ReplayBuffer(max_size=2500)
        self.train = train
        self.history_size = history_size

        # Internal
        self.launched = False
        self.placed_move = False
        self.ctr = 0
        self.restart_game = 1
        self.game_restarted = True
        self.show_board = False
        self.last_move = -2
        self.start_state = np.zeros((20, 10, 1))
        self.possible_moves = [-1, 0, 6, 7]
        self.training_begun = False if not test else True
        self.epsilon = 1. if not test else 0
        self.decay = 0.999
        self.test = test
        self.prev_states = [self.start_state] * self.history_size

    def _controller_listener(self):
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)
        if piece_id != 19 and game_state == 1:
            # Train
            if self.train and self.replay_buffer.size() > 250 and not self.test:
                batch = self.replay_buffer.sample(batch_sz=250)
                self.network.train(batch)
                self.training_begun = True
                self.epsilon *= self.decay
                if self.epsilon < 0.010:
                    self.epsilon = 0.010

            if not self.placed_move:  # and (random_move >= 0 or self.restart_game > 0):
                # os.system('clear')
                print('--------------')
                is_random = False
                move = None
                # Epsilon-greedy action selection over the possible controller moves
                if np.random.random() < self.epsilon or not self.training_begun:
                    move = np.random.choice(self.possible_moves)
                    is_random = True
                else:
                    tensor = np.dstack([self.grid] + self.prev_states)
                    pred = self.network.predict(tensor)[0]
                    move = self.possible_moves[pred]

                if self.restart_game > 0:
                    self.api.writeGamepad(0, 3, True)
                    self.restart_game -= 1
                    move = -2
                else:
                    if move >= 0:
                        self.api.writeGamepad(0, move, True)

                self.placed_move = True
                self.show_board = True

                if self.last_move != -2 and piece_id != 19:
                    print('Random:', is_random)
                    S = self.grid.copy()
                    self._update_board(self.api.peekCPU(0x0042))
                    board = self._simulate_piece_drop(self.api.peekCPU(0x0042))
                    n_empty = self._count_empty(self.grid)
                    n_holes = self._count_holes(self.grid)
                    height = self._count_height(board)
                    levelness = self._determine_levelness(board)
                    A = self.last_move
                    # R = self._count_total() + self._get_score() - n_empty
                    # R = (-50 * height) + (-20 * n_holes) + (self._get_score())
                    if height <= 2:
                        R = 1000
                    else:
                        R = -200 * height
                    R += -20 * n_holes + 10 * levelness  # 10 * self._get_score()
                    SP = self.grid.copy()

                    self.prev_states.insert(0, S)
                    print(np.dstack(self.prev_states).shape)
                    self.replay_buffer.add(
                        np.dstack(self.prev_states),
                        self.possible_moves.index(A),
                        R,
                        np.dstack([SP] + self.prev_states[:self.history_size]))
                    self.prev_states = self.prev_states[:self.history_size]
                    print(self.epsilon)
                    self._print_transition(S, A, board, R)
                self.last_move = move
            else:
                self.placed_move = False

    def _frame_render_finished(self):
        """Renders the board and the current piece.

        TODO: do this lazily, so we aren't calling read too often O_o
        """
        # To make things easier, we're going to modify the next piece drop.
        # Always drop a certain type of block (currently the square).
        self.api.writeCPU(0x00bf, 0x0a)
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)

        # Restart the game
        if piece_id == 19 and (game_state == 10 or game_state == 0):
            self.prev_states = [self.start_state] * self.history_size
            self.game_restarted = True
            self.restart_game = 1
            return

        # Probably a line clear... skip
        if piece_id == 19 and game_state != 1:
            return

    def _piece_update(self, access_type, address, value):
        """Can be used to control the piece being dropped."""
        if self.api.readCPU(0x0048) == 1:
            return 0x0a
        return value

    def agent_name(self):
        return 'NeuralNetworkAgent'
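
# Illustrative sketch (not part of the original agent): demonstrates the shape of the
# stacked state tensor that _controller_listener feeds to the network, assuming the
# 20x10x1 board frames and the default history_size of 15 set in __init__. The helper
# name and values here are for illustration only.
def _example_state_stack_shape():
    import numpy as np

    history_size = 15
    start_state = np.zeros((20, 10, 1))
    prev_states = [start_state] * history_size

    # The current board frame plus the history: np.dstack concatenates along the last
    # axis, so the network input has shape (20, 10, history_size + 1).
    grid = np.zeros((20, 10, 1))
    tensor = np.dstack([grid] + prev_states)
    assert tensor.shape == (20, 10, history_size + 1)
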