import numpy as np
from sklearn.datasets import load_digits
# import matplotlib.pyplot as plt  # only needed for the commented-out preview below

# NN is this project's own fully-connected network class, defined elsewhere.


def classify_digits():
    digits = load_digits()
    images_and_labels = list(zip(digits.images, digits.target))
    n_samples = len(digits.target)

    # X: flatten each 8x8 image into a 64-dimensional feature vector
    data = digits.images.reshape(n_samples, -1)

    # Y: one-hot encode the labels (row i has a 1 in column digits.target[i])
    Y = np.zeros((n_samples, 10))
    for i in range(n_samples):
        Y[i, digits.target[i]] = 1

    # Optional preview of the first few training images:
    # for index, (image, label) in enumerate(images_and_labels[:4]):
    #     plt.subplot(2, 4, index + 1)
    #     plt.axis('off')
    #     plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    #     plt.title('Training: %i' % label)
    # plt.show()

    # 64 inputs, 30 hidden units, 10 outputs; the remaining arguments follow
    # this project's NN constructor.
    nn = NN(64, 30, 10, 1, 100, 0.002)
    nn.feed(Y, data)
    nn.train()
    nn.predict(digits.target, data)
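
# As an aside, the one-hot target matrix built in classify_digits() can also be
# produced in a single vectorized step. This is only an illustrative sketch and
# is not used above; it assumes NumPy and the same `digits.target` labels.
def one_hot_targets_example():
    digits = load_digits()
    # Row i of Y has a 1 in column digits.target[i] and 0 elsewhere,
    # equivalent to the explicit loop in classify_digits().
    Y = np.eye(10)[digits.target]
    return Y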
from datetime import datetime
from os import path
from typing import List, Optional, Tuple

import numpy as np
import tensorflow as tf
from termcolor import colored

# NN, Experience, Observation, Action, Reward, TrainingData and
# make_simple_summary are defined elsewhere in this project.


class DQN:
    def __init__(
            self,
            # About environment
            env,
            *,
            max_step: int,
            # About RL
            gamma: float,
            epsilon: float = 1.0,
            epsilon_min: float = 0.05,
            epsilon_decay: float = 0.05,
            update_episode_n: int,
            # About history
            history_max_n: int,
            # About NN
            nnet_epoch: int,
            eta: float,
            eta_decay: float = 1.0,
            batch_size: int = 20,
            # About Validation
            episode_per_validation: int = 5,
            # Misc
            save_dir: str = 'hao123') -> None:
        '''
        env:                    The environment to play
        max_step:               Maximum number of steps per episode
        gamma:                  Discount factor
        epsilon:                Initial exploration rate
        epsilon_min:            Minimum exploration rate
        epsilon_decay:          Amount subtracted from epsilon per training iteration
        update_episode_n:       Episodes to play per NN update
        history_max_n:          Maximum size of history (replay buffer)
        nnet_epoch:             Epochs to run each NN update
        eta:                    Learning rate
        eta_decay:              Learning rate decay per training iteration
        batch_size:             Mini-batch size for NN training
        episode_per_validation: Run validation every this many iterations
        save_dir:               Dir name to save model (saved at {save_dir}/datestring)
        '''
        self.env = env
        self.save_dir = path.join(save_dir,
                                  datetime.now().strftime('%Y%m%d%H%M%S'))
        self.model_save_dir = path.join(self.save_dir, 'model')
        self.summary_save_dir = path.join(self.save_dir, 'summary')

        self.obs_shape = env.observation_space.shape
        self.action_n = env.action_space.n

        self.max_step = max_step
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.update_episode_n = update_episode_n
        self.history_max_n = history_max_n
        self.nnet_epoch = nnet_epoch
        self.eta = eta
        self.eta_decay = eta_decay
        self.batch_size = batch_size
        self.episode_per_validation = episode_per_validation

        self.best_reward = -1e9
        self.history = []  # type: List[Experience]
        self.nn = NN(self.obs_shape, self.action_n, batch_size)
        self.summary = tf.summary.FileWriter(self.summary_save_dir)
        self.global_step = 0

    def get_action(self, obs: Observation, use_epsilon: bool = True) -> Action:
        # During training, explore with the current epsilon; otherwise fall
        # back to the minimum exploration rate.
        if use_epsilon:
            return self._epsilon_greedy(obs, self.epsilon)
        else:
            return self._epsilon_greedy(obs, self.epsilon_min)

    def _epsilon_greedy(self, obs: Observation, eps: float) -> Action:
        # With probability eps pick a random action, otherwise the greedy one.
        if np.random.random() < eps:
            return np.random.randint(self.action_n)
        else:
            return self._best_action(obs)

    def _best_action(self, obs: Observation) -> Action:
        act_value, _ = self.nn.feed(obs)
        return act_value

    def add_summary(self, tag: str, value: float,
                    global_step: Optional[int] = None) -> None:
        if global_step is None:
            global_step = self.global_step
        self.summary.add_summary(
            make_simple_summary(tag, value),
            global_step=global_step,
        )

    def start_training(self, reward_target: float = 1e9) -> None:
        iter_n = 0
        while True:
            # Training
            iter_n += 1
            print(colored(f'[Iter #{iter_n}]', 'green'))
            print(colored('[Generating Episode]', 'cyan'),
                  f'ε = {self.epsilon:.4f}')
            reward_avg = 0.
            for _ in range(self.update_episode_n):
                reward, history = self.start_episode(True)
                reward_avg += reward
                self.history.extend(history)
            reward_avg /= self.update_episode_n
            self.add_summary('reward', reward_avg)
            self.add_summary('eta', self.eta)
            self.add_summary('epsilon', self.epsilon)
            print(f'Played {self.update_episode_n} episodes,',
                  colored(f'R_avg = {reward_avg:.4f}', 'red', attrs=['bold']))
            self.update()

            # Validation
            if iter_n % self.episode_per_validation == 0:
                print(
                    colored(f'[Validation #{iter_n}]', 'blue', attrs=['bold']))
                reward_avg = 0.
                test_n = 10
                for i in range(test_n):
                    reward_avg += self.start_episode(False)[0]
                reward_avg /= test_n
                self.add_summary('reward_val', reward_avg)
                print(
                    colored('[Validation result]', 'blue', attrs=['bold']),
                    colored(f'R_avg = {reward_avg:.4f}', 'red', attrs=['bold']))
                self.best_reward = max(self.best_reward, reward_avg)
                # Return if the target is reached
                if reward_avg >= reward_target:
                    return

            self.summary.flush()
            self.epsilon = max(self.epsilon_min,
                               self.epsilon - self.epsilon_decay)
            self.eta *= self.eta_decay

    def start_episode(
            self,
            use_epsilon: bool = False) -> Tuple[Reward, List[Experience]]:
        history = []
        obs = self.env.reset()
        reward_tot = 0.
        for step in range(self.max_step):
            # Small hack: use_epsilon doubles as a "training mode" flag,
            # so only advance the global step counter while training.
            if use_epsilon:
                self.global_step += 1
            action = self.get_action(obs, use_epsilon)
            obs_next, reward, done, info = self.env.step(action)
            # Store a None next-observation for terminal transitions.
            history.append(
                Experience(obs, action, reward, None if done else obs_next))
            reward_tot += reward
            if done:
                break
            obs = obs_next
        return (reward_tot, history)

    def update(self) -> None:
        history_n = len(self.history)
        # If history_n > history_max_n, delete the oldest experiences
        if history_n > self.history_max_n:
            d = history_n - self.history_max_n
            self.history = self.history[d:]

        obs, action, reward, obs_next = zip(*self.history)
        obs_arr = np.array(obs)
        action_arr = np.array(action)

        # Bootstrap target: max_a Q(s', a | theta'), with terminal states
        # (obs_next is None) replaced by zero vectors and their Q set to 0.
        obs_next_arr = np.array([
            o if o is not None else np.zeros(self.obs_shape)
            for o in obs_next
        ])
        Q_next = np.max(self.nn.feed(obs_next_arr)[1], axis=1)
        for i, x in enumerate(obs_next):
            if x is None:
                Q_next[i] = 0.
        target_arr = np.array(reward) + self.gamma * Q_next

        data = TrainingData(obs_arr, action_arr, target_arr, self.batch_size)
        # real_epoch = int(self.nnet_epoch * self.history_max_n / len(inp_arr))
        self.nn.train(data, self.nnet_epoch, self.eta)
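
# For reference, a training run could look like the sketch below. It assumes the
# OpenAI Gym 'CartPole-v1' environment with the classic 4-tuple step() API
# (gym < 0.26), matching how start_episode() unpacks env.step(); the
# hyperparameter values and the reward target are illustrative only and are
# not taken from the code above.
def train_cartpole_example():
    import gym

    env = gym.make('CartPole-v1')
    agent = DQN(
        env,
        max_step=500,          # CartPole-v1 caps episodes at 500 steps
        gamma=0.99,            # discount factor
        epsilon=1.0,           # start fully exploratory
        epsilon_min=0.05,
        epsilon_decay=0.01,    # subtracted from epsilon each iteration
        update_episode_n=10,   # episodes generated per NN update
        history_max_n=10000,   # replay-buffer size
        nnet_epoch=5,          # epochs per NN update
        eta=1e-3,              # learning rate
        batch_size=32,
    )
    # 475 is the usual "solved" threshold for CartPole-v1; placeholder target.
    agent.start_training(reward_target=475.0)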