class Simulator:
    """Next-frame predictor trained on randomly gathered VizDoom transitions.

    The model receives a standardized frame and an action and is trained to
    output the difference between the standardized next frame and the
    standardized current frame (see ``update``); ``predict`` adds the
    current frame back to obtain the predicted next frame.
    """

    def __init__(self):
        """Build model/optimizer, restore a checkpoint if one exists, load data."""
        super().__init__()
        self.model = cfg.model(cfg)
        self.optim = cfg.optim(cfg.learning_rate)
        self.loss = cfg.loss
        self.epoch = tf.Variable(0)
        self.writer = Writer(cfg)

        # Restore if save exists
        if Path('./simulator_saves/best').is_dir():
            self.model, self.optim, self.epoch = self.writer.restore(
                model=self.model, optim=self.optim, epoch=self.epoch)

        self.preprocessing()

    @staticmethod
    def _gather_transitions(data_path):
        """Play random-action episodes and pickle the transitions to *data_path*.

        Each stored item is ``[s0, action, s1]`` where ``action`` has been
        reshaped to ``[1, 1, len(cfg.actions)]`` float32.
        """
        vizdoom = VizDoom(cfg)
        memory = []
        for _ in trange(cfg.gather_epochs):
            vizdoom.new_episode()
            s0 = vizdoom.get_preprocessed_state()
            while not vizdoom.is_episode_finished():
                choice = random.choice(cfg.actions)
                vizdoom.make_action(choice)
                # NOTE(review): if this action ended the episode, the state
                # fetched here is whatever the VizDoom wrapper returns for a
                # finished episode -- confirm that is a usable frame.
                s1 = vizdoom.get_preprocessed_state()
                action = np.reshape(
                    choice, [1, 1, len(cfg.actions)]).astype(np.float32)
                memory.append([s0, action, s1])
                s0 = s1

        with open(data_path, 'wb') as f:
            pickle.dump(memory, f)

    def preprocessing(self):
        """Gather transitions if required, then load them as a tf.data.Dataset.

        Data is (re)gathered when ``cfg.package_data`` is truthy or the data
        file does not exist. Sets ``self.size`` (number of transitions) and
        ``self.data`` (dataset of ``(s0, action, s1)`` slices).
        """
        # Use one path for the existence check, the write and the read.
        # Previously the check/write used 'data.pkl' while the read used
        # cfg.data_dir, so freshly gathered data could be silently ignored.
        data_path = Path(cfg.data_dir)
        if cfg.package_data or not data_path.is_file():
            self._gather_transitions(data_path)

        # Load data
        # NOTE: pickle executes arbitrary code on load; only point
        # cfg.data_dir at files produced by this program.
        with open(data_path, 'rb') as f:
            s0, action, s1 = zip(*pickle.load(f))

        self.size = len(s0)
        self.data = tf.data.Dataset.from_tensor_slices(
            (np.array(s0), np.array(action), np.array(s1)))

    def update(self, s0, action, s1):
        """Run one optimization step on a batch of transitions.

        Args:
            s0: batch of current frames.
            action: batch of actions taken in ``s0``.
            s1: batch of resulting frames.
        """
        # Normalize; the regression target is the change between frames
        s0_n = tf.image.per_image_standardization(s0)
        truth = tf.image.per_image_standardization(s1) - s0_n

        # Construct graph
        with tf.GradientTape() as tape:
            # Approximate next frame (as a delta from the current frame)
            logits = self.model(s0_n, action)
            # Compare generated transformation with truth
            loss = tf.reduce_mean(self.loss(truth, logits))

        # Log stats, images
        self.writer.log(self.optim, tape, loss)
        self.writer.log_state("logits", logits)
        self.writer.log_state("truth_logits", truth)

        # Compute/apply gradients
        variables = self.model.trainable_weights
        grads = tape.gradient(loss, variables)
        self.optim.apply_gradients(zip(grads, variables))

        self.writer.global_step.assign_add(1)

    def train(self):
        """Train from the current ``self.epoch`` up to ``cfg.epochs``, then save.

        Each epoch reshuffles the whole dataset and iterates it in batches of
        ``cfg.batch_size``. ``self.epoch`` is incremented after every epoch.
        """
        for _ in trange(self.epoch.numpy(), cfg.epochs):
            # Uniform shuffle (buffer covers the whole dataset)
            batches = self.data.shuffle(self.size).batch(cfg.batch_size)
            for s0, action, s1 in batches:
                self.update(s0, action, s1)
            self.epoch.assign_add(1)

        self.writer.save(self.model, self.optim, self.epoch)

    def predict(self, s0, action):
        """Return the predicted standardized next frame for ``(s0, action)``.

        ``action`` gets a leading axis added before being passed to the
        model; the model's output delta is added to the standardized ``s0``.
        """
        s0_n = tf.image.per_image_standardization(s0)
        logits = self.model(s0_n, action[None])
        return logits + s0_n
class AlphaDoom:
    """VizDoom agent combining MCTS with a policy/value network.

    Leaf states in the search tree are produced by a learned ``Simulator``
    instead of the real game; the network returns ``(p, v)`` for a state and
    is trained on ``(state, action, outcome)`` samples from a replay buffer.
    """

    def __init__(self):
        """Train/restore the simulator, build the network, open the game."""
        super().__init__()
        self.mcts = MCTS(cfg)
        self.replay = Replay(cfg)

        # The simulator is trained (or resumed) on construction.
        # Disabled alternative kept from the original: build an AutoEncoder
        # and restore it via tf.train.Checkpoint from the latest checkpoint
        # in './simulator_saves/best'.
        self.autoencoder = Simulator()
        self.autoencoder.train()

        # Load selected model
        self.model = cfg.model(cfg)
        self.loss1 = cfg.loss1
        self.loss2 = cfg.loss2
        self.optim = cfg.optim(cfg.learning_rate)
        self.epoch = tf.Variable(0)
        self.writer = Writer(cfg)

        # Restore if save exists
        if Path('./alphadoom_saves/best').is_dir():
            self.model, self.optim, self.epoch = self.writer.restore(
                self.model, self.optim, self.epoch)

        self.vizdoom = VizDoom(cfg)

    def update(self):
        """Run one optimization step on a batch fetched from the replay buffer.

        The loss is ``loss1(z, v) + loss2(pi, p) + cfg.c * l2_reg`` where
        ``l2_reg`` sums ``tf.nn.l2_loss`` over all model weights.
        """
        # Fetch batch of experiences
        s0, pi, z = self.replay.fetch()
        z = np.array(z).reshape((len(z), 1))
        pi = np.array(pi, dtype=np.float32)

        # Construct graph
        with tf.GradientTape() as tape:
            p, v = self.model(s0)
            loss1 = self.loss1(z, v)
            loss2 = self.loss2(pi, p)
            l2_reg = tf.add_n([tf.nn.l2_loss(w) for w in self.model.weights])
            loss = loss1 + loss2 + cfg.c * l2_reg

        # Log stats
        self.writer.log(self.optim, tape, loss)
        self.writer.log_var("MSE", loss1)
        self.writer.log_var("Cross Entropy", loss2)
        self.writer.log_var("reg", l2_reg)

        # Compute/apply gradients. Only trainable weights are differentiated
        # (matching Simulator.update): non-trainable variables are not
        # watched by the tape and would only yield None gradients.
        variables = self.model.trainable_weights
        grads = tape.gradient(loss, variables)
        self.optim.apply_gradients(zip(grads, variables))

        self.writer.global_step.assign_add(1)

    def simulate(self):
        """Run ``cfg.num_sims`` simulations, each reaching a leaf of the tree.

        For every leaf the simulator predicts the leaf's state from the last
        frame of its parent and the leaf's action, the network evaluates that
        state, and the result is propagated back through the tree.
        """
        for _ in range(cfg.num_sims):
            # Find leaf
            leaf = self.mcts.search()

            # Simulate leaf's state
            action = np.reshape(
                leaf.a, [1, 1, len(cfg.actions)]).astype(np.float32)
            leaf.s = self.autoencoder.predict(leaf.parent.s[-1][None], action)

            # p: prior probabilities over actions (edges) from this leaf;
            # v: value of this leaf
            s = tf.concat(leaf.s, axis=-1)
            p, v = self.model(s)

            # Backprop through MCTS tree
            self.mcts.update(leaf, v, p)

    def perform_action(self, frames):
        """Search from *frames*, play the selected action, return the result.

        Args:
            frames: the current frame stack; stored as the root state of the
                search tree (the same list object, not a copy).

        Returns:
            ``(action, reward)`` -- the action chosen by MCTS and the reward
            returned by the game for it.
        """
        self.mcts.root.s = frames
        self.simulate()
        action = self.mcts.select_action()

        # Take action
        reward = self.vizdoom.make_action(action)
        return action, reward

    def train(self):
        """Play episodes, store labelled experiences, and train the network.

        Runs from the current ``self.epoch`` to ``cfg.epochs``. Every
        ``cfg.save_freq`` epochs the model and the replay memory are saved;
        the model is saved once more when training ends.
        """
        # Load and save the replay buffer from the same location. The
        # original loaded from '/replay.pkl' (filesystem root) but saved to
        # './replay.pkl', so a saved buffer was never picked up again.
        replay_path = Path('./replay.pkl')
        if replay_path.is_file():
            # NOTE: pickle executes arbitrary code on load; only use files
            # written by this program.
            with open(replay_path, 'rb') as f:
                self.replay.memory = pickle.load(f)

        for epoch in trange(self.epoch.numpy(), cfg.epochs):
            self.vizdoom.new_episode()
            frame = self.vizdoom.get_preprocessed_state()

            # Init stack of n frames (the first frame repeated)
            frames = [frame] * cfg.num_frames

            z = 0
            memories = []
            while not self.vizdoom.is_episode_finished():
                pi, reward = self.perform_action(frames)

                # Outcome label: +1 on a non-negative reward (which also ends
                # data collection for this episode), -1 otherwise.
                # NOTE(review): this stops at the first reward >= 0, including
                # a reward of exactly 0 -- confirm that is intended.
                if reward >= 0:
                    z = 1
                    break
                z = -1

                # Must *call* the method; comparing the bound method itself
                # with True was always False, so the loop went on to read a
                # state from an already finished episode.
                if self.vizdoom.is_episode_finished():
                    break

                # Update frames with latest image (in place: the MCTS root
                # holds a reference to this same list)
                frames.pop(0)
                frames.append(self.vizdoom.get_preprocessed_state())

                # NOTE(review): s0 is built after the action was applied, so
                # it pairs the post-action frame stack with that action --
                # confirm this pairing is intended.
                s0 = tf.concat(frames, axis=-1)
                memories.append([s0, pi])

            self.writer.log_var("z", z)

            # Add memories to experience replay, labelled with the outcome
            for memory in memories:
                memory.append(z)
                self.replay.push(memory)

            # Train on experiences from memory
            self.update()

            # Track progress so a restored checkpoint resumes where training
            # stopped (the counter was previously never advanced).
            self.epoch.assign_add(1)

            # Save model
            if epoch % cfg.save_freq == 0:
                self.writer.save(self.model, self.optim, self.epoch)
                with open(replay_path, 'wb') as f:
                    pickle.dump(self.replay.memory, f)

        self.writer.save(self.model, self.optim, self.epoch)