def teach(self, num_timesteps=2000):
    chosen_action = 0
    print('Initial Chosen Action:', chosen_action)
    for t in range(num_timesteps):
        # estimate the learning-progress slope for each task
        slopes = [estimate_slope(timesteps, scores) if len(scores) > 1 else 1
                  for timesteps, scores in zip(self.timesteps, self.scores)]
        if self.env.signal == 'SPG':
            reward = np.abs(slopes[chosen_action]) if self.abs else slopes[chosen_action]
        elif self.env.signal == 'MPG':
            reward = np.mean(np.abs(slopes) if self.abs else slopes)
        #p = self.policy(np.abs(slopes) if self.abs else slopes)
        p = self.policy(reward)
        # remember which task the policy picked so the next SPG reward uses its slope
        chosen_action = p
        # convert the chosen action into a one-hot training distribution
        temp = np.zeros(self.env.num_actions)
        temp[p] = 1.
        p = temp.copy()
        r, train_done, val_done = self.env.step(p)
        if val_done:
            return self.env.model.epochs
        # record the new scores and the timestep at which they were observed
        for a, s in enumerate(r):
            if not np.isnan(s):
                self.scores[a].append(s)
                self.timesteps[a].append(t)
        if self.writer:
            for i in range(self.env.num_actions):
                add_summary(self.writer, "slopes/task_%d" % (i + 1), slopes[i], self.env.model.epochs)
                add_summary(self.writer, "probabilities/task_%d" % (i + 1), p[i], self.env.model.epochs)
    return self.env.model.epochs
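# Several teach() variants in this section call an estimate_slope(timesteps, scores)
# helper that is not shown here. A minimal sketch, assuming it returns the
# least-squares slope of the scores regressed on the timesteps; only the name and
# signature are taken from the calls above, the body is illustrative.
from scipy.stats import linregress

def estimate_slope(x, y):
    # slope of the best-fit line y = slope * x + intercept
    assert len(x) == len(y)
    return linregress(x, y)[0]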
def teach(self, num_timesteps=2000):
    for t in range(num_timesteps):
        # estimate the learning-progress slope for each (digit, position) sub-task
        slopes = [estimate_slope(timesteps, scores) if len(scores) > 1 else 1
                  for timesteps, scores in zip(self.timesteps, self.scores)]
        p = self.policy(np.abs(slopes) if self.abs else slopes)
        r, train_done, val_done = self.env.step(p)
        if val_done:
            return self.env.model.epochs
        # record the new scores and the timestep at which they were observed
        for a, s in enumerate(r):
            if not np.isnan(s):
                self.scores[a].append(s)
                self.timesteps[a].append(t)
        if self.writer:
            for i in range(self.env.num_actions):
                add_summary(self.writer,
                            "slopes/task_%d_%d" % (i // self.env.max_digits + 1, i % self.env.max_digits + 1),
                            slopes[i], self.env.model.epochs)
                add_summary(self.writer,
                            "probabilities/task_%d_%d" % (i // self.env.max_digits + 1, i % self.env.max_digits + 1),
                            p[i], self.env.model.epochs)
    return self.env.model.epochs
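# add_summary() is used throughout this section for TensorBoard logging but is not
# defined here. A plausible sketch, assuming the TensorFlow 1.x summary API and
# that `writer` is a tf.summary.FileWriter; treat it as illustrative, not the
# repository's actual helper.
import tensorflow as tf

def add_summary(writer, tag, value, step):
    # write a single scalar value under `tag` at global step `step`
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=float(value))])
    writer.add_summary(summary, step)
    writer.flush()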
def teach(self, num_timesteps=2000):
    for t in range(num_timesteps):
        # estimate learning progress for each task from the recorded score deltas
        if len(self.dscores) > 0:
            if isinstance(self.policy, ThompsonPolicy):
                # Thompson sampling: draw one past delta per task from its history
                slopes = [np.random.choice(drs) for drs in np.array(self.dscores).T]
            else:
                slopes = np.mean(self.dscores, axis=0)
        else:
            slopes = np.ones(self.env.num_actions)
        p = self.policy(np.abs(slopes) if self.abs else slopes)
        r, train_done, val_done = self.env.step(p)
        if val_done:
            return self.env.model.epochs
        # log the per-task score deltas
        dr = r - self.prevr
        self.prevr = r
        self.dscores.append(dr)
        if self.writer:
            for i in range(self.env.num_actions):
                add_summary(self.writer, "slopes/task_%d" % (i + 1), slopes[i], self.env.model.epochs)
                add_summary(self.writer, "probabilities/task_%d" % (i + 1), p[i], self.env.model.epochs)
    return self.env.model.epochs
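# Most teach() variants above call self.policy(values) and pass the result straight
# to env.step() as a training distribution over tasks. A minimal sketch of one
# possible policy of that shape, a Boltzmann (softmax) distribution over the
# learning-progress values with a small uniform exploration floor; the class name
# and parameters are assumptions for illustration.
import numpy as np

class BoltzmannPolicy:
    def __init__(self, temperature=1.0, min_prob=0.0):
        self.temperature = temperature
        self.min_prob = min_prob

    def __call__(self, values):
        values = np.asarray(values, dtype=np.float64)
        # numerically stable softmax with temperature
        z = (values - values.max()) / self.temperature
        p = np.exp(z)
        p /= p.sum()
        # mix in a uniform floor so every task keeps a minimum probability
        n = len(p)
        return (1.0 - n * self.min_prob) * p + self.min_prob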
def step(self, train_dist):
    print("Training on", train_dist)
    train_data = self.model.generate_data(train_dist, self.train_size)
    train_data_double = self.model.generate_data(train_dist, self.train_size)
    history = self.model.train_epoch(train_data, self.val_data)
    # choose the observation signal: per-length accuracy on freshly generated
    # training data (SPG) or on the validation set (MPG)
    if self.signal == 'SPG':
        accs = self.model.accuracy_per_length(*train_data_double)
    elif self.signal == 'MPG':
        accs = self.model.accuracy_per_length(*self.val_data)
    # history = self.model.train_epoch(train_data, self.val_data)
    # train_accs = self.model.accuracy_per_length(*train_data)
    # val_accs = self.model.accuracy_per_length(*self.val_data)
    print('Accuracies: ', accs)
    train_done = history['full_number_accuracy'][-1] > 0.99
    val_done = history['val_full_number_accuracy'][-1] > 0.99
    if self.writer:
        for k, v in history.items():
            add_summary(self.writer, "model/" + k, v[-1], self.model.epochs)
        for i in range(self.num_actions):
            #add_summary(self.writer, "train_accuracies/task_%d" % (i + 1), train_accs[i], self.model.epochs)
            add_summary(self.writer, "accuracies/task_%d" % (i + 1), accs[i], self.model.epochs)
    return accs, train_done, val_done
def teach(self, num_timesteps=2000):
    curriculum_step = 0
    for t in range(num_timesteps):
        # train on the distribution prescribed by the current curriculum step
        p = self.curriculum[curriculum_step]
        print(p)
        r, train_done, val_done = self.env.step(p)
        # advance to the next curriculum step once training accuracy is high enough
        if train_done and curriculum_step < len(self.curriculum) - 1:
            curriculum_step += 1
        if val_done:
            return self.env.model.epochs
        if self.writer:
            for i in range(self.env.num_actions):
                add_summary(self.writer, "probabilities/task_%d" % (i + 1), p[i], self.env.model.epochs)
    return self.env.model.epochs
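# The curriculum teacher above walks through a fixed list of training
# distributions, moving on only after env.step() reports train_done. A usage
# sketch, assuming a CurriculumTeacher class wrapping this teach() method and an
# environment with num_actions tasks; the class and constructor arguments are
# hypothetical.
import numpy as np

def one_pass_curriculum(num_tasks):
    # one distribution per task: train exclusively on task 1, then task 2, ...,
    # and finish with a uniform mixture over all tasks
    steps = [np.eye(num_tasks)[i] for i in range(num_tasks)]
    steps.append(np.full(num_tasks, 1.0 / num_tasks))
    return steps

# teacher = CurriculumTeacher(env, curriculum=one_pass_curriculum(env.num_actions), writer=writer)
# epochs = teacher.teach(num_timesteps=2000)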
def teach(self, num_timesteps=2000):
    for t in range(num_timesteps // self.window_size):
        p = self.policy(np.abs(self.Q) if self.abs else self.Q)
        scores = [[] for _ in range(len(self.Q))]
        # keep the same training distribution for a whole window of steps
        for i in range(self.window_size):
            r, train_done, val_done = self.env.step(p)
            if val_done:
                return self.env.model.epochs
            for a, score in enumerate(r):
                if not np.isnan(score):
                    scores[a].append(score)
        # slope of each task's scores within the window
        s = [estimate_slope(list(range(len(sc))), sc) if len(sc) > 1 else 1 for sc in scores]
        self.Q += self.lr * (s - self.Q)
        if self.writer:
            for i in range(self.env.num_actions):
                add_summary(self.writer, "Q_values/task_%d" % (i + 1), self.Q[i], self.env.model.epochs)
                add_summary(self.writer, "slopes/task_%d" % (i + 1), s[i], self.env.model.epochs)
                add_summary(self.writer, "probabilities/task_%d" % (i + 1), p[i], self.env.model.epochs)
    return self.env.model.epochs
def teach(self, num_timesteps=2000):
    for t in range(num_timesteps):
        p = self.policy(np.abs(self.Q) if self.abs else self.Q)
        r, train_done, val_done = self.env.step(p)
        if val_done:
            return self.env.model.epochs
        s = r - self.prevr
        # safeguard against not sampling a particular action at all
        s = np.nan_to_num(s)
        # exponentially weighted moving average of the per-task reward deltas
        self.Q += self.lr * (s - self.Q)
        self.prevr = r
        if self.writer:
            for i in range(self.env.num_actions):
                add_summary(self.writer,
                            "Q_values/task_%d_%d" % (i // self.env.max_digits + 1, i % self.env.max_digits + 1),
                            self.Q[i], self.env.model.epochs)
                add_summary(self.writer,
                            "slopes/task_%d_%d" % (i // self.env.max_digits + 1, i % self.env.max_digits + 1),
                            s[i], self.env.model.epochs)
                add_summary(self.writer,
                            "probabilities/task_%d_%d" % (i // self.env.max_digits + 1, i % self.env.max_digits + 1),
                            p[i], self.env.model.epochs)
    return self.env.model.epochs
def step(self, train_dist):
    print("Training on", train_dist)
    train_data = self.model.generate_data(train_dist, self.train_size)
    history = self.model.train_epoch(train_data, self.val_data)
    #train_accs = self.model.accuracy_per_length(*train_data)
    val_accs = self.model.accuracy_per_length(*self.val_data)
    train_done = history['full_number_accuracy'][-1] > 0.99
    val_done = history['val_full_number_accuracy'][-1] > 0.99
    if self.writer:
        for k, v in history.items():
            add_summary(self.writer, "model/" + k, v[-1], self.model.epochs)
        for i in range(self.num_actions):
            #add_summary(self.writer, "train_accuracies/task_%d_%d" % (i // self.max_digits + 1, i % self.max_digits + 1), train_accs[i], self.model.epochs)
            add_summary(self.writer,
                        "valid_accuracies/task_%d_%d" % (i // self.max_digits + 1, i % self.max_digits + 1),
                        val_accs[i], self.model.epochs)
    return val_accs, train_done, val_done
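# The step() methods above are the environment side of the teacher loop: the
# teacher passes in a training distribution, the environment trains the model for
# one epoch and returns per-task accuracies plus done flags. A wiring sketch,
# assuming hypothetical AdditionEnv and OnlineSlopeBanditTeacher classes whose
# constructor arguments mirror the attributes used above (model, val_data,
# train_size, writer, policy, lr, abs); none of these names come from the source.
# env = AdditionEnv(model=model, val_data=val_data, train_size=4096, writer=writer)
# teacher = OnlineSlopeBanditTeacher(env=env, policy=BoltzmannPolicy(temperature=0.1),
#                                    lr=0.1, abs=True, writer=writer)
# epochs_needed = teacher.teach(num_timesteps=2000)
# print("Validation solved after %d epochs" % epochs_needed)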
def trainer(num_episodes, fifos, shared_buffer, model, memory, writer):
    callbacks = [EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5, verbose=1, mode='auto')]
    while num_episodes < args.num_episodes:
        while True:
            # pick a random fifo (agent)
            fifo = random.choice(fifos)
            try:
                # wait for a new trajectory and statistics
                (trace, reward, rewards, agent_id, hit_probs, avg_lengths,
                 tree_size, entropies, iters_sec) = fifo.get(timeout=args.queue_timeout)
                # break out of the infinite loop
                break
            except Empty:
                # just ignore empty fifos
                pass

        num_episodes += 1

        # add samples to replay memory
        # TODO: add_batch would be more efficient?
        for obs, pi in trace:
            memory.add_sample(obs, pi, reward)

        add_summary(writer, "tree/size", tree_size, num_episodes)
        add_summary(writer, "tree/mean_hit_prob", float(np.mean(hit_probs)), num_episodes)
        add_summary(writer, "tree/mean_rollout_len", float(np.mean(avg_lengths)), num_episodes)
        add_summary(writer, "tree/iters_sec", float(np.mean(iters_sec)), num_episodes)
        add_histogram(writer, "tree/hit_probability", hit_probs, num_episodes)
        add_histogram(writer, "tree/rollout_length", avg_lengths, num_episodes)
        add_histogram(writer, "tree/entropies", entropies, num_episodes)
        add_summary(writer, "episode/mean_entropy", float(np.mean(entropies)), num_episodes)
        add_summary(writer, "episode/reward", reward, num_episodes)
        add_summary(writer, "episode/length", len(trace), num_episodes)
        add_summary(writer, "rewards/agent_id", agent_id, num_episodes)
        for i in range(len(rewards)):
            add_summary(writer, "rewards/agent%d" % i, rewards[i], num_episodes)
        add_summary(writer, "replay_memory/size", memory.size, num_episodes)
        add_summary(writer, "replay_memory/count", memory.count, num_episodes)
        add_summary(writer, "replay_memory/current", memory.current, num_episodes)
        #print("Replay memory size: %d, count: %d, current: %d" % (memory.size, memory.count, memory.current))

        X, y, z = memory.dataset()
        assert len(X) != 0

        # reset weights?
        if args.reset_network:
            #model.set_weights(init_weights)
            model = model_from_json(model.to_json())
            model.compile(optimizer='adam', loss=['categorical_crossentropy', 'mse'])

        # train for limited epochs to avoid overfitting?
        history = model.fit(X, [y, z], batch_size=args.batch_size, epochs=args.num_epochs,
                            callbacks=callbacks, validation_split=args.validation_split)

        # log loss values
        for k, v in history.history.items():
            add_summary(writer, "training/" + k, v[-1], num_episodes)

        # share weights with the runners
        shared_buffer.raw = pickle.dumps(model.get_weights(), pickle.HIGHEST_PROTOCOL)

        # save weights
        if num_episodes % args.save_interval == 0:
            model.save(os.path.join(logdir, "model_%d.hdf5" % num_episodes))
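# Both trainer() variants rely on a ReplayMemory with add_sample(obs, pi, reward)
# and dataset(), plus size/count/current counters. A minimal ring-buffer sketch
# matching that interface; the default capacity and internal layout are
# assumptions for illustration, not the repository's actual implementation.
import numpy as np

class ReplayMemory:
    def __init__(self, capacity=100000):
        self.size = capacity      # maximum number of samples kept
        self.count = 0            # number of valid samples currently stored
        self.current = 0          # index where the next sample will be written
        self.observations = [None] * capacity
        self.policies = [None] * capacity
        self.values = np.zeros(capacity, dtype=np.float32)

    def add_sample(self, obs, pi, reward):
        # overwrite the oldest sample once the buffer is full
        self.observations[self.current] = obs
        self.policies[self.current] = pi
        self.values[self.current] = reward
        self.current = (self.current + 1) % self.size
        self.count = min(self.count + 1, self.size)

    def dataset(self):
        # return (X, y, z) arrays suitable for model.fit(X, [y, z], ...)
        X = np.array(self.observations[:self.count])
        y = np.array(self.policies[:self.count])
        z = self.values[:self.count]
        return X, y, z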
def trainer(num_iters, num_rollouts, model, writer, logdir):
    while num_iters < args.num_iters:
        print("################### ITERATION %d ###################" % (num_iters + 1))

        # statistics
        stat_tree_size = []
        stat_hit_probs = []
        stat_avg_lengths = []
        stat_entropies = []
        stat_reward_agent = [[], [], [], []]
        stat_episode_length = []

        memory = ReplayMemory()

        # ------- Generate training set based on MCTS -------
        print("Generate dataset")

        # use the spawn method for starting subprocesses;
        # this seems to be more compatible with TensorFlow
        ctx = multiprocessing.get_context('spawn')

        # create a shared flag to signal the end of data generation
        finished = ctx.Value('i', 0)

        # create fifos and processes for all runners
        print("Creating child processes")
        fifos = []
        model_file, _ = get_model_path(args, logdir)
        for i in range(args.num_runners):
            fifo = ctx.Queue(1)
            fifos.append(fifo)
            process = ctx.Process(target=runner, args=(i, model_file, fifo, finished, args))
            process.start()

        for i in tqdm(range(num_rollouts)):
            while True:
                # pick a random fifo (agent)
                fifo = random.choice(fifos)
                try:
                    # wait for a new trajectory and statistics
                    (trace, reward, agent_id, hit_probs, avg_lengths,
                     tree_size, entropies) = fifo.get(timeout=1)
                    break
                except Empty:
                    pass

            # save stats
            stat_tree_size.append(tree_size)
            stat_hit_probs.append(np.mean(hit_probs))
            stat_avg_lengths.append(np.mean(avg_lengths))
            stat_entropies.append(np.mean(entropies))
            stat_reward_agent[agent_id].append(reward)
            stat_episode_length.append(len(trace))

            # add samples to replay memory
            for obs, pi in trace:
                memory.add_sample(obs, pi, reward)

        # signal subprocesses to terminate
        finished.value = 1
        print("Finishing")
        # empty queues until all child processes have exited
        while len(multiprocessing.active_children()) > 0:
            for i, fifo in enumerate(fifos):
                if not fifo.empty():
                    fifo.get_nowait()
        print("All child processes have exited")

        # ------- Train the model -------
        callbacks = [
            EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=1, mode='auto'),
            ModelCheckpoint(os.path.join(logdir, "model_%d.hdf5" % num_iters), monitor='loss', save_best_only=True),
            ReduceLROnPlateau(monitor='loss', patience=1, factor=0.1)
        ]

        add_summary(writer, "tree/mean_size", np.mean(stat_tree_size), num_iters)
        try:
            add_summary(writer, "tree/mean_hit_prob", float(np.mean(stat_hit_probs)), num_iters)
        except Exception:
            pass
        add_summary(writer, "tree/mean_rollout_len", float(np.mean(stat_avg_lengths)), num_iters)
        add_summary(writer, "episode/mean_entropy", float(np.mean(stat_entropies)), num_iters)
        try:
            add_summary(writer, "episode/reward", np.mean(stat_reward_agent), num_iters)
        except Exception:
            pass
        add_summary(writer, "episode/length", np.mean(stat_episode_length), num_iters)
        add_summary(writer, "rewards/agent_id", agent_id, num_iters)
        for i in range(len(stat_reward_agent)):
            try:
                add_summary(writer, "rewards/agent%d" % i, np.mean(stat_reward_agent[i]), num_iters)
            except Exception:
                pass

        X, y, z = memory.dataset()
        assert len(X) != 0

        # train for limited epochs to avoid overfitting?
        # TODO: class weights?
        history = model.fit(X, [y, z], batch_size=args.batch_size, epochs=args.num_epochs,
                            callbacks=callbacks, validation_split=args.validation_split, shuffle=True)

        # log loss values
        for k, v in history.history.items():
            add_summary(writer, "training/" + k, v[-1], num_iters)

        num_iters += 1
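# The two trainer() variants above assume module-level imports and globals along
# these lines (Keras callbacks, a `runner` subprocess target, the queue Empty
# exception, and an argparse `args` namespace). This list is reconstructed from
# the names used above, not copied from the repository.
import os
import pickle
import random
import multiprocessing
from queue import Empty

import numpy as np
from tqdm import tqdm
from keras.models import model_from_json
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau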