def log_validation(sess: tf.Session, step: int,
                   summary_writer: tf.summary.FileWriter,
                   test_model: BindedModel):
    """Evaluate the test model once, log logit statistics and write its summary.

    :param sess: session in which the test model tensors are evaluated
    :param step: step index passed to the summary writer
    :param summary_writer: destination for the fetched summary; nothing is
        written when it is ``None``
    :param test_model: model providing the fetched tensors; its
        ``learning_phase`` is fed as 0 for this run
    :return: the value fetched for ``test_model.ctc_loss``
    """
    fetches = [
        test_model.logits,
        test_model.predict,
        test_model.dq.batch_labels,
        test_model.ctc_loss,
        test_model.ctc_loss_unaggregated,
        test_model.summary,
    ]
    feed = {test_model.learning_phase: 0}
    # Single op should complete in 20s
    run_options = tf.RunOptions(timeout_in_ms=20 * 1000)
    fetched = sess.run(fetches, feed_dict=feed, options=run_options)

    # predict, batch_labels and ctc_loss_unaggregated are fetched but not
    # used any further in this function.
    logits, _predict, _labels, val_loss, _losses, test_summary = fetched

    logger.info(
        f"Logits[{logits.shape}]: describe:{pformat(stats.describe(logits, axis=None))}"
    )

    if summary_writer is None:
        return val_loss
    summary_writer.add_summary(test_summary, step)
    return val_loss
def _log_continuous_evaluation(tb_writer: tf.summary.FileWriter,
                               main_metric: str,
                               eval_result: Evaluation,
                               seen_instances: int,
                               epoch: int,
                               max_epochs: int,
                               execution_results: List[ExecutionResult],
                               train: bool = False,
                               dataset_name: str = None) -> None:
    """Print one evaluation line and mirror the results to TensorBoard.

    The line is logged in yellow with a ``train`` tag prefix when ``train``
    is true, otherwise in blue with a ``val`` prefix; ``dataset_name``, when
    given, is appended to the prefix. If ``tb_writer`` is truthy, the
    non-``None`` scalar/histogram/image summaries of every execution result
    are written, followed by one summary holding each entry of
    ``eval_result`` as a simple value. All summaries use ``seen_instances``
    as the step.
    """
    if train:
        color, prefix = "yellow", "train"
    else:
        color, prefix = "blue", "val"
    if dataset_name is not None:
        prefix += "_" + dataset_name

    metric_line = _format_evaluation_line(eval_result, main_metric)
    log("Epoch {}/{} Instances {} {}".format(
        epoch, max_epochs, seen_instances, metric_line), color=color)

    if not tb_writer:
        return

    for result in execution_results:
        candidates = (result.scalar_summaries,
                      result.histogram_summaries,
                      result.image_summaries)
        for summaries in candidates:
            if summaries is not None:
                tb_writer.add_summary(summaries, seen_instances)

    tag_prefix = prefix + "_"
    metric_values = [
        tf.Summary.Value(tag=tag_prefix + name, simple_value=value)
        for name, value in eval_result.items()
    ]
    tb_writer.add_summary(tf.Summary(value=metric_values), seen_instances)
def evaluate_players(p1: Player, p2: Player, games_per_battle=100,
                     num_battles=100,
                     writer: tf.summary.FileWriter = None,
                     silent: bool = False):
    """Run a series of battles between two players and collect the outcomes.

    :param p1: first player
    :param p2: second player
    :param games_per_battle: value forwarded to ``battle`` for every battle
    :param num_battles: how many times ``battle`` is called
    :param writer: optional summary writer; when given, the three outcome
        values of each battle are recorded with the 1-based battle number
        as the step
    :param silent: forwarded unchanged to ``battle``
    :return: tuple ``(game_number, p1_wins, p2_wins, draws)`` of lists with
        one entry per battle
    """
    game_number, p1_wins, p2_wins, draws = [], [], [], []

    for battle_index in range(1, num_battles + 1):
        p1win, p2win, draw = battle(p1, p2, games_per_battle, silent)
        p1_wins.append(p1win)
        p2_wins.append(p2win)
        draws.append(draw)
        game_number.append(battle_index)

        if writer is None:
            continue

        outcomes = (('Player 1 Win', p1win),
                    ('Player 2 Win', p2win),
                    ('Draw', draw))
        summary = tf.Summary(value=[
            tf.Summary.Value(tag=tag, simple_value=value)
            for tag, value in outcomes
        ])
        writer.add_summary(summary, battle_index)

    return game_number, p1_wins, p2_wins, draws
def train_step(session: tf.Session,
               model: CharCnnLstm,
               train_info: TrainInfo,
               summary_writer: tf.summary.FileWriter,
               logger: logging.Logger,
               report_step: int = 20):
    """Run one optimisation step and log its loss and gradient norm.

    On every batch whose index is a multiple of ``report_step`` the model's
    ``loss_acc_summary`` is fetched as well, written to ``summary_writer``
    and the log line is emitted at INFO level; on all other batches the line
    is emitted at DEBUG level. ``model.lstm_dropout`` is fed as 0.5.

    :return: the value fetched for ``model.global_step``
    """
    is_report_batch = train_info.batch % report_step == 0

    fetches = [
        model.loss, model.train_op, model.global_norm, model.global_step
    ]
    if is_report_batch:
        fetches.append(model.loss_acc_summary)

    outputs = session.run(fetches, {model.lstm_dropout: 0.5})
    loss_value, _, gradient_norm, step = outputs[:4]

    if is_report_batch:
        summary_writer.add_summary(outputs[4], step)
        log_level = logging.INFO
    else:
        log_level = logging.DEBUG

    elapsed = time.time() - train_info.start_time
    logger.log(
        log_level,
        f'{step:6}: {train_info.epoch} [{train_info.batch:5}/{train_info.nb_of_batches:5}], '
        f'train_loss = {loss_value:6.8f} elapsed = {elapsed:.4f}s, grad.norm={gradient_norm:6.8f}'
    )
    return step
def summarize_epoch(epoch: int, sess: tf.Session,
                    learning_rate: tf.Variable, bn_decay: tf.Variable,
                    loss_sum: float, batches_per_epoch: float,
                    acc_sum: float, train_iou_val: float,
                    train_writer: tf.summary.FileWriter,
                    train_iou_reset: tf.Operation):
    """
    Print and record the training metrics of one finished epoch.

    :param epoch: index of epoch, used as the summary step
    :param sess: tf session
    :param learning_rate: evaluated and stored under the tag "learning_rate"
    :param bn_decay: evaluated and stored under the tag "bn_decay"
    :param loss_sum: accumulated loss, divided by ``batches_per_epoch``
    :param batches_per_epoch: number of batches used in an epoch
    :param acc_sum: accumulated accuracy, divided by ``batches_per_epoch``
    :param train_iou_val: accumulated train iou, reported unchanged
    :param train_writer: train summary writer
    :param train_iou_reset: operation run last to reset the train iou
    :return: None
    """
    lr, bn_d = sess.run([learning_rate, bn_decay])

    epoch_loss = loss_sum / batches_per_epoch
    epoch_acc = acc_sum / batches_per_epoch
    epoch_iou = train_iou_val
    print(
        f"mean loss: {epoch_loss:.4f}\tmean acc: {epoch_acc:.4f}\tmean iou: {epoch_iou:.4f}"
    )

    summary = get_tf_summary(epoch_loss, epoch_acc, epoch_iou)
    for tag, value in (("learning_rate", lr), ("bn_decay", bn_d)):
        summary.value.add(tag=tag, simple_value=value)
    train_writer.add_summary(summary, epoch)

    # reset accumulator
    sess.run(train_iou_reset)
def save_summaries(metrics: Dict[str, float],
                   writer: tf.summary.FileWriter,
                   global_step: int) -> None:
    """Write every metric as a scalar summary at ``global_step`` and flush."""
    summary = tf.Summary()
    for tag, value in metrics.items():
        summary.value.add(tag=tag, simple_value=value)
    writer.add_summary(summary, global_step)
    writer.flush()
def np_scalar_to_summary(tag: str, scalar: np.array, np_step: np.array,
                         summary_file_writer: tf.summary.FileWriter):
    """
    Adds a numpy scalar to the logfile.
    :param tag: The tensorboard plot title.
    :param scalar: The scalar value to be recorded in that plot.
    :param np_step: The x-axis step.
    :param summary_file_writer: The summary writer used to do the recording.
    """
    scalar_value = tf.Summary.Value(tag=tag, simple_value=scalar)
    summary_file_writer.add_summary(tf.Summary(value=[scalar_value]),
                                    global_step=np_step)
def _train_epoch(self, sess, global_step_train, decay_step, thresholds,
                 saver, filewriter: tf.summary.FileWriter, optimizer_vars):
    """Train for one pass over the training batch generator.

    For every batch the learning rate is recomputed from the global step
    (exponential decay) and ``decay_step`` (absolute cosine factor), one
    ``_train_step`` is executed and its summary is written to ``filewriter``.
    If any returned threshold value is NaN, the latest checkpoint from
    ``self.save_dir`` is restored and the optimizer variables are
    re-initialized. Otherwise a checkpoint is saved every ``self.save_each``
    steps, and every ``self.reinit_adam_after_n_batches`` steps the optimizer
    variables are re-initialized and ``decay_step`` is reset to 0.

    :param sess: session used for training, restoring and saving
    :param global_step_train: global step counter at the start of the epoch
    :param decay_step: step counter driving the cosine factor
    :param thresholds: forwarded unchanged to ``_train_step``
    :param saver: saver used to write and restore checkpoints
    :param filewriter: summary writer receiving the per-batch summaries
    :param optimizer_vars: variables re-initialized on optimizer reset
    :return: tuple ``(global_step_train, decay_step, mean_loss)`` where
        ``mean_loss`` is the float mean of the per-batch loss values
    """
    loss_arr_train = []
    reset_optimizer_op = None

    def _any_is_nan(ths_to_check):
        return any(np.isnan(th) for th in ths_to_check)

    def _reset_optimizer():
        # Build the initializer op at most once per call: creating it on
        # every reset would add a new op to the graph each time.
        nonlocal reset_optimizer_op
        if reset_optimizer_op is None:
            reset_optimizer_op = tf.variables_initializer(optimizer_vars)
        sess.run(reset_optimizer_op)

    time.sleep(1)  # prevent overlapping between the output of tqdm and the standard stream output
    for batch, _ in tqdm(
            self.__data_generator_train.generate_batches(self.batch_size)):
        learning_rate_val = self.learning_rate * \
            np.exp(-global_step_train * self.learning_rate_decay) * \
            np.abs(np.cos(np.pi * decay_step / 4 / self.reinit_adam_after_n_batches)) + 10.0 ** -7

        loss_value, summary_node_val, ths_vals = self._train_step(
            sess, batch, learning_rate_val, thresholds)
        loss_arr_train.append(loss_value)
        filewriter.add_summary(summary_node_val, global_step_train)

        global_step_train += 1
        decay_step += 1

        if _any_is_nan(ths_vals):
            # The check is for NaN values even though the message says "None".
            print("Some thresholds is None, restore previous trainable value")
            saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
            _reset_optimizer()
        else:
            if global_step_train % self.save_each == 0:
                saver.save(
                    sess,
                    os.path.join(self.save_dir,
                                 "ckpt{}".format(global_step_train)))
            if global_step_train % self.reinit_adam_after_n_batches == 0:
                _reset_optimizer()
                decay_step = 0

    time.sleep(1)  # prevent overlapping between the output of tqdm and the standard stream output
    return global_step_train, decay_step, float(np.mean(loss_arr_train))
def perform_validation(self, sess, iteration: int,
                       writer: tf.summary.FileWriter,
                       results_dir: str = None):
    """
    Runs the model over the whole test iterator and writes the resulting
    summaries for Tensorboard.

    The test iterator and the metric variables of the default graph are
    re-initialised first. Each test batch is then fed through the model
    output and the update tensors with dropout fed as 0.0, until the iterator
    raises OutOfRangeError. Afterwards every tensor in ``self.scalar_tensors``
    is evaluated and written, the external metrics are computed and the
    predictions are handed to ``Tester.save_predictions``.

    The prepare method must have been called at least once before using this
    method, otherwise, an Exception may occur.

    :param sess: the current session
    :param iteration: the current iteration number over the training data
    :param writer: a FileWriter properly configured
    :param results_dir: (Optional) directory forwarded to
        ``Tester.save_predictions`` together with the iteration and the
        predicted labels
    :return: None
    """
    sess.run(self.test_iterator.initializer)
    metric_variables = tf.get_default_graph().get_collection(
        tf.GraphKeys.METRIC_VARIABLES)
    sess.run(tf.variables_initializer(metric_variables))

    # The model output goes first so that it is always results[0].
    op_list = [self.model.get_output(), *self.update_tensors]

    true_labels, predicted = [], []
    finished = False
    while not finished:
        try:
            test_images, test_target = sess.run([self.test_x, self.test_y])
            feed = {
                self.tensor_x: test_images,
                self.tensor_y: test_target,
                self.model.use_dropout: 0.0
            }
            results = sess.run(op_list, feed_dict=feed)
            true_labels.extend(test_target)
            predicted.extend(results[0])
        except OutOfRangeError:
            print("Finished validation of iteration {}...".format(iteration))
            finished = True

    for scalar_tensor in self.scalar_tensors:
        writer.add_summary(sess.run(scalar_tensor), iteration)

    self._calculate_external_metrics(sess, iteration, writer, true_labels,
                                     predicted)
    Tester.save_predictions(iteration, results_dir, predicted)
def _train_step(self, obs, states, rewards, masks, actions, values, update,
                writer: tf.summary.FileWriter = None, features=None,
                rewards_bonuses=None):
    """
    applies a training step to the model

    :param obs: ([float]) The input observations
    :param states: ([float]) The states (used for recurrent policies)
    :param rewards: ([float]) The rewards from the environment
    :param masks: ([bool]) Whether or not the episode is over (used for recurrent policies)
    :param actions: ([float]) The actions taken
    :param values: ([float]) The logits values
    :param update: (int) the current step iteration
    :param writer: (TensorFlow Summary.writer) the writer for tensorboard
    :param features: value fed to ``self.successor_feature_ph``
    :param rewards_bonuses: bonuses added to ``rewards`` before feeding
        ``self.rewards_ph``. They are replaced by zeros when ``self.use_sf``
        is false; ``None`` is treated as "no bonus".
    :return: (float, float, float, float) policy loss, value loss, policy
        entropy, successor-feature loss (``self.sf_loss``)
    """
    advs = rewards - values

    # The schedule is advanced once per observation; the last value is used.
    cur_lr = None
    for _ in range(len(obs)):
        cur_lr = self.learning_rate_schedule.value()
    assert cur_lr is not None, "Error: the observation input array cannon be empty"

    if rewards_bonuses is None:
        # The default used to end up as `rewards + None` (TypeError) when
        # use_sf is set, or as an object-dtype zero otherwise.
        rewards_bonuses = np.zeros_like(rewards)
    elif not self.use_sf:
        rewards_bonuses = np.zeros_like(rewards_bonuses)

    td_map = {
        self.train_model.obs_ph: obs,
        self.actions_ph: actions,
        self.advs_ph: advs,
        self.rewards_ph: rewards + rewards_bonuses,
        self.learning_rate_ph: cur_lr,
        self.successor_feature_ph: features,
    }
    if states is not None:
        td_map[self.train_model.states_ph] = states
        td_map[self.train_model.masks_ph] = masks

    loss_fetches = [self.pg_loss, self.vf_loss, self.entropy,
                    self.apply_backprop, self.sf_loss]

    if writer is not None:
        summary_step = update * (self.n_batch + 1)
        # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...)
        if (1 + update) % 10 == 0:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
            summary, policy_loss, value_loss, policy_entropy, _, sf_loss = self.sess.run(
                [self.summary] + loss_fetches, td_map,
                options=run_options, run_metadata=run_metadata)
            writer.add_run_metadata(run_metadata, 'step%d' % summary_step)
        else:
            summary, policy_loss, value_loss, policy_entropy, _, sf_loss = self.sess.run(
                [self.summary] + loss_fetches, td_map)
        writer.add_summary(summary, summary_step)
    else:
        policy_loss, value_loss, policy_entropy, _, sf_loss = self.sess.run(
            loss_fetches, td_map)

    return policy_loss, value_loss, policy_entropy, sf_loss
def train_network(network: DeepRegretNetwork, advantage_memory: buffer.Reservoir,
                  action_indexer: neural_game.ActionIndexer,
                  info_set_vectoriser: neural_game.InfoSetVectoriser,
                  current_time: int, writer: tf.summary.FileWriter,
                  batch_size=1024, num_sgd_updates=4000):
    """Trains the given network from scratch

    Args:
        network: DeepRegretNetwork. The network to train.
        advantage_memory: Reservoir. Each entry should be an AdvantageMemoryElement.
        action_indexer: ActionIndexer. Turns actions into indices.
        info_set_vectoriser: InfoSetVectoriser. Turns information set ids into vectors.
        current_time: int. The current time.
        writer: tf.summary.FileWriter. Receives the summary of every update,
            with `network.global_step` as the step.
        batch_size: int. The size to use for each batch.
        num_sgd_updates: int. The maximum number of sgd updates to do; training
            stops earlier when `early_stopping_water_mark` fires.

    Returns:
        min_loss: The smallest loss recorded over the updates (`np.min` of
            the per-update losses).
    """
    # First reset the network.
    network.initialise()

    losses = []
    print("Training.")

    # Build the index array once; passing a Python list to np.random.choice
    # would convert the whole list to an array on every update.
    indices = np.arange(len(advantage_memory))
    for _ in tqdm(range(num_sgd_updates)):
        # Sample a batch uniformly, with replacement, from the advantage memory.
        batch_indices = np.random.choice(indices, batch_size, replace=True)
        batch = advantage_memory.get_elements(batch_indices)
        loss, summary = network.train(batch, action_indexer, info_set_vectoriser,
                                      current_time=current_time)
        writer.add_summary(summary, network.global_step)
        losses.append(loss)

        # Early stopping.
        if early_stopping_water_mark(losses, num_attempts=20):
            print("Losses: {}".format(losses))
            print("Early stopping.")
            break

    print("Losses % through the data: {}".format(
        [losses[int(frac / 100 * len(losses))] for frac in [0.0, 25.0, 50.0, 75.0, 99.99]]
    ))
    return np.min(losses)
def save_loss(sess, loss, iteration: int, writer: tf.summary.FileWriter):
    """
    Records the loss of the current iteration in the TensorBoard summary file.

    The tensors named 'loss_tensor:0' and 'loss:0' are looked up in the
    default graph; the loss value is fed into the former and the result of
    evaluating the latter is written as the summary.

    :param sess: the current session
    :param loss: the loss value for the current iteration
    :param iteration: the current iteration number over the training data
    :param writer: a FileWriter properly configured
    :return: None
    """
    graph = tf.get_default_graph()
    loss_input = graph.get_tensor_by_name('loss_tensor:0')
    loss_summary = graph.get_tensor_by_name('loss:0')
    writer.add_summary(
        sess.run(loss_summary, feed_dict={loss_input: loss}), iteration)
def _calculate_external_metrics(self, sess, iteration: int,
                                writer: tf.summary.FileWriter, true_labels,
                                predictions):
    """
    Computes the metrics that rely on external libraries and stores them on
    TensorBoard. Currently this is the macro-averaged ROC AUC, which is fed
    through the "AUC_AUX" auxiliary tensor into the "AUC" summary tensor.

    :param sess: the current session
    :param iteration: the current iteration number over the training data
    :param writer: a FileWriter properly configured
    :param true_labels: array-like structure with the true labels
    :param predictions: array-like structure with the predicted labels. Must
        be between 0 and 1
    :return: None (the computed value is only written to the summary)
    """
    auc_value = metrics.roc_auc_score(true_labels, predictions,
                                      average='macro')
    auc_summary_tensor = self.external_tensors["AUC"]
    feed = {self.aux_tensors["AUC_AUX"]: auc_value}
    writer.add_summary(sess.run(auc_summary_tensor, feed_dict=feed),
                       iteration)
def save_episode_to_summary(summary_writer: tensorflow.summary.FileWriter,
                            episode: int, step: int, time: float,
                            reward: float, epsilon: float) -> None:
    """
    Writes the statistics of one episode to the summary file and flushes it

    NOTE: to view summary execute
    "tensorboard --logdir output/[algorithm]/tensorboard_summary"

    :param summary_writer: summary writer
    :param episode: number of episode, used as the summary step
    :param step: total steps of episode (stored as int)
    :param time: time needed to complete episode (stored as float)
    :param reward: total reward received in episode (stored as float)
    :param epsilon: value of epsilon at the end of episode (stored as float)
    """
    # collect the episode values under their tags
    episode_values = (
        ('Reward', float(reward)),
        ('Step', int(step)),
        ('Time', float(time)),
        ('Epsilon', float(epsilon)),
    )
    summary = tensorflow.Summary()
    for tag, value in episode_values:
        summary.value.add(tag=tag, simple_value=value)

    # hand the summary to the file writer and force it to disk
    summary_writer.add_summary(summary, episode)
    summary_writer.flush()
def run_validations(session: tf.Session, model: CharCnnLstm, vocab: Vocab,
                    target_tensor: tf.Tensor,
                    summary_writer: tf.summary.FileWriter, step: int,
                    logger: logging.Logger):
    """Evaluate the model until the input is exhausted and report the results.

    Batches are run with ``model.lstm_dropout`` fed as 0 until
    ``tf.errors.OutOfRangeError`` is raised. The mean batch loss and the
    accuracy over all targets different from 0 are logged together with a
    classification report, then written to ``summary_writer`` under the tags
    'loss' and 'accuracy'. The variable summaries fetched with the last batch
    are written as well.
    """
    fetches = [
        model.loss, target_tensor, model.predictions,
        model.variable_summaries
    ]
    feed = {model.lstm_dropout: 0.}

    target_chunks, prediction_chunks = [], []
    loss = 0
    nb_batches = 0
    exhausted = False
    while not exhausted:
        try:
            batch_loss, batch_target, batch_predictions, variable_summaries = session.run(
                fetches, feed)
            loss += batch_loss
            target_chunks.append(batch_target.ravel())
            prediction_chunks.append(batch_predictions.ravel())
            nb_batches += 1
        except tf.errors.OutOfRangeError:
            exhausted = True

    targets = np.concatenate(target_chunks)
    predictions = np.concatenate(prediction_chunks)
    loss /= nb_batches

    # Targets equal to 0 are left out of the accuracy.
    counted = targets != 0
    accuracy = np.sum((predictions == targets) & counted) / np.sum(counted)

    logger.info(
        f'Validation loss = {loss:6.8f}, validation accuracy = {accuracy:6.8f}'
    )
    logger.info('\n' +
                classification_report_with_labels(targets, predictions, vocab))

    scalar_values = [
        tf.Summary.Value(tag='loss', simple_value=loss),
        tf.Summary.Value(tag='accuracy', simple_value=accuracy),
    ]
    summary_writer.add_summary(tf.Summary(value=scalar_values), step)
    summary_writer.add_summary(variable_summaries, step)
def perform_validation(self, sess, iteration: int,
                       writer: tf.summary.FileWriter):
    """
    Runs the streaming accuracy metric over the whole test iterator and
    writes the resulting summary for Tensorboard.

    The tensors 'accuracy_metric/update_op:0' and 'accuracy:0' are looked up
    in the default graph. After re-initialising the test iterator and the
    metric variables, the update op is run on every test batch (dropout fed
    as 0.0) until OutOfRangeError is raised; the accuracy summary is then
    evaluated and written.

    The prepare method must have been called at least once before using this
    method, otherwise, an Exception may occur.

    :param sess: the current session
    :param iteration: the current iteration number over the training data
    :param writer: a FileWriter properly configured
    :return: None
    """
    graph = tf.get_default_graph()
    accuracy_update = graph.get_tensor_by_name('accuracy_metric/update_op:0')
    accuracy_summary = graph.get_tensor_by_name('accuracy:0')

    sess.run(self.test_iterator.initializer)
    metric_variables = tf.get_default_graph().get_collection(
        tf.GraphKeys.METRIC_VARIABLES)
    sess.run(tf.variables_initializer(metric_variables))

    finished = False
    while not finished:
        try:
            test_images, test_target = sess.run([self.test_x, self.test_y])
            feed = {
                self.tensor_x: test_images,
                self.tensor_y: test_target,
                self.model.use_dropout: 0.0
            }
            sess.run([accuracy_update], feed_dict=feed)
        except OutOfRangeError:
            print("Finished validation of iteration {}...".format(iteration))
            finished = True

    writer.add_summary(sess.run(accuracy_summary), iteration)
def eval_model(is_training: tf.Variable, sess: tf.Session, best_iou: float,
               val_loss: tf.Tensor, val_acc: tf.Tensor,
               val_iou_update: tf.Operation, val_iou: tf.Tensor,
               val_iou_reset: tf.Operation,
               val_writer: tf.summary.FileWriter, epoch: int,
               saver: tf.train.Saver) -> float:
    """
    runs one pass over the validation set and reports the averaged metrics

    The training flag is switched off for the duration of the pass and
    switched back on at the end. A checkpoint is written whenever the
    accumulated iou exceeds ``best_iou``.

    :param is_training: tf var which indicates if model is training
    :param sess: tf sess
    :param best_iou: best validation iou until now
    :param val_loss: val loss tensor
    :param val_acc: val accuracy tensor
    :param val_iou_update: val iou update operation
    :param val_iou: val iou tensor
    :param val_iou_reset: val iou reset operation
    :param val_writer: val summary writer
    :param epoch: index of current epoch
    :param saver: tf model saver
    :return: new best iou
    """
    acc_sum = 0
    loss_sum = 0

    # switch the model to evaluation mode
    sess.run(is_training.assign(False))

    val_batches = N_VAL_SAMPLES // BATCH_SIZE
    print(f"starting evaluation {val_batches} batches")

    fetches = [val_loss, val_acc, val_iou_update, val_iou]
    for j in range(val_batches):
        loss_val, acc_val, _, val_iou_val = sess.run(fetches)
        print(
            f"\tevaluation epoch: {epoch:03d}\tbatch {j:03d} eval:"
            f"\tloss: {loss_val:.4f}\taccuracy: {acc_val:.4f}\taccumulated iou {val_iou_val:.4f}"
        )
        acc_sum += acc_val
        loss_sum += loss_val

    # averaged metrics; the iou is the value fetched with the last batch
    loss = loss_sum / val_batches
    acc = acc_sum / val_batches
    iou = val_iou_val
    val_writer.add_summary(get_tf_summary(loss, acc, iou), epoch)
    print(
        f"evaluation:\tmean loss: {loss:.4f}\tmean acc: {acc:.4f}\tmean iou {iou:.4f}\n"
    )

    # keep a checkpoint of the best model so far
    if iou > best_iou:
        best_iou = iou
        checkpoint_path = os.path.join(LOG_DIR + "_train",
                                       f"best_model_epoch_{epoch:03d}.ckpt")
        save_path = saver.save(sess, checkpoint_path)
        print(f"Model saved in file: {save_path}\n")

    # clear the iou accumulator
    sess.run(val_iou_reset)

    # switch the model back to training mode
    sess.run(is_training.assign(True))
    return best_iou
def train(self, sess: tf.Session, train_X, train_y, valid_X, valid_y,
          train_summary_writer: tf.summary.FileWriter = None,
          valid_summary_writer: tf.summary.FileWriter = None,
          saver: tf.train.Saver = None, ckpt_dir=None, no_embedding=False):
    """Train the model batch by batch with periodic validation.

    Training batches are drawn until the batch source raises ``IndexError``,
    which ends the current epoch; the loop stops after
    ``train_options.max_epoch`` epochs or as soon as ``check_early_stop``
    returns true. Every ``train_options.check_steps`` global steps the model
    is evaluated on the validation data; when the validation loss is lower
    than the last recorded best, the early-stop counter is reset and, if both
    ``saver`` and ``ckpt_dir`` are given, the model is saved.

    :param sess: session used for all runs
    :param train_X: training inputs
    :param train_y: training targets
    :param valid_X: validation inputs
    :param valid_y: validation targets
    :param train_summary_writer: optional writer for the training summaries
    :param valid_summary_writer: optional writer forwarded to ``evaluate``
    :param saver: optional saver used when the validation loss improves
    :param ckpt_dir: optional checkpoint directory passed to ``save``
    :param no_embedding: run ``train_op_no_embedding`` instead of ``train_op``
    """
    self.logger.info("begin to train...")
    options = self.train_options
    train_data = BatchDataSet(train_X, train_y, options.batch_size,
                              options.over_sample)
    valid_data = BatchDataSet(valid_X, valid_y, options.batch_size,
                              options.over_sample)
    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    sess.run(tf.variables_initializer(self.every_train_reset_ops))

    early_stop_cnt = 0
    valid_acc = [1e-18]
    valid_losses = [1e18]
    i_epoch = 0
    while i_epoch < self.train_options.max_epoch:
        if check_early_stop(early_stop_cnt, self.train_options.patient):
            self.logger.info("early stop!!")
            break
        try:
            x_idx, y_idx = train_data.get_next()
            feed_dict = {
                self.x_idx: x_idx,
                self.y_idx: y_idx,
                self.dropout_keep_prob: self.train_options.dropout_keep_prob
            }
            train_op = (self.train_op_no_embedding
                        if no_embedding else self.train_op)
            _, global_step, loss, predictions = sess.run(
                [train_op, self.global_step, self.loss, self.predictions],
                feed_dict,
                options=run_options)

            accuracy, sc_pre_cnt, sc_pre_right, sc_cnt = cal_accuracy(
                predictions, y_idx)
            feed_dict[self.temp_loss] = loss
            feed_dict[self.temp_acc] = accuracy
            summaries = sess.run(self.train_summary_op, feed_dict)

            sc_pre_rate = 0 if sc_pre_cnt == 0 else sc_pre_right / sc_pre_cnt
            sc_recall = 0 if sc_cnt == 0 else sc_pre_right / sc_cnt
            self.logger.info(
                "[Train] epoch {}, step {}, nb_batch {}, loss {:g}, acc {:g}. "
                "sc_pre_cnt {}, sc_pre_rate {:g}, sc_recall {:g}".format(
                    i_epoch, global_step, len(y_idx), loss, accuracy,
                    sc_pre_cnt, sc_pre_rate, sc_recall))

            if train_summary_writer:
                train_summary_writer.add_summary(summaries, global_step)

            if global_step % self.train_options.check_steps == 0:
                val_accuracy, val_loss = self.evaluate(
                    sess,
                    valid_data,
                    epoch=i_epoch,
                    global_step=global_step,
                    valid_summary_writer=valid_summary_writer)
                early_stop_cnt += 1
                if val_loss < valid_losses[-1]:
                    # remember the best validation result seen so far
                    valid_acc.append(val_accuracy)
                    valid_losses.append(val_loss)
                    early_stop_cnt = 0
                    if saver and ckpt_dir:
                        self.save(sess, saver, ckpt_dir, global_step)
                        self.logger.info("model improving and saved !")
        except IndexError:
            # Raised when the training data is exhausted: start a new epoch.
            self.logger.info('done reading train data.')
            train_data.init_iterator()
            i_epoch += 1
def train_iterations(sess: tf.Session,
                     model: models.basics.BasicSiamese,
                     batch_data: data.BatchData,
                     pairs: pd.DataFrame,
                     summary_writer: tf.summary.FileWriter,
                     batch_size: int,
                     epochs: int):
    """
    Run the training loop over all epochs.

    All global variables are initialised first. Every batch runs the model's
    minimizer; on every fifth iteration (counted across epochs) the summary
    is fetched too, a progress line is logged and the summary is written.

    :param sess: Tensorflow session
    :param model: Model with a :func:`models.BasicModel.feed_dict` method to get the ``feed_dict`` for
                  ``sess.run(...)``
    :param batch_data: Object that generates the training batches through its ``batches`` method
    :param pairs: Pairs that can be trained; forwarded to ``batch_data.batches``
    :param summary_writer: Summary writer provided by Tensorflow to show the training progress
    :param batch_size: Batch size for training, forwarded to ``batch_data.batches``
    :param epochs: Number of epochs, passes through the complete dataset, should be done when training
    """
    sess.run(tf.global_variables_initializer())

    # Counts the batches processed across all epochs
    final_iterations = 0
    for epoch in range(epochs):
        rotations = settings.TOTAL_ROTATIONS if model.uses_images() else 1
        total_pairs = len(pairs)*rotations

        batches = batch_data.batches(pairs,
                                     batch_size=batch_size,
                                     load_images=model.uses_images(),
                                     train=True)
        for i, batch in enumerate(batches):
            total_pairs -= len(batch.pairs)

            # The summary is only fetched and written once every 5 iterations
            write_summary = final_iterations % 5 == 0
            fetches = [model.minimizer, model.c_index, model.total_loss]
            if write_summary:
                fetches.append(model.summary)

            results = sess.run(fetches, feed_dict=model.feed_dict(batch))
            c_index_result, loss = results[1], results[2]

            if write_summary:
                logger.info(f"Epoch: {epoch:>3}, Batch: {i:>4}, size: {len(batch.pairs):>5}, remaining: "
                            f"{total_pairs:>6}, "
                            f"c-index: {c_index_result:>#5.3}, loss: {loss:>#5.3}")
                summary_writer.add_summary(results[3], final_iterations)

            final_iterations += 1