def evaluate(hps):
  """Eval loop: restore the newest checkpoint, measure precision, repeat.

  Polls FLAGS.log_root for checkpoints, evaluates FLAGS.eval_batch_count
  batches per checkpoint, and writes Precision / Best Precision summaries
  to FLAGS.eval_dir.

  Args:
    hps: resnet_model.HParams describing the network to evaluate.
  """
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode)
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()
  saver = tf.train.Saver()
  summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

  sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  tf.train.start_queue_runners(sess)

  best_precision = 0.0
  while True:
    try:
      ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    except tf.errors.OutOfRangeError as e:
      tf.logging.error('Cannot restore checkpoint: %s', e)
      # Bug fix: back off before retrying; the bare `continue` busy-spun
      # on the checkpoint directory as fast as the CPU allowed.
      time.sleep(60)
      continue
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
      tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
      # Bug fix: wait for the trainer to produce a first checkpoint
      # instead of spinning.
      time.sleep(60)
      continue
    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    total_prediction, correct_prediction = 0, 0
    for _ in six.moves.range(FLAGS.eval_batch_count):
      (loss, predictions, truth, train_step) = sess.run(
          [model.cost, model.predictions, model.labels, model.global_step])

      # Compare argmax class indices of one-hot labels vs. predictions.
      truth = np.argmax(truth, axis=1)
      predictions = np.argmax(predictions, axis=1)
      correct_prediction += np.sum(truth == predictions)
      total_prediction += predictions.shape[0]

    precision = 1.0 * correct_prediction / total_prediction
    best_precision = max(precision, best_precision)

    precision_summ = tf.Summary()
    precision_summ.value.add(
        tag='Precision', simple_value=precision)
    summary_writer.add_summary(precision_summ, train_step)
    best_precision_summ = tf.Summary()
    best_precision_summ.value.add(
        tag='Best Precision', simple_value=best_precision)
    summary_writer.add_summary(best_precision_summ, train_step)
    tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                    (loss, precision, best_precision))
    summary_writer.flush()

    if FLAGS.eval_once:
      break
    time.sleep(60)
def main(_):
  """Entry point: initialize config, build the eval pipeline, and evaluate."""
  config_initialization()
  # Evaluation batch stream for the configured dataset.
  eval_images, eval_labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.eval_data_path, FLAGS.batch_size, mode='eval')
  # NOTE(review): `eval` here presumably refers to a project-level
  # evaluation routine defined elsewhere, not the Python builtin — confirm.
  eval(eval_images, eval_labels)
def main(train_dir, batch_size, num_batches, log_dir, checkpoint_dir=None):
  """Continuously evaluate the network on the CIFAR-10 test split.

  Args:
    train_dir: unused; kept for signature compatibility.
    batch_size: unused; the input pipeline is built with batch size 100.
    num_batches: batches evaluated per evaluation pass.
    log_dir: directory for eval summaries; also the default checkpoint dir.
    checkpoint_dir: directory polled for checkpoints; defaults to log_dir.
  """
  if checkpoint_dir is None:
    checkpoint_dir = log_dir
  with tf.device('/cpu:0'):
    images, labels = build_input('cifar10', 100, 'test')
    predictions, total_loss = network(images, labels)
    tf.summary.scalar('loss', total_loss)

    # Per-batch accuracy of the argmax class predictions.
    predictions = tf.to_int32(tf.argmax(predictions, 1))
    tf.summary.scalar('accuracy',
                      slim.metrics.accuracy(predictions, labels))

    # Streaming metrics maintain a "running" value across eval batches.
    metrics_to_values, metrics_to_updates = slim.metrics.aggregate_metric_map({
        'accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        'streaming_mse': slim.metrics.streaming_mean_squared_error(
            predictions, labels),
    })

    # One scalar summary per streaming metric.
    for metric_name, metric_value in metrics_to_values.items():
      tf.summary.scalar(metric_name, metric_value)

    # Re-evaluate every 60 seconds, effectively forever.
    slim.evaluation.evaluation_loop(
        '',
        checkpoint_dir,
        log_dir,
        num_evals=num_batches,
        eval_op=list(metrics_to_updates.values()),
        summary_op=tf.summary.merge_all(),
        eval_interval_secs=60,
        max_number_of_evaluations=100000000)
def __init__(self, data, dataset, eval_batch_count, eval_dir):
    """Create a CPU-only evaluation actor for the ResNet model."""
    # Hide every GPU: evaluation runs entirely on the CPU.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    num_classes = 100 if dataset == "cifar100" else 10
    hps = resnet_model.HParams(
        batch_size=100,
        num_classes=num_classes,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer="mom",
        num_gpus=0)
    with tf.device("/cpu:0"):
        # Build the evaluation network graph.
        images, labels = cifar_input.build_input(
            data, hps.batch_size, dataset, False)
        self.model = resnet_model.ResNet(hps, images, labels, "eval")
        self.model.build_graph()
        config = tf.ConfigProto(allow_soft_placement=True)
        # Grow GPU memory on demand (harmless here since no GPU is visible).
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        self.model.variables.set_session(sess)
        sess.run(tf.global_variables_initializer())
    # Bookkeeping used when reporting results to tensorboard.
    self.best_precision = 0.0
    self.eval_batch_count = eval_batch_count
    self.summary_writer = tf.summary.FileWriter(eval_dir, sess.graph)
    # The IP address where tensorboard logs will be served from.
    self.ip_addr = ray.services.get_node_ip_address()
def __init__(self, data, dataset, eval_batch_count, eval_dir):
    """Create an evaluation actor fed from an in-memory (images, labels) pair."""
    hps = resnet_model.HParams(
        batch_size=100,
        num_classes=100 if dataset == "cifar100" else 10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer="mom",
        num_gpus=0)
    raw_images, raw_labels = data[0], data[1]
    with tf.device("/cpu:0"):
        # Build the evaluation network on the CPU.
        images, labels = cifar_input.build_input(
            [raw_images, raw_labels], hps.batch_size, dataset, False)
        self.model = resnet_model.ResNet(hps, images, labels, "eval")
        self.model.build_graph()
        config = tf.ConfigProto(allow_soft_placement=True)
        sess = tf.Session(config=config)
        self.model.variables.set_session(sess)
        # Start the input queue threads that feed eval batches.
        self.coord = tf.train.Coordinator()
        tf.train.start_queue_runners(sess, coord=self.coord)
        sess.run(tf.global_variables_initializer())
    # Bookkeeping used when reporting results to tensorboard.
    self.best_precision = 0.0
    self.eval_batch_count = eval_batch_count
    self.summary_writer = tf.summary.FileWriter(eval_dir, sess.graph)
    # The IP address where tensorboard logs will be served from.
    self.ip_addr = ray.services.get_node_ip_address()
def evaluate(hps):
  """Eval loop: evaluate every saved checkpoint once and log precision.

  Iterates all checkpoints recorded in FLAGS.ckpt_dir, computing precision
  over FLAGS.eval_batch_count batches per checkpoint, and writes precision
  and best-precision summaries to FLAGS.eval_dir.

  Args:
    hps: resnet_model.HParams describing the network.
  """
  images, labels = cifar_input.build_input('cifar10', FLAGS.eval_data_path,
                                           hps.batch_size, 'eval')
  model = resnet_model.ResNet(hps, images, labels, 'eval')
  model.build_graph()

  sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  saver = tf.train.Saver()
  summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

  try:
    ckpt_state = tf.train.get_checkpoint_state(FLAGS.ckpt_dir)
  except tf.errors.OutOfRangeError as e:
    tf.logging.error('Cannot restore checkpoint: %s', e)
    # Bug fix: without this return the code below dereferenced an
    # undefined checkpoint state after the error.
    return
  if not (ckpt_state and ckpt_state.all_model_checkpoint_paths):
    tf.logging.info('No model to eval yet at %s', FLAGS.ckpt_dir)
    # Bug fix: bail out instead of falling through to a None dereference.
    return

  best_precision = 0.
  for ckpt_path in ckpt_state.all_model_checkpoint_paths:
    tf.logging.info('Loading checkpoint %s', ckpt_path)
    saver.restore(sess, ckpt_path)
    total_prediction, correct_prediction = 0, 0
    for _ in six.moves.range(FLAGS.eval_batch_count):
      (summaries, loss, predictions, truth, train_step) = sess.run([
          model.summaries, model.cost, model.predictions, model.labels,
          model.global_step
      ])
      # Compare argmax class indices of one-hot labels vs. predictions.
      truth = np.argmax(truth, axis=1)
      predictions = np.argmax(predictions, axis=1)
      correct_prediction += np.sum(truth == predictions)
      total_prediction += predictions.shape[0]

    precision = 1.0 * correct_prediction / total_prediction
    best_precision = max(precision, best_precision)

    # Write precision and running-best precision for tensorboard.
    summ_precision = tf.Summary()
    summ_precision.value.add(tag='precision', simple_value=precision)
    summary_writer.add_summary(summ_precision, train_step)

    summ_best_precision = tf.Summary()
    summ_best_precision.value.add(tag='best_precision',
                                  simple_value=best_precision)
    summary_writer.add_summary(summ_best_precision, train_step)

    tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                    (loss, precision, best_precision))
    summary_writer.flush()
def _train_resnet_model(self):
    """Shared functionality for different resnet model_fn.

    Builds the CIFAR-10 input pipeline, the forward pass, the loss
    (cross entropy plus L2 weight decay) and a momentum-SGD train op,
    optionally with loss scaling for reduced-precision training.

    Returns:
        A (train_op, total_loss, predictions, global_step) tuple.
    """
    data_path = get_filenames(is_training=True, data_dir=cifar10_data_path)
    features, labels = cifar_input.build_input(
        dataset='cifar10', data_path=data_path, batch_size=32, mode='train')
    tf.summary.image('images', features, max_outputs=6)

    features = tf.cast(features, dtype=self.dtype)
    logits = self.network(features, training=True)
    # Compute the loss in float32 even when the model runs in a
    # lower-precision dtype.
    logits = tf.cast(logits, tf.float32)
    predictions = {
        'classes': tf.argmax(logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    cross_entropy = tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                           labels=labels)
    # Create a tensor named cross_entropy for logging purposes.
    tf.identity(cross_entropy, name='cross_entropy')
    tf.summary.scalar('cross_entropy', cross_entropy)

    # Add weight decay to the loss.
    l2_loss = self._weight_decay * tf.add_n([
        tf.nn.l2_loss(tf.cast(v, tf.float32))
        for v in tf.trainable_variables()
    ])
    tf.summary.scalar('l2_loss', l2_loss)
    total_loss = cross_entropy + l2_loss

    global_step = tf.train.get_or_create_global_step()
    learning_rate = self._learning_rate_fn(global_step)
    # Create a tensor named learning_rate for logging purposes.
    tf.identity(learning_rate, name='learning_rate')
    tf.summary.scalar('learning_rate', learning_rate)
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=0.9)

    if self._loss_scale != 1:
        # Scale the loss up before computing gradients (avoids underflow
        # in low precision), then unscale the gradients before applying.
        scaled_grad_vars = optimizer.compute_gradients(
            total_loss * self._loss_scale)
        unscaled_grad_vars = [(grad / self._loss_scale, var)
                              for grad, var in scaled_grad_vars]
        # Bug fix: pass global_step so it increments on every update,
        # matching the unscaled branch below; otherwise the learning-rate
        # schedule never advances under loss scaling.
        minimize_op = optimizer.apply_gradients(unscaled_grad_vars,
                                                global_step)
    else:
        minimize_op = optimizer.minimize(total_loss, global_step)
    # Run UPDATE_OPS (e.g. batch-norm statistics) together with the step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    return train_op, total_loss, predictions, global_step
def train(hps):
  """Training loop."""
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()

  summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)
  sv = tf.train.Supervisor(logdir=FLAGS.log_root,
                           is_chief=True,
                           summary_op=None,
                           save_summaries_secs=60,
                           save_model_secs=300,
                           global_step=model.global_step)
  sess = sv.prepare_or_wait_for_session()

  step = 0
  precision = 0.0
  lrn_rate = 0.1

  while not sv.should_stop():
    (_, summaries, loss, predictions, truth, train_step) = sess.run(
        [model.train_op, model.summaries, model.cost, model.predictions,
         model.labels, model.global_step],
        feed_dict={model.lrn_rate: lrn_rate})

    # Piecewise-constant learning-rate schedule keyed on the global step.
    if train_step < 40000:
      lrn_rate = 0.1
    elif train_step < 60000:
      lrn_rate = 0.01
    elif train_step < 80000:
      lrn_rate = 0.001
    else:
      lrn_rate = 0.0001

    # Precision of the current batch: fraction of argmax matches.
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(truth, axis=1)
    matches = sum(
        1 for t, p in zip(true_classes, predicted_classes) if t == p)
    precision = float(matches) / len(true_classes)

    step += 1
    if step % 100 == 0:
      precision_summ = tf.Summary()
      precision_summ.value.add(tag='Precision', simple_value=precision)
      summary_writer.add_summary(precision_summ, train_step)
      summary_writer.add_summary(summaries, train_step)
      tf.logging.info('loss: %.3f, precision: %.3f\n' % (loss, precision))
      summary_writer.flush()
  sv.Stop()
def evaluate(hps):
  """Eval loop: poll for new checkpoints once a minute and report precision.

  Args:
    hps: resnet_model.HParams describing the network to evaluate.
  """
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode)
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()
  saver = tf.train.Saver()
  summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir)

  sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  tf.train.start_queue_runners(sess)

  best_precision = 0.0
  while True:
    time.sleep(60)
    try:
      ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    except tf.errors.OutOfRangeError as e:
      tf.logging.error('Cannot restore checkpoint: %s', e)
      continue
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
      tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
      continue
    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    total_prediction, correct_prediction = 0, 0
    # Bug fix: use six.moves.range instead of the Python-2-only xrange so
    # the loop also runs under Python 3 (consistent with the other eval
    # loops in this file).
    for _ in six.moves.range(FLAGS.eval_batch_count):
      (summaries, loss, predictions, truth, train_step) = sess.run(
          [model.summaries, model.cost, model.predictions,
           model.labels, model.global_step])

      best_predictions = np.argmax(predictions, axis=1)
      truth = np.argmax(truth, axis=1)
      for (t, p) in zip(truth, best_predictions):
        if t == p:
          correct_prediction += 1
        total_prediction += 1

    precision = 1.0 * correct_prediction / total_prediction
    best_precision = max(precision, best_precision)

    precision_summ = tf.Summary()
    precision_summ.value.add(
        tag='Precision', simple_value=precision)
    summary_writer.add_summary(precision_summ, train_step)
    best_precision_summ = tf.Summary()
    best_precision_summ.value.add(
        tag='Best Precision', simple_value=best_precision)
    summary_writer.add_summary(best_precision_summ, train_step)
    summary_writer.add_summary(summaries, train_step)
    tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f\n' %
                    (loss, precision, best_precision))
    summary_writer.flush()

    if FLAGS.eval_once:
      break
def main(train_dir, batch_size, num_batches, log_dir):
  """Train the network with plain SGD, checkpointing via slim.

  Args:
    train_dir, batch_size, num_batches: unused; kept for signature
      compatibility with the other entry points.
    log_dir: directory for checkpoints and summaries.
  """
  inputs, targets = build_input('cifar10', 100, 'train')
  predictions, total_loss = network(inputs, targets)
  tf.summary.scalar('loss', total_loss)

  sgd = tf.train.GradientDescentOptimizer(0.1)
  train_op = slim.learning.create_train_op(
      total_loss, sgd, summarize_gradients=True)
  slim.learning.train(train_op, log_dir,
                      save_summaries_secs=20,
                      save_interval_secs=20)
def main(train_dir, batch_size, num_batches, log_dir, checkpoint_dir=None):
  """Continuously evaluate the multi-head network on the test split.

  Args:
    train_dir: unused; kept for signature compatibility.
    batch_size: unused; the input pipeline is built with batch size 100.
    num_batches: batches evaluated per evaluation pass.
    log_dir: directory for eval summaries; also the default checkpoint dir.
    checkpoint_dir: directory polled for checkpoints; defaults to log_dir.
  """
  if checkpoint_dir is None:
    checkpoint_dir = log_dir
  with tf.device('/cpu:0'):
    images, labels = build_input('cifar10', 100, 'test')
    (logits, logits_cat1, logits_cat2, loss, loss_cat1, loss_cat2,
     labels_cat1, labels_cat2) = network(images, labels)

    # One scalar summary per loss head.
    tf.summary.scalar('losses/loss', loss)
    tf.summary.scalar('losses/loss_cat1', loss_cat1)
    tf.summary.scalar('losses/loss_cat2', loss_cat2)

    # Predicted class indices for each head.
    logits = tf.argmax(logits, axis=1)
    logits_cat1 = tf.argmax(logits_cat1, axis=1)
    logits_cat2 = tf.argmax(logits_cat2, axis=1)

    tf.summary.scalar('accuracy',
                      slim.metrics.accuracy(logits, tf.to_int64(labels)))
    tf.summary.scalar(
        'accuracy_cat_1',
        slim.metrics.accuracy(logits_cat1, tf.to_int64(labels_cat1)))
    tf.summary.scalar(
        'accuracy_cat_2',
        slim.metrics.accuracy(logits_cat2, tf.to_int64(labels_cat2)))

    # Streaming metrics maintain a "running" value across eval batches.
    metrics_to_values, metrics_to_updates = (
        slim.metrics.aggregate_metric_map({
            'accuracies/accuracy':
                slim.metrics.streaming_accuracy(logits, labels),
            'accuracies/accuracy_cat_1':
                slim.metrics.streaming_accuracy(logits_cat1, labels_cat1),
            'accuracies/accuracy_cat_2':
                slim.metrics.streaming_accuracy(logits_cat2, labels_cat2),
        }))

    # One scalar summary per streaming metric.
    for metric_name, metric_value in metrics_to_values.items():
      tf.summary.scalar(metric_name, metric_value)

    # Poll for new checkpoints every 60 seconds, effectively forever.
    slim.evaluation.evaluation_loop(
        '',
        checkpoint_dir,
        log_dir,
        num_evals=num_batches,
        eval_op=list(metrics_to_updates.values()),
        summary_op=tf.summary.merge_all(),
        eval_interval_secs=60,
        max_number_of_evaluations=100000000)
def train(hps):
  """Training loop.

  Builds a single-GPU ResNet training graph, then replicates it across the
  distributed resources described by FLAGS.resource_info_file via Parallax.
  """
  single_gpu_graph = tf.Graph()
  with single_gpu_graph.as_default():
    images, labels = cifar_input.build_input('cifar10',
                                             FLAGS.train_data_path,
                                             hps.batch_size, 'train')
    model = resnet_model.ResNet(hps, images, labels, 'train')
    model.build_graph()
    # Batch precision: fraction of argmax predictions matching the
    # argmax of the one-hot labels.
    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

  ########################################################################
  #### FIXME: Get session for distributed environments using Parallax ####
  #### Pass parallax_config as an argument                            ####
  ########################################################################
  parallax_sess, num_workers, worker_id, num_replicas_per_worker = \
      parallax.parallel_run(single_gpu_graph,
                            FLAGS.resource_info_file,
                            sync=FLAGS.sync,
                            parallax_config=parallax_config.build_config())

  for i in range(350000):
    # Parallax returns fetches as per-replica lists; index 0 is this
    # worker's first replica.
    _, global_step, cost, precision_ = \
        parallax_sess.run([model.train_op, model.global_step,
                           model.cost, precision])
    if i % 10 == 0:
      print('step: %d, loss: %.3f, precision: %.3f' %
            (global_step[0], cost[0], precision_[0]))

    # Tuning learning rate: piecewise-constant schedule on global step.
    train_step = global_step[0]
    if train_step < 10000:
      lrn_rate = 0.1
    elif train_step < 15000:
      lrn_rate = 0.01
    elif train_step < 20000:
      lrn_rate = 0.001
    else:
      lrn_rate = 0.0001
    # One learning-rate value per local replica.
    feed_dict = {model.lrn_rate: []}
    for worker in range(num_replicas_per_worker):
      feed_dict[model.lrn_rate].append(lrn_rate)
    # NOTE(review): this run() only fetches global_step while feeding the
    # learning rate; presumably Parallax retains the fed value for the
    # next train_op run — confirm against Parallax's feed semantics.
    parallax_sess.run(model.global_step, feed_dict=feed_dict)
def main(train_dir, batch_size, num_batches, log_dir):
  """Train the network, reporting loss and batch accuracy summaries.

  Args:
    train_dir, batch_size, num_batches: unused; kept for signature
      compatibility with the other entry points.
    log_dir: directory for checkpoints and summaries.
  """
  inputs, targets = build_input('cifar10', 100, 'train')
  preds, total_loss = network(inputs, targets)
  report()
  tf.summary.scalar('loss', total_loss)

  # Training accuracy of the current batch.
  class_ids = tf.argmax(preds, axis=1)
  tf.summary.scalar('accuracy',
                    slim.metrics.accuracy(class_ids, tf.to_int64(targets)))

  sgd = tf.train.GradientDescentOptimizer(0.1)
  train_op = slim.learning.create_train_op(
      total_loss, sgd, summarize_gradients=True)
  slim.learning.train(train_op, log_dir,
                      save_summaries_secs=20,
                      save_interval_secs=20)
def evaluate():
  """Run one validation pass over the latest checkpoint and append results.

  Restores the newest checkpoint in `train_dir`, averages loss and accuracy
  over `eval_batch_count` batches, and appends results to CSV/log files in
  `train_dir`.
  """
  # Faster validation; change to 40000/batch_size for the report.
  eval_batch_count = 50
  validation_error = 0
  validation_accuracy = 0
  with tf.device('/cpu:0'):
    with tf.Graph().as_default() as g:
      images, labels = cifar_input.build_input(
          'cifar100', '../../cifar/cifar100/validation.bin', batch_size,
          'eval')  # TEST.BIN OR VALIDATION.BIN
      logits = inference(images, NUM_CLASSES=100)
      saver = tf.train.Saver()
      losses = loss(logits, labels)
      accuracies = accuracy(logits, labels)
      sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
      tf.train.start_queue_runners(sess)
      ckpt_state = tf.train.get_checkpoint_state(train_dir)
      saver.restore(sess, ckpt_state.model_checkpoint_path)

      for _ in six.moves.range(eval_batch_count):
        (value_losses, value_accuracy) = sess.run([losses, accuracies])
        validation_error += value_losses
        validation_accuracy += value_accuracy

      # Bug fix: average over eval_batch_count instead of the magic
      # number 50, so changing the batch count keeps averages correct.
      validation_error /= eval_batch_count
      validation_accuracy /= eval_batch_count

      # Global step is the numeric suffix of the checkpoint filename.
      step = str(ckpt_state.model_checkpoint_path).split('-')[1]
      tf.logging.info('loss: %.3f, best accuracy: %.3f' %
                      (validation_error, validation_accuracy))

      # Bug fix: open in text-append mode; the writes below are str,
      # which fails on a binary ('ab') handle under Python 3. Context
      # managers also guarantee the handles are closed.
      with open(train_dir + "validation_data.csv", 'a') as f:
        f.write('{0},{1},{2}\n'.format(step, validation_error,
                                       validation_accuracy))
      with open(train_dir + "log.txt", 'a') as f:
        f.write('loss: {0}, best accuracy: {1}\n'.format(
            validation_error, validation_accuracy))
def __init__(self, data, dataset, num_gpus):
    """Create a training actor fed from an in-memory (images, labels) pair."""
    if num_gpus > 0:
        # Restrict TensorFlow to the GPUs Ray assigned to this actor.
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            str(gpu_id) for gpu_id in ray.get_gpu_ids())
    hps = resnet_model.HParams(
        batch_size=128,
        num_classes=100 if dataset == "cifar100" else 10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer="mom",
        num_gpus=num_gpus)
    # Seed each actor differently so every actor operates on a different
    # subset of data.
    if num_gpus > 0:
        tf.set_random_seed(ray.get_gpu_ids()[0] + 1)
    else:
        # Only a single actor in this case.
        tf.set_random_seed(1)
    raw_images, raw_labels = data[0], data[1]
    with tf.device("/gpu:0" if num_gpus > 0 else "/cpu:0"):
        # Build the model.
        images, labels = cifar_input.build_input(
            [raw_images, raw_labels], hps.batch_size, dataset, False)
        self.model = resnet_model.ResNet(hps, images, labels, "train")
        self.model.build_graph()
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        self.model.variables.set_session(sess)
        # Start the input queue threads that feed training batches.
        self.coord = tf.train.Coordinator()
        tf.train.start_queue_runners(sess, coord=self.coord)
        sess.run(tf.global_variables_initializer())
    self.steps = 10
def __init__(self, data, dataset, num_gpus):
    """Create a training actor, pinning it to its assigned GPUs (if any)."""
    if num_gpus > 0:
        # Restrict TensorFlow to the GPUs Ray assigned to this actor.
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            str(gpu_id) for gpu_id in ray.get_gpu_ids())
    hps = resnet_model.HParams(
        batch_size=128,
        num_classes=100 if dataset == "cifar100" else 10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer="mom",
        num_gpus=num_gpus)
    # Seed each actor differently so every actor operates on a different
    # subset of data.
    if num_gpus > 0:
        tf.set_random_seed(ray.get_gpu_ids()[0] + 1)
    else:
        # Only a single actor in this case.
        tf.set_random_seed(1)
    with tf.device("/gpu:0" if num_gpus > 0 else "/cpu:0"):
        # Build the model.
        images, labels = cifar_input.build_input(data, hps.batch_size,
                                                 dataset, False)
        self.model = resnet_model.ResNet(hps, images, labels, "train")
        self.model.build_graph()
        config = tf.ConfigProto(allow_soft_placement=True)
        # Grow GPU memory on demand instead of grabbing it all up front.
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        self.model.variables.set_session(sess)
        sess.run(tf.global_variables_initializer())
    self.steps = 10
def main(train_dir, batch_size, num_batches, log_dir):
  """Train the multi-head network, summarizing losses and accuracies.

  Args:
    train_dir, batch_size, num_batches: unused; kept for signature
      compatibility with the other entry points.
    log_dir: directory for checkpoints and summaries.
  """
  images, labels = build_input('cifar10', 100, 'train')
  (logits, logits_cat1, logits_cat2, loss, loss_cat1, loss_cat2,
   labels_cat1, labels_cat2) = network(images, labels)
  report()

  # One scalar summary per loss head.
  tf.summary.scalar('losses/loss', loss)
  tf.summary.scalar('losses/loss_cat1', loss_cat1)
  tf.summary.scalar('losses/loss_cat2', loss_cat2)

  # Predicted class indices for each head.
  logits = tf.argmax(logits, axis=1)
  logits_cat1 = tf.argmax(logits_cat1, axis=1)
  logits_cat2 = tf.argmax(logits_cat2, axis=1)

  tf.summary.scalar('accuracy',
                    slim.metrics.accuracy(logits, tf.to_int64(labels)))
  tf.summary.scalar(
      'accuracy_cat_1',
      slim.metrics.accuracy(logits_cat1, tf.to_int64(labels_cat1)))
  tf.summary.scalar(
      'accuracy_cat_2',
      slim.metrics.accuracy(logits_cat2, tf.to_int64(labels_cat2)))

  optimizer = tf.train.GradientDescentOptimizer(0.1)
  # Train against the cat2 head only (alternatives kept for experiments).
  total_loss = loss_cat2
  #total_loss = loss_cat1
  #total_loss = loss
  train_op = slim.learning.create_train_op(
      total_loss, optimizer, summarize_gradients=True)
  slim.learning.train(train_op, log_dir,
                      save_summaries_secs=20,
                      save_interval_secs=20)
def evaluate(hps):
  """Eval loop."""
  # Build the eval input pipeline and the model graph.
  images, labels = cifar_input.build_input(FLAGS.dataset,
                                           FLAGS.eval_data_path,
                                           hps.batch_size, FLAGS.mode,
                                           hps.data_format)
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()
  saver = tf.train.Saver()
  summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

  sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  tf.train.start_queue_runners(sess)

  best_precision = 0.0
  while True:
    try:
      ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    except tf.errors.OutOfRangeError as e:
      # Transient read failure: retry on the next loop iteration.
      tf.logging.error('Cannot restore checkpoint: %s', e)
      continue
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
      # No checkpoint yet: give up entirely (unlike the retrying error
      # case above).
      tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
      break
    tf.logging.info('Loading checkpoint %s',
                    ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    # Recover the global step from the checkpoint filename suffix
    # (e.g. "model.ckpt-1234"); fall back to 0 when it is not numeric.
    global_step = ckpt_state.model_checkpoint_path.split('/')[-1].split(
        '-')[-1]
    if not global_step.isdigit():
      global_step = 0
    else:
      global_step = int(global_step)

    total_prediction, correct_prediction, correct_prediction_top5 = 0, 0, 0
    start_time = time.time()
    for _ in six.moves.range(FLAGS.eval_batch_count):
      (summaries, loss, predictions, truth, train_step) = sess.run([
          model.summaries, model.cost, model.predictions, model.labels,
          model.global_step
      ])
      if not FLAGS.time_inference:
        # Per-example top-1 / top-5 accuracy bookkeeping.
        for (indiv_truth, indiv_prediction) in zip(truth, predictions):
          indiv_truth = np.argmax(indiv_truth)
          top5_prediction = np.argsort(indiv_prediction)[-5:]
          top1_prediction = np.argsort(indiv_prediction)[-1]
          correct_prediction += (indiv_truth == top1_prediction)
          if indiv_truth in top5_prediction:
            correct_prediction_top5 += 1
          total_prediction += 1

    if FLAGS.time_inference:
      # Timing mode: only report wall-clock time for the eval batches.
      print("Time for inference: %.4f" % (time.time() - start_time))
    else:
      precision = 1.0 * correct_prediction / total_prediction
      precision_top5 = 1.0 * correct_prediction_top5 / total_prediction
      best_precision = max(precision, best_precision)

      # Emit precision / best-precision summaries at the trained step.
      precision_summ = tf.Summary()
      precision_summ.value.add(tag='Precision', simple_value=precision)
      summary_writer.add_summary(precision_summ, train_step)
      best_precision_summ = tf.Summary()
      best_precision_summ.value.add(tag='Best Precision',
                                    simple_value=best_precision)
      summary_writer.add_summary(best_precision_summ, train_step)
      summary_writer.add_summary(summaries, train_step)
      print('Precision @ 1 = %.4f, Recall @ 5 = %.4f, Global step = %d' %
            (precision, precision_top5, global_step))
      summary_writer.flush()

    if FLAGS.eval_once:
      break
    time.sleep(60)
def train(hps):
  """Training loop.

  Incremental training: the loss is the model cost plus an L1 proximity
  penalty that keeps selected weights close to a pretrained checkpoint.
  """
  images, labels = cifar_input.build_input(FLAGS.dataset,
                                           FLAGS.train_data_path,
                                           hps.batch_size, FLAGS.mode)
  model = incre_resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()

  # Print trainable-parameter and FLOP statistics for the built graph.
  param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
      tf.get_default_graph(),
      tfprof_options=tf.contrib.tfprof.model_analyzer.
      TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
  sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

  tf.contrib.tfprof.model_analyzer.print_model_analysis(
      tf.get_default_graph(),
      tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

  # Batch precision from argmax of one-hot labels vs. predictions.
  truth = tf.argmax(model.labels, axis=1)
  predictions = tf.argmax(model.predictions, axis=1)
  precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

  # Accumulate the L1 distance between current weights and the "oral"
  # (original) weights loaded from ./ckpt_change/, over every non-BN,
  # non-logit trainable variable; used below as a proximity regularizer.
  sub_vars_ = 0
  restore_vars = []
  for v in tf.trainable_variables():
    if v.name.find('bn') == -1 and v.name.find('logit') == -1:
      restore_vars.append(v)
      var_ = v  # tf.get_default_graph().get_tensor_by_name(v.name+':0')
      oral_var_ = tf.contrib.framework.load_variable(
          './ckpt_change/', v.name)
      oral_var_ = tf.Variable(oral_var_,
                              name=v.name.split(':')[0] + '_oral')
      sub_vars_ = sub_vars_ + tf.reduce_sum(
          tf.abs(tf.to_float(var_) - tf.to_float(oral_var_)))
  # Saver restores only the non-BN / non-logit variables selected above.
  saver = tf.train.Saver(restore_vars)

  # Incremental loss = proximity penalty + model cost.
  incre_loss = 0.001 * sub_vars_ + model.cost
  trainable_variables = tf.trainable_variables()
  grads = tf.gradients(incre_loss, trainable_variables)
  optimizer = tf.train.MomentumOptimizer(model.lrn_rate, 0.5)
  incre_train_op = optimizer.apply_gradients(
      zip(grads, trainable_variables),
      global_step=model.global_step,
      name='train_step')
  # Group the optimizer step with the model's extra ops (e.g. BN updates).
  train_op = [incre_train_op] + model._extra_train_ops
  train_ops = tf.group(*train_op)

  summary_hook = tf.train.SummarySaverHook(
      save_steps=100,
      output_dir=FLAGS.train_dir,
      summary_op=tf.summary.merge(
          [model.summaries,
           tf.summary.scalar('Precision', precision)]))

  logging_hook = tf.train.LoggingTensorHook(
      tensors={
          'step': model.global_step,
          'loss': model.cost,
          'incre loss': incre_loss,
          'precision': precision
      },
      every_n_iter=100)

  class _LearningRateSetterHook(tf.train.SessionRunHook):
    """Sets learning_rate based on global step."""

    def begin(self):
      self._lrn_rate = 0.1

    def before_run(self, run_context):
      return tf.train.SessionRunArgs(
          model.global_step,  # Asks for global step value.
          feed_dict={model.lrn_rate: self._lrn_rate})  # Sets learning rate

    def after_run(self, run_context, run_values):
      # Piecewise-constant schedule keyed on the fetched global step.
      train_step = run_values.results
      if train_step < 40000:
        self._lrn_rate = 0.1
      elif train_step < 60000:
        self._lrn_rate = 0.01
      elif train_step < 80000:
        self._lrn_rate = 0.001
      else:
        self._lrn_rate = 0.0001

  with tf.train.MonitoredTrainingSession(
      checkpoint_dir=FLAGS.log_root,
      hooks=[logging_hook, _LearningRateSetterHook()],
      chief_only_hooks=[summary_hook],
      # Since we provide a SummarySaverHook, we need to disable default
      # SummarySaverHook. To do that we set save_summaries_steps to 0.
      save_summaries_steps=0,
      config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
    # Warm-start the selected variables from the pretrained checkpoint.
    saver.restore(
        mon_sess,
        '/home/fuxianya/models/research/resnet/ckpt_change/model.ckpt-16072'
    )
    while not mon_sess.should_stop():
      mon_sess.run(train_ops)
def train(hps, res_dict):
  """Training loop with sparsity regularization, pruning and evaluation.

  Trains a VGG16 on CIFAR, periodically evaluates on the eval split,
  applies proximal-gradient sparsity updates and mask pruning, and records
  progress into `res_dict` (mutated in place).

  Returns:
    A (zero_layers, step) tuple: the number of all-zero layers detected
    and the global step reached when training stopped.
  """
  with tf.Graph().as_default():
    with tf.device('/cpu:0'):
      # Separate input pipelines for training and evaluation batches.
      train_images, train_labels = cifar_input.build_input(
          FLAGS.dataset, FLAGS.train_data_path, 128, 'train')
      test_images, test_labels = cifar_input.build_input(
          FLAGS.dataset, FLAGS.eval_data_path, 100, 'eval')

    lr = tf.placeholder(tf.float32)
    phase = tf.placeholder(tf.bool)  # true for training
    # Route either the train or the test batch through the network
    # depending on `phase`.
    x_input = tf.cond(phase, lambda: train_images, lambda: test_images)
    Y_true = tf.cond(phase, lambda: train_labels, lambda: test_labels)
    loss, precision = vgg16(x_input, Y_true, phase, True)
    tf.summary.scalar('Precision', precision)

    global_step = tf.Variable(0, name='global_step', trainable=False)
    # Run UPDATE_OPS (e.g. batch-norm moving averages) before each step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      opt = tf.train.MomentumOptimizer(lr, 0.9)
      grads_and_vars = opt.compute_gradients(loss)
      optimizer = opt.apply_gradients(grads_and_vars,
                                      global_step=global_step)

    # Add histograms for gradients
    for grad, var in grads_and_vars:
      if grad is not None:
        tf.summary.histogram(var.op.name + '/gradients', grad)

    # Summary op
    summary_op = tf.summary.merge_all()
    # Initializer for the variables
    init = tf.global_variables_initializer()
    # Saver op
    saver = tf.train.Saver()
    writer = tf.summary.FileWriter(FLAGS.res_dir,
                                   graph=tf.get_default_graph())

    train_learning_rate = FLAGS.training_lr_init
    TEST_ACCURACY = 0.1
    compression_ratio = 1.0
    zero_layers = 0

    with tf.Session() as sess:
      if FLAGS.resume:
        # restore the model
        try:
          ckpt_state = tf.train.get_checkpoint_state(FLAGS.train_dir)
        except tf.errors.OutOfRangeError as e:
          tf.logging.error('Cannot restore checkpoint: %s', e)
        # NOTE(review): these failure branches only log; execution still
        # falls through to saver.restore below — confirm this is intended.
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
          tf.logging.info('No model to load yet at %s', FLAGS.train_dir)
        tf.logging.info('Loading checkpoint %s',
                        ckpt_state.model_checkpoint_path)
        saver.restore(sess, ckpt_state.model_checkpoint_path)
        # The saved global step is the checkpoint filename suffix.
        model_step = ckpt_state.model_checkpoint_path.split('/')[-1].split('-')[-1]
        print("\n Resume model was saved at step {}".format(model_step))
        print('train steps:{}, model step:{}'.format(TRAIN_STEPS,
                                                     model_step))
        train_learning_rate = FLAGS.resume_lr_init
      else:
        sess.run(init)

      # initialize regularization parameter
      if FLAGS.use_growl | FLAGS.use_group_lasso:
        layer_reg_params, hps = reg_params_init(sess, hps)

      # Start input enqueue threads
      coord = tf.train.Coordinator()
      threads = tf.train.start_queue_runners(sess=sess, coord=coord)

      try:
        prev_time = time.clock()
        while not coord.should_stop():
          step = sess.run(global_step)
          # check whether to decay learning rate
          if (step + 1) % FLAGS.lr_decay_step == 0:
            train_learning_rate = train_learning_rate * FLAGS.lr_decay_rate

          if step % BATCH_PER_EPOCH == 0:
            # Once per epoch: also fetch summaries and extend them with
            # the latest test accuracy / compression figures.
            summary_extend = tf.Summary()
            _, summary, train_loss, train_accuracy = sess.run(
                [optimizer, summary_op, loss, precision],
                feed_dict={lr: train_learning_rate, phase: True})
            summary_extend.ParseFromString(summary)
            summary_extend.value.add(tag='testing accuracy',
                                     simple_value=TEST_ACCURACY)
            summary_extend.value.add(tag='compression',
                                     simple_value=compression_ratio)
            writer.add_summary(summary_extend, step)
            print('step: {}, lr_rate: {}, train_loss: {:.4f}, train_accuracy: {:.4f}'.format(step, train_learning_rate, train_loss, train_accuracy))
          else:
            sess.run([optimizer],
                     feed_dict={lr: train_learning_rate, phase: True})

          # save the model and evaluate
          if (step % FLAGS.checkpoint_freq == 0 and step != 0) or (step == TRAIN_STEPS):
            print('Checkpoint! Now saving model...')
            saver.save(sess, FLAGS.train_dir, global_step=step)

          if step % FLAGS.eval_freq == 0:
            current_time = time.clock()
            test_loss = 0
            test_accuracy = 0
            for i in range(EVAL_BATCHES):
              # NOTE(review): these two run() calls dequeue test batches
              # whose values are never used (the tf.cond pulls its own
              # batch in the next run) — confirm whether this is an
              # intentional queue-advancing trick.
              test_img_vals = sess.run(test_images)
              test_label_vals = sess.run(test_labels)
              test_loss_i, test_accur_i = sess.run(
                  [loss, precision], feed_dict={phase: False})
              test_loss += test_loss_i
              test_accuracy += test_accur_i
            test_loss = test_loss / EVAL_BATCHES
            TEST_ACCURACY = test_accuracy = test_accuracy / EVAL_BATCHES
            res_dict['test_accur_arr'].append(test_accuracy)
            res_dict['training_accur_arr'].append(train_accuracy)
            res_dict['steps'].append(step)
            batch_time = (current_time - prev_time) / FLAGS.checkpoint_freq
            print(' TEST_ACCURACY: {:.4f}, 1 batch takes: {:.4f}'.format(test_accuracy, batch_time))
            prev_time = current_time

          # apply proximal gradient update, and update the mask
          if (step % REG_APPLY_FREQ == 0) and (step > 0) and FLAGS.use_sparse_reg:
            apply_reg_prox(sess, train_learning_rate, layer_reg_params, hps)
            # update mask
            zero_layers, layer_ID = update_mask(sess, FLAGS.mask_threshold,
                                                hps, res_dict, step)
            compression_ratio = measure_compression(sess, res_dict, step,
                                                    True, hps)
            if zero_layers >= 1:
              # Stop training when an entire layer has been pruned away.
              print("There exists zero value layers at step:{0}, layers IDs:{1}".format(step, layer_ID))
              coord.request_stop()

          if ((step >= TRAIN_STEPS) and FLAGS.retrain_on) or ((step % FLAGS.display_similarity_freq == 0) and step > 1):
            print("Get the group information! \n")
            group_info, num_clusters_arr = display_similarity(
                sess, FLAGS.num_training_epochs, hps, res_dict)
            np.save(FLAGS.train_dir + 'group_info.npy', group_info)
            np.save(FLAGS.train_dir + 'num_clusters_arr.npy',
                    num_clusters_arr)

          if step >= TRAIN_STEPS:
            coord.request_stop()
      except tf.errors.OutOfRangeError:
        # Input queues exhausted: persist the accumulated results.
        np.save(FLAGS.res_dir + 'res_dict.npy', res_dict)
        print('Done training')
      finally:
        coord.request_stop()

      coord.join(threads)
      sess.close()
  return zero_layers, step
def train(hps):
    """Training loop: build the ResNet, report model stats, and train
    under a MonitoredTrainingSession with a piecewise-constant LR."""
    # Input pipeline and model graph.
    images, labels = cifar_input.build_input(
        FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    # Report trainable-parameter and FLOP statistics via tfprof.
    analyzer = tf.contrib.tfprof.model_analyzer
    param_stats = analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
    analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=analyzer.FLOAT_OPS_OPTIONS)

    # Batch-level precision: fraction of argmax predictions matching labels.
    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))

    logging_hook = tf.train.LoggingTensorHook(
        tensors={'step': model.global_step,
                 'loss': model.cost,
                 'precision': precision},
        every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Feeds a piecewise-constant learning rate keyed on global step."""

        def begin(self):
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            # Fetch the global step and feed the current learning rate.
            return tf.train.SessionRunArgs(
                model.global_step,
                feed_dict={model.lrn_rate: self._lrn_rate})

        def after_run(self, run_context, run_values):
            step = run_values.results
            # Decay schedule: 0.1 -> 0.01 -> 0.001 -> 0.0001.
            for boundary, rate in ((40000, 0.1), (60000, 0.01),
                                   (80000, 0.001)):
                if step < boundary:
                    self._lrn_rate = rate
                    break
            else:
                self._lrn_rate = 0.0001

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.log_root,
        hooks=[logging_hook, _LearningRateSetterHook()],
        chief_only_hooks=[summary_hook],
        # A SummarySaverHook is supplied above, so the default one is
        # disabled by setting save_summaries_steps to 0.
        save_summaries_steps=0,
        config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(model.train_op)
def evaluate(hps):
    """Eval loop.

    Repeatedly restores the newest checkpoint from FLAGS.log_root, scores
    FLAGS.eval_batch_count batches, and writes precision summaries to
    FLAGS.eval_dir. All graph ops are constructed once, up front: the
    previous version rebuilt `forward_prob`/`argmax` ops and accumulated
    tensors (`total_preds += co_pred`) inside the per-batch loop, so the
    graph — and process memory — grew without bound across eval rounds.
    """
    images, labels = cifar_input.build_input(
        FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode)
    model = resnet_model.ResNet(hps, FLAGS.mode)
    global_step = tf.train.get_or_create_global_step()
    model.build_graph(images, labels, True)
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    tf.train.start_queue_runners(sess)

    # Build the re-evaluation branch once (previously rebuilt per batch).
    vars_ = {v.name: v for v in tf.trainable_variables()}
    _, pred_op, _ = model.forward_prob(images, labels, vars_)
    # Per-batch accuracy of the forward_prob predictions.
    batch_acc = tf.reduce_mean(
        tf.to_float(tf.equal(tf.argmax(pred_op, axis=1),
                             tf.argmax(labels, axis=1))))

    best_precision = 0.0
    while True:
        try:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
        except tf.errors.OutOfRangeError as e:
            tf.logging.error('Cannot restore checkpoint: %s', e)
            time.sleep(60)  # back off instead of busy-waiting
            continue
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
            time.sleep(60)  # wait for the trainer to write a checkpoint
            continue
        tf.logging.info('Loading checkpoint %s',
                        ckpt_state.model_checkpoint_path)
        saver.restore(sess, ckpt_state.model_checkpoint_path)

        total_prediction, correct_prediction, total_preds = 0, 0, 0.0
        for _ in six.moves.range(FLAGS.eval_batch_count):
            (summaries, loss, predictions, truth, train_step, acc) = sess.run([
                model.summaries, model.cost, model.predictions, labels,
                global_step, batch_acc
            ])
            total_preds += acc  # accumulate in Python, not in the graph
            truth = np.argmax(truth, axis=1)
            predictions = np.argmax(predictions, axis=1)
            correct_prediction += np.sum(truth == predictions)
            total_prediction += predictions.shape[0]

        t_preds = total_preds / FLAGS.eval_batch_count
        precision = 1.0 * correct_prediction / total_prediction
        best_precision = max(precision, best_precision)

        precision_summ = tf.Summary()
        precision_summ.value.add(tag='Precision', simple_value=precision)
        summary_writer.add_summary(precision_summ, train_step)
        best_precision_summ = tf.Summary()
        best_precision_summ.value.add(
            tag='Best Precision', simple_value=best_precision)
        summary_writer.add_summary(best_precision_summ, train_step)
        summary_writer.add_summary(summaries, train_step)
        tf.logging.info(
            'loss: %.3f, precision: %.3f, best precision: %.3f, t preds: %.3f'
            % (loss, precision, best_precision, t_preds))
        summary_writer.flush()

        if FLAGS.eval_once:
            break
        time.sleep(60)
def train(hps):
    """Training loop.

    Trains a `Net` model, logging loss/wmc/cross-entropy/precision every
    100 steps. Precision is computed over *labeled* examples only, where
    an example counts as labeled iff its one-hot label row is not all
    zeros (semi-supervised setting: unlabeled rows are zero vectors —
    presumably; confirm against the input pipeline).
    """
    images, labels = cifar_input.build_input(FLAGS.train_data_path,
                                             hps.batch_size, FLAGS.mode)
    model = Net(hps, images, labels, FLAGS.mode)
    model.build_graph()

    # Report trainable-parameter and FLOP statistics via tfprof.
    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write("total_params: %d\n" % param_stats.total_parameters)
    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    # Compare the per-example label maxima ([batch]) against a scalar.
    # BUG FIX: the previous tf.zeros([hps.batch_size, 1]) broadcast the
    # [batch] maxima against a [batch, 1] tensor, silently producing a
    # [batch, batch] mask and corrupting the precision denominator.
    labeled_examples = tf.greater(tf.reduce_max(model.labels, axis=1), 0.0)
    labeled_examples = tf.cast(labeled_examples, tf.float32)
    correct_predictions = tf.cast(tf.equal(predictions, truth), tf.float32)
    correct_predictions = tf.multiply(correct_predictions, labeled_examples)
    # Precision restricted to the labeled subset of the batch.
    precision = tf.reduce_sum(correct_predictions) / tf.reduce_sum(
        labeled_examples)

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries, tf.summary.scalar("Precision", precision)]),
    )

    logging_hook = tf.train.LoggingTensorHook(
        tensors={
            "step": model.global_step,
            "loss": model.cost,
            "wmc": model.wmc,
            "cross_entropy": model.cross_entropy,
            "precision": precision,
        },
        every_n_iter=100,
    )

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""

        def begin(self):
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            # Fetch the global step; feed the current learning rate.
            return tf.train.SessionRunArgs(
                model.global_step, feed_dict={model.lrn_rate: self._lrn_rate})

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            # Piecewise-constant decay: 0.1 / 0.05 / 0.01 / 0.001.
            if train_step < 10000:
                self._lrn_rate = 0.1
            elif train_step < 20000:
                self._lrn_rate = 0.05
            elif train_step < 35000:
                self._lrn_rate = 0.01
            else:
                self._lrn_rate = 0.001

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.log_root,
        hooks=[logging_hook, _LearningRateSetterHook()],
        chief_only_hooks=[summary_hook],
        # A SummarySaverHook is supplied above; disable the default one.
        save_summaries_steps=0,
        config=tf.ConfigProto(allow_soft_placement=True),
    ) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(model.train_op)
def evaluate(hps):
    """Evaluation loop: repeatedly restore the newest checkpoint and score it."""
    # Input pipeline (queue-runner based).
    images, labels = cifar_input.build_input(
        FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode)
    # Residual network model.
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()
    # Saver to restore variables; writer for evaluation summaries.
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # Launch all input queue runners.
    tf.train.start_queue_runners(sess)

    best_precision = 0.0
    while True:
        # Locate the newest checkpoint written by the trainer.
        try:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
        except tf.errors.OutOfRangeError as e:
            tf.logging.error('Cannot restore checkpoint: %s', e)
            continue
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
            continue
        # Restore the weights produced during training.
        tf.logging.info('Loading checkpoint %s',
                        ckpt_state.model_checkpoint_path)
        saver.restore(sess, ckpt_state.model_checkpoint_path)

        # Score eval_batch_count batches against the restored weights.
        total_prediction = 0
        correct_prediction = 0
        for _ in six.moves.range(FLAGS.eval_batch_count):
            loss, predictions, truth, train_step = sess.run(
                [model.cost, model.predictions, model.labels,
                 model.global_step])
            truth = np.argmax(truth, axis=1)
            predictions = np.argmax(predictions, axis=1)
            correct_prediction += np.sum(truth == predictions)
            total_prediction += predictions.shape[0]

        # Overall precision for this round; track the best seen so far.
        precision = 1.0 * correct_prediction / total_prediction
        best_precision = max(precision, best_precision)

        # Emit precision / best-precision summaries keyed on the train step.
        precision_summ = tf.Summary()
        precision_summ.value.add(tag='Precision', simple_value=precision)
        summary_writer.add_summary(precision_summ, train_step)
        best_precision_summ = tf.Summary()
        best_precision_summ.value.add(
            tag='Best Precision', simple_value=best_precision)
        summary_writer.add_summary(best_precision_summ, train_step)
        #summary_writer.add_summary(summaries, train_step)
        tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                        (loss, precision, best_precision))
        # Flush summaries to disk.
        summary_writer.flush()

        if FLAGS.eval_once:
            break
        time.sleep(60)
def train(hps): # 构建输入数据(读取队列执行器) images, labels = cifar_input.build_input( FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode) # 构建残差网络模型 model = resnet_model.ResNet(hps, images, labels, FLAGS.mode) model.build_graph() # 计算预测准确率 truth = tf.argmax(model.labels, axis=1) predictions = tf.argmax(model.predictions, axis=1) precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth))) # 建立总结存储器,每100步存储一次 summary_hook = tf.train.SummarySaverHook( save_steps=100, output_dir=FLAGS.train_dir, summary_op=tf.summary.merge( [model.summaries, tf.summary.scalar('Precision', precision)])) # 建立日志打印器,每100步打印一次 logging_hook = tf.train.LoggingTensorHook( tensors={'step': model.global_step, 'loss': model.cost, 'precision': precision}, every_n_iter=100) # 学习率更新器,基于全局Step class _LearningRateSetterHook(tf.train.SessionRunHook): def begin(self): #初始学习率 self._lrn_rate = 0.1 def before_run(self, run_context): return tf.train.SessionRunArgs( # 获取全局Step model.global_step, # 设置学习率 feed_dict={model.lrn_rate: self._lrn_rate}) def after_run(self, run_context, run_values): # 动态更新学习率 train_step = run_values.results if train_step < 40000: self._lrn_rate = 0.1 elif train_step < 60000: self._lrn_rate = 0.01 elif train_step < 80000: self._lrn_rate = 0.001 else: self._lrn_rate = 0.0001 # 建立监控Session with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.log_root, hooks=[logging_hook, _LearningRateSetterHook()], chief_only_hooks=[summary_hook], # 禁用默认的SummarySaverHook,save_summaries_steps设置为0 save_summaries_steps=0, config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess: while not mon_sess.should_stop(): # 执行优化训练操作 mon_sess.run(model.train_op)
def main(_): config_initialization() images, labels = cifar_input.build_input(FLAGS.dataset, FLAGS.train_data_path, FLAGS.batch_size, mode='train') train_op, train_step_fn = create_train_op(images, labels) train(train_op, train_step_fn)
def main(_): inputs, labels = cifar_input.build_input(FLAGS.dataset, FLAGS.eval_data_path, FLAGS.batch_size, 'eval') is_training = True with slim.arg_scope(nets.resnet_v2.resnet_arg_scope()): net, endpoints = nets.resnet_v2.resnet_v2_101(inputs, num_classes=None, is_training=is_training) with tf.variable_scope('Logits'): net = tf.squeeze(net, axis=[1, 2]) net = slim.dropout(net, keep_prob=0.5, scope='scope') logits = slim.fully_connected(net, num_outputs=FLAGS.num_classes, activation_fn=None, scope='fc') # 有选择地恢复变量 checkpoint_exclude_scopes = 'Logits' exclusions = None if checkpoint_exclude_scopes: exclusions = [ scope.strip() for scope in checkpoint_exclude_scopes.split(',') ] variables_to_restore = [] for var in slim.get_model_variables(): excluded = False for exclusion in exclusions: if var.op.name.startswith(exclusion): excluded = True if not excluded: variables_to_restore.append(var) logits = tf.nn.softmax(logits) classes = tf.argmax(logits, axis=1, name='classes') accuracy = tf.reduce_mean( tf.cast(tf.equal(tf.cast(classes, dtype=tf.int32), labels), dtype=tf.float32)) # 获取最新的模型 ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_model_path) saver_restore = tf.train.Saver() with tf.Session() as sess: # 载入训练模型 saver_restore.restore(sess, ckpt.model_checkpoint_path) # 开启队列 coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(coord=coord) correct_prediction = 0 for i in range(FLAGS.num_steps): correct_prediction += sess.run(accuracy) # 输出测试情况 precision = correct_prediction / FLAGS.num_steps validate_log = 'Validation precision: {:.4f}'.format(precision) print(validate_log) # 关闭队列 coord.request_stop() coord.join(threads)
def train(hps):
    """Training loop.

    Builds a meta-objective over 10 per-class input pipelines: a one-step
    look-ahead update of the trainable variables (MAML-style inner step —
    presumably; TODO confirm against resnet_model.forward_prob) is applied,
    the per-class losses at the updated weights are averaged, and that
    average is mixed 50/50 with the ordinary training loss.
    """
    class_loss = []
    # Main training pipeline. NOTE(review): the data path is hard-coded to
    # a user home directory; should probably come from FLAGS.
    images1, labels1 = cifar_input.build_input(
        FLAGS.dataset, '/home/fuxianya/data/bin/train_batch', hps.batch_size,
        FLAGS.mode)
    model = resnet_model.ResNet(hps, FLAGS.mode)
    model.build_graph(images1, labels1, True)

    # Report trainable-parameter and FLOP statistics via tfprof.
    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    # Baseline ("outer") accuracy and loss on the main pipeline.
    truth = tf.argmax(labels1, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision_o = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))
    loss_o = model.cost

    # Snapshot all trainable variables by name.
    vars_ = {}
    for v in tf.trainable_variables():
        #if v.name.find('bn')==-1:
        #    print(v.name)
        vars_[v.name] = v

    # Inner step: gradient of the training cost w.r.t. the current weights,
    # with stop_gradient so the outer objective does not differentiate
    # through the inner gradient (first-order approximation).
    cost, pred, logits = model.forward_prob(images1, labels1, vars_, True)
    inner_grad = tf.gradients(cost, list(vars_.values()))
    inner_grad = [tf.stop_gradient(grad) for grad in inner_grad]
    inner_grad_dict = dict(zip(vars_.keys(), inner_grad))
    # One SGD look-ahead step at the current (fed) learning rate.
    new_vars = dict(
        zip(vars_.keys(), [
            vars_[key] - model.lrn_rate * inner_grad_dict[key]
            for key in vars_.keys()
        ]))

    # Evaluate the look-ahead weights on each of the 10 per-class pipelines
    # (data files named <train_data_path>_0 .. _9).
    class_preds = []
    costb = []
    for i in range(0, 10):
        class_image, class_label = cifar_input.build_input(
            FLAGS.dataset, FLAGS.train_data_path + '_' + str(i),
            hps.batch_size, FLAGS.mode)
        cost1, pred, _ = model.forward_prob(class_image, class_label,
                                            new_vars, True)
        costb.append(cost1)
        tmp = tf.argmax(class_label, axis=1)
        preds = tf.argmax(pred, axis=1)
        co_pred = tf.reduce_mean(tf.to_float(tf.equal(preds, tmp)))
        class_preds.append(co_pred)

    # Meta objective: 0.5 * mean per-class look-ahead loss + 0.5 * base loss.
    meta_loss = tf.to_float(0.5) * tf.reduce_sum(costb) / tf.to_float(
        10) + tf.to_float(0.5) * loss_o
    #meta_loss = tf.reduce_mean(costb, 0, keep_dims=True)
    #meta_optimizer = tf.train.AdamOptimizer(model.lrn_rate).minimize(meta_loss, global_step=global_step)

    # Momentum step on the meta objective, plus the model's extra train ops
    # (e.g. batch-norm moving-average updates).
    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(meta_loss, trainable_variables)
    optimizer = tf.train.MomentumOptimizer(model.lrn_rate, 0.9)
    meta_train_op = optimizer.apply_gradients(zip(grads, trainable_variables),
                                              global_step=model.global_step,
                                              name='train_step')
    train_op = [meta_train_op] + model.extra_train_ops
    train_ops = tf.group(*train_op)
    # Mean per-class look-ahead accuracy (reported as 'Precision').
    total_accs = tf.reduce_sum(class_preds) / tf.to_float(10)
    '''
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.666)
    config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    for i in range(2000):
        sess.run(meta_train_op)
        l, p, tl, tp = sess.run([loss_o, precision_o, meta_loss, total_accs])
        print('epoch:%d, loaa_0:%f, acc_0:%f, loss:%f, acc:%f'%(l,p,tl,tp))
        saver.save(sess, 'ckpt/model.ckpt', global_step=i+1)
    '''

    # Summaries every 100 steps.
    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries, tf.summary.scalar('Precision', total_accs)]))

    # Console logging every 100 steps.
    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': model.global_step,
        'loss_o': loss_o,
        'precision_o': precision_o,
        'total precision': total_accs,
        'total losses': meta_loss
    }, every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""

        def begin(self):
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate: self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            # Piecewise-constant decay: 0.1 / 0.01 / 0.001 / 0.0001.
            train_step = run_values.results
            if train_step < 2000:
                self._lrn_rate = 0.1
            elif train_step < 4000:
                self._lrn_rate = 0.01
            elif train_step < 6000:
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    # `epoch` here counts session-run iterations, not dataset epochs.
    epoch = 0
    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.log_root,
        hooks=[logging_hook, _LearningRateSetterHook()],
        chief_only_hooks=[summary_hook],
        # Since we provide a SummarySaverHook, we need to disable default
        # SummarySaverHook. To do that we set save_summaries_steps to 0.
        save_summaries_steps=0,
        config=tf.ConfigProto(allow_soft_placement=True,
                              gpu_options=gpu_options)) as mon_sess:
        while not mon_sess.should_stop() and epoch < 4000:
            #mon_sess.run(meta_optimizer)
            #mon_sess.run([meta_loss, meta_train_op])
            #mon_sess.run(meta_train_op)
            mon_sess.run(train_ops)
            epoch = epoch + 1
def train(hps):
    """Training loop.

    Trains for 181 epochs, snapshotting the model into a per-epoch
    subdirectory of FLAGS.checkpoint_dir via a custom _SaverHook while
    the MonitoredTrainingSession keeps its own checkpoints disabled.
    """
    images, labels = cifar_input.build_input(FLAGS.dataset,
                                             FLAGS.train_data_path,
                                             hps.batch_size, FLAGS.mode,
                                             hps.data_format)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    # Report trainable-parameter and FLOP statistics via tfprof.
    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    # Batch-level precision of argmax predictions.
    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))

    num_steps_per_epoch = 391  # TODO: Don't hardcode this.

    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': model.global_step,
        'loss': model.cost,
        'precision': precision
    }, every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""

        def begin(self):
            self._lrn_rate = 0.01

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate: self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            # One warm-up epoch at 0.01, then 0.1 / 0.01 / 0.001 / 0.0001.
            if train_step < num_steps_per_epoch:
                self._lrn_rate = 0.01
            elif train_step < (91 * num_steps_per_epoch):
                self._lrn_rate = 0.1
            elif train_step < (136 * num_steps_per_epoch):
                self._lrn_rate = 0.01
            elif train_step < (181 * num_steps_per_epoch):
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    class _SaverHook(tf.train.SessionRunHook):
        """Snapshots the model once per epoch and logs per-epoch wall time."""

        def begin(self):
            self.saver = tf.train.Saver(max_to_keep=10000)
            # NOTE(review): shell=True with interpolated paths — safe only
            # while FLAGS.checkpoint_dir is trusted, not user-supplied.
            subprocess.call("rm -rf %s; mkdir -p %s" %
                            (FLAGS.checkpoint_dir, FLAGS.checkpoint_dir),
                            shell=True)
            self.f = open(os.path.join(FLAGS.checkpoint_dir, "times.log"), 'w')

        def after_create_session(self, sess, coord):
            self.sess = sess
            self.start_time = time.time()

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step  # Asks for global step value.
            )

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            # BUG FIX: use integer (floor) division — plain `/` yields a
            # float under Python 3, which leaked into the epoch counter
            # used for directory naming and logging.
            epoch = train_step // num_steps_per_epoch
            if train_step % num_steps_per_epoch == 0:
                end_time = time.time()
                # Zero-padded 5-digit epoch directory, e.g. "00042".
                directory = os.path.join(FLAGS.checkpoint_dir,
                                         ("%5d" % epoch).replace(' ', '0'))
                subprocess.call("mkdir -p %s" % directory, shell=True)
                ckpt_name = 'model.ckpt'
                self.saver.save(self.sess,
                                os.path.join(directory, ckpt_name),
                                global_step=train_step)
                self.f.write("Step: %d\tTime: %s\n" %
                             (train_step, end_time - self.start_time))
                print("Saved checkpoint after %d epoch(s) to %s..." %
                      (epoch, directory))
                sys.stdout.flush()
                self.start_time = time.time()

        def end(self, sess):
            self.f.close()

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.log_root,
        hooks=[logging_hook, _LearningRateSetterHook()],
        chief_only_hooks=[summary_hook, _SaverHook()],
        # Checkpointing is handled by _SaverHook.
        save_checkpoint_secs=None,
        # Since we provide a SummarySaverHook, we need to disable default
        # SummarySaverHook. To do that we set save_summaries_steps to 0.
        save_summaries_steps=None,
        save_summaries_secs=None,
        config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        for i in range(num_steps_per_epoch * 181):
            mon_sess.run(model.train_op)
def train(hps, model, dir_name=None):
    """Training loop.

    Args:
      hps: hyper-parameters (batch_size, num_classes, n_draws,
        lrn_rte_changes, lrn_rte_vals, ...).
      model: module exposing a `Model` class; rebound below to the instance.
      dir_name: checkpoint/summary directory; defaults to
        FLAGS.data_dir/FLAGS.model_dir.
    """
    if dir_name is None:  # identity check instead of `== None`
        dir_name = FLAGS.data_dir + "/" + FLAGS.model_dir

    # Build the input for the selected dataset.
    if FLAGS.dataset == 'mnist':
        mnist = tf.contrib.learn.datasets.load_dataset("mnist")
        dataset = mnist.train
        images = tf.placeholder(tf.float32, [hps.batch_size, 784],
                                name='x-input')
        labels = tf.placeholder(tf.int64, [hps.batch_size], name='y-input')
    elif FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'cifar100':
        # NOTE(review): this trains from FLAGS.eval_data_path — presumably
        # it should be the training split; confirm against the flag
        # definitions before changing.
        images, labels = cifar_input.build_input(FLAGS.dataset,
                                                 FLAGS.eval_data_path,
                                                 hps.batch_size,
                                                 hps.image_standardization,
                                                 FLAGS.mode)

    model = model.Model(hps, images, labels, FLAGS.mode)
    model.build_graph()

    # Report trainable-parameter and FLOP statistics via tfprof.
    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    # Majority vote across the n_draws noisy predictions per example.
    one_hot_preds = tf.one_hot(predictions, depth=hps.num_classes,
                               dtype=tf.float32)
    votes = tf.reshape(one_hot_preds,
                       [hps.n_draws, hps.batch_size, hps.num_classes])
    predictions = tf.argmax(tf.reduce_sum(votes, axis=0), axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=dir_name,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))

    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': model.global_step,
        'loss': model.cost,
        'precision': precision
    }, every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""

        def begin(self):
            self._lrn_rate = 0.1
            # (step_boundary, new_rate) pairs, consumed in order.
            self._schedule = list(zip(hps.lrn_rte_changes, hps.lrn_rte_vals))

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate: self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            if len(self._schedule) > 0 and train_step >= self._schedule[0][0]:
                # Update learning rate according to the schedule.
                self._lrn_rate = self._schedule[0][1]
                self._schedule = self._schedule[1:]

    print("START TRAINING")
    steps = 0
    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=dir_name,
        hooks=[
            logging_hook,
            _LearningRateSetterHook(),
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
        ],
        chief_only_hooks=[summary_hook],
        # Since we provide a SummarySaverHook, we need to disable default
        # SummarySaverHook. To do that we set save_summaries_steps to 0.
        save_summaries_steps=0,
        config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            # Noise ramp: s rises toward 0.9, then snaps to 1.0.
            s = 1.0 - min(0.99975**steps, 0.9)
            if s > 0.9:
                s = 1.0  # this triggers around 10k steps
            if FLAGS.dataset == 'mnist':
                xs, ys = dataset.next_batch(hps.batch_size, fake_data=False)
                args = {
                    model.noise_scale: s,
                    model._images: xs,
                    model._labels: ys
                }
            elif FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'cifar100':
                args = {model.noise_scale: s}
            mon_sess.run(model.train_op, args)
            steps += 1
def train(hps):
    """Training loop."""
    images, labels = cifar_input.build_input(
        FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    # Report trainable-parameter and FLOP statistics via tfprof.
    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    # Batch-level precision of argmax predictions.
    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    # CONSISTENCY FIX: pass a single merged summary op, matching the other
    # train() implementations in this file (previously a raw Python list
    # was handed to SummarySaverHook).
    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))

    logging_hook = tf.train.LoggingTensorHook(
        tensors={'step': model.global_step,
                 'loss': model.cost,
                 'precision': precision},
        every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""

        def begin(self):
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate: self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            # Piecewise-constant decay: 0.1 / 0.01 / 0.001 / 0.0001.
            if train_step < 40000:
                self._lrn_rate = 0.1
            elif train_step < 60000:
                self._lrn_rate = 0.01
            elif train_step < 80000:
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.log_root,
        hooks=[logging_hook, _LearningRateSetterHook()],
        chief_only_hooks=[summary_hook],
        # Since we provide a SummarySaverHook, we need to disable default
        # SummarySaverHook. To do that we set save_summaries_steps to 0.
        save_summaries_steps=0,
        config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(model.train_op)
def evaluate(hps, model, dir_name=None, rerun=False):
    """Evaluate the ResNet and log prediction counters to compute sensitivity.

    Args:
      hps: hyper-parameters (batch_size, num_classes, n_draws,
        noise_scheme, attack_norm_bound, ...).
      model: module exposing a `Model` class; rebound below to the instance.
      dir_name: checkpoint/summary directory; defaults to
        FLAGS.data_dir/FLAGS.model_dir.
      rerun: when False, models that already have eval_data.json are skipped.
    """
    if dir_name is None:  # identity check instead of `== None`
        dir_name = FLAGS.data_dir + "/" + FLAGS.model_dir

    if os.path.isfile(dir_name + "/eval_data.json") and not rerun:
        # run only new models
        return

    # Build the input for the selected dataset.
    if FLAGS.dataset == 'mnist':
        mnist = tf.contrib.learn.datasets.load_dataset("mnist")
        dataset = mnist.test
        images = tf.placeholder(tf.float32, [hps.batch_size, 784],
                                name='x-input')
        labels = tf.placeholder(tf.int64, [hps.batch_size], name='y-input')
    elif FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'cifar100':
        images, labels = cifar_input.build_input(FLAGS.dataset,
                                                 FLAGS.eval_data_path,
                                                 hps.batch_size,
                                                 hps.image_standardization,
                                                 FLAGS.mode)

    model = model.Model(hps, images, labels, FLAGS.mode)
    model.build_graph()
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(dir_name)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    tf.train.start_queue_runners(sess)

    best_precision = 0.0
    try:
        ckpt_state = tf.train.get_checkpoint_state(dir_name)
    except tf.errors.OutOfRangeError as e:
        tf.logging.error('Cannot restore checkpoint: %s', e)
        # BUG FIX: previously fell through and crashed with a NameError on
        # the unbound ckpt_state; bail out instead.
        return
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
        tf.logging.info('No checkpoint to eval at %s', dir_name)
        return
    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    # Make predictions on the dataset, keep the label distribution
    data = {
        'predictions': [],
        'pred_truth': [],
    }
    total_prediction, correct_prediction = 0, 0
    eval_data_size = FLAGS.eval_data_size
    eval_batch_size = hps.batch_size
    eval_batch_count = int(eval_data_size / eval_batch_size)
    for i in six.moves.range(eval_batch_count):
        if FLAGS.dataset == 'mnist':
            xs, ys = dataset.next_batch(hps.batch_size, fake_data=False)
            args = {
                model.noise_scale: 1.0,
                model._images: xs,
                model._labels: ys
            }
        elif FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'cifar100':
            args = {model.noise_scale: 1.0}
        (summaries, loss, predictions, truth, train_step) = sess.run(
            [
                model.summaries,
                model.cost,
                model.predictions,
                model.labels,
                model.global_step,
            ], args)
        print("Done: {}/{}".format(eval_batch_size * i, eval_data_size))
        truth = np.argmax(truth, axis=1)[:hps.batch_size]
        # Aggregate the n_draws noisy predictions per example by voting.
        prediction_votes = np.zeros([hps.batch_size, hps.num_classes])
        predictions = np.argmax(predictions, axis=1)
        # BUG FIX: the vote loop previously reused `i`, clobbering the
        # outer batch index (progress reporting was wrong from the second
        # batch on); use a distinct loop variable.
        for draw in range(hps.n_draws):
            for j in range(hps.batch_size):
                prediction_votes[j,
                                 predictions[draw * hps.batch_size + j]] += 1
        predictions = np.argmax(prediction_votes, axis=1)

        data['predictions'] += prediction_votes.tolist()
        data['pred_truth'] += (truth == predictions).tolist()

        print("{} / {}".format(np.sum(truth == predictions),
                               len(predictions)))

        correct_prediction += np.sum(truth == predictions)
        total_prediction += predictions.shape[0]

        current_precision = 1.0 * correct_prediction / total_prediction
        print(current_precision)
        print()

    # For Parseval, get true sensitivity, use to rescale the actual attack
    # bound as the noise assumes this to be 1 but often it is not.
    if hps.noise_scheme == 'l2_l2_s1':
        # Parseval updates usually have a sensitivity higher than 1
        # despite the projection: we need to rescale when computing
        # sensitivity.
        sensitivity_multiplier = float(
            sess.run(model.sensitivity_multiplier, {model.noise_scale: 1.0}))
    else:
        sensitivity_multiplier = 1.0
    with open(dir_name + "/sensitivity_multiplier.json", 'w') as f:
        d = [sensitivity_multiplier]
        f.write(json.dumps(d))

    # Compute robustness and add it to the eval data.
    dp_mechs = {
        'l2_l2_s1': 'gaussian',
        'l1_l2_s1': 'gaussian',
        'l1_l1_s1': 'laplace',
        'l1_l1': 'laplace',
        'l1_l2': 'gaussian',
        'l2': 'gaussian',
        'l1': 'laplace',
    }
    robustness = [
        utils.robustness_size(counts=x,
                              dp_attack_size=hps.attack_norm_bound,
                              dp_epsilon=1.0,
                              dp_delta=0.05,
                              dp_mechanism=dp_mechs[hps.noise_scheme]) /
        sensitivity_multiplier for x in data['predictions']
    ]
    data['robustness'] = robustness
    data['sensitivity_mult_used'] = sensitivity_multiplier

    # Log eval data
    with open(dir_name + "/eval_data.json", 'w') as f:
        f.write(json.dumps(data))

    # Print stuff
    precision = 1.0 * correct_prediction / total_prediction
    best_precision = max(precision, best_precision)
    precision_summ = tf.Summary()
    precision_summ.value.add(tag='Precision', simple_value=precision)
    summary_writer.add_summary(precision_summ, train_step)
    best_precision_summ = tf.Summary()
    best_precision_summ.value.add(tag='Best Precision',
                                  simple_value=best_precision)
    summary_writer.add_summary(best_precision_summ, train_step)
    summary_writer.add_summary(summaries, train_step)
    tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                    (loss, precision, best_precision))
    summary_writer.flush()
def get_model(hps, dataset, train_data_path, mode='train'): images, labels = cifar_input.build_input(dataset, train_data_path, hps.batch_size, mode) model = resnet_model.ResNet(hps, images, labels, mode) model.build_graph() return model
def train(args):
    """Complete training and validation script.

    Builds the ResNet training graph on CIFAR-10 input, runs the main
    optimisation loop under a ``tf.train.Supervisor`` (which owns
    checkpointing and summary writing), and optionally evaluates on the
    test split at a fixed step interval.

    Parameters
    ----------
    args : argparse namespace
        Contains all necessary command line arguments; this function reads
        ``args.resume``, ``args.save_path``, ``args.data_dir`` and
        ``args.run_validation``.

    Returns
    -------
    None
    """
    # Local import: only needed for the fresh-run cleanup below.
    import shutil

    # Fresh-run bookkeeping: recreate the save directory unless resuming.
    # Fix: the original used os.system("rm -rf %s" / "mkdir -p %s"), which
    # breaks on paths containing spaces or shell metacharacters and is a
    # shell-injection risk. shutil.rmtree(ignore_errors=True) matches
    # "rm -rf" (no error if absent); os.makedirs(exist_ok=True) matches
    # "mkdir -p".
    if not args.resume:
        shutil.rmtree(args.save_path, ignore_errors=True)
        os.makedirs(args.save_path, exist_ok=True)
    else:
        print('Resuming training')

    num_classes = 10
    batch_size = 128

    # Define and build the network graph
    net = ResNet(num_classes, strides=[[1, 1], [1, 1], [2, 2], [2, 2]])

    # Parse the csv files and define input ops for training and validation I/O
    print('Loading data from {}'.format(args.data_dir))
    x_train, y_train = reader.build_input(
        'cifar10', os.path.join(args.data_dir, 'data_batch*'), batch_size,
        'train')

    # Define training metrics and optimisation ops
    train_net = net(x_train)
    train_logits_ = train_net['logits']
    train_pred_ = train_net['y_']
    train_truth_ = y_train
    train_acc_ = tf.reduce_mean(
        tf.cast(
            tf.equal(tf.cast(train_truth_, tf.int32),
                     tf.cast(train_pred_, tf.int32)), tf.float32))
    modules.scalar_summary(train_acc_, 'train/acc',
                           collections=['losses', 'metrics'])
    ce = modules.sparse_crossentropy(train_logits_, train_truth_,
                                     name='train/loss',
                                     collections=['losses', 'training'])
    l2 = modules.l2_regularization(net.get_variables(tf.GraphKeys.WEIGHTS),
                                   0.0002, name='train/l2',
                                   collections=['training', 'regularization'])
    train_loss_ = ce + l2

    # The learning rate is fed per step so the piecewise schedule in the
    # loop below can be applied without rebuilding the graph.
    lr_placeholder = tf.placeholder(tf.float32)
    train_op_ = tf.train.MomentumOptimizer(lr_placeholder,
                                           0.9).minimize(train_loss_)
    train_summaries = tf.summary.merge([
        tf.summary.merge_all('training'),
    ] + [
        tf.summary.histogram(var.name, var)
        for var in net.get_variables(tf.GraphKeys.MOVING_AVERAGE_VARIABLES)
    ])

    if args.run_validation:
        X_test, Y_test = reader.build_input(
            'cifar10', os.path.join(args.data_dir, 'test_batch*'), 100,
            'eval')
        # Define validation outputs; is_training=False so the shared weights
        # are evaluated in inference mode.
        val_net = net(X_test, is_training=False)
        val_logits_ = val_net['logits']
        val_pred_ = val_net['y_']
        val_truth_ = Y_test
        val_loss_ = modules.sparse_crossentropy(
            val_logits_, val_truth_, collections=['losses', 'validation'])
        val_acc_ = tf.reduce_mean(
            tf.cast(
                tf.equal(tf.cast(val_truth_, tf.int32),
                         tf.cast(val_pred_, tf.int32)), tf.float32))

    # Define and setup a training supervisor.
    # NOTE(review): `tps` is not defined inside this function — presumably a
    # module-level training-parameters object; confirm it is in scope.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    sv = tf.train.Supervisor(logdir=args.save_path,
                             is_chief=True,
                             summary_op=None,
                             save_summaries_secs=tps.save_summary_sec,
                             save_model_secs=tps.save_model_sec,
                             global_step=global_step)
    s = sv.prepare_or_wait_for_session(config=tf.ConfigProto())

    # Main training loop
    step = s.run(global_step) if args.resume else 0
    while not sv.should_stop():
        # Piecewise-constant learning-rate schedule keyed on step count.
        if step < 40000:
            lr = 0.1
        elif step < 60000:
            lr = 0.01
        elif step < 80000:
            lr = 0.001
        else:
            lr = 0.0001

        # Run the training op
        _ = s.run(train_op_, feed_dict={lr_placeholder: lr})

        # Evaluation of training and validation data
        if step % tps.steps_eval == 0:
            (train_loss, train_acc, train_pred, train_truth, t_sum) = s.run([
                train_loss_, train_acc_, train_pred_, train_truth_,
                train_summaries
            ])
            sv.summary_computed(s, t_sum, global_step=step)
            print("\nEval step= {:d}".format(step))
            print("Train: Loss= {:.6f}; Acc {:.6f}".format(
                train_loss, train_acc))

            # Evaluate all validation data.
            # NOTE(review): 50 batches of 100 = 5000 examples, i.e. half of
            # the CIFAR-10 test split — confirm this sample size is intended.
            if args.run_validation:
                all_loss = []
                all_acc = []
                for _ in range(50):
                    (val_loss, val_pred, val_truth, val_acc) = s.run(
                        [val_loss_, val_pred_, val_truth_, val_acc_])
                    all_loss.append(val_loss)
                    all_acc.append(val_acc)
                mean_loss = np.mean(all_loss, axis=0)
                mean_acc = np.mean(all_acc, axis=0)
                sv.summary_computed(s, modules.scalar_summary(
                    mean_loss, 'val/loss'), global_step=step)
                sv.summary_computed(s, modules.scalar_summary(
                    mean_acc, 'val/acc'), global_step=step)
                print("Valid: Loss= {:.6f}; Acc {:.6f}".format(
                    mean_loss, mean_acc))

        # Stopping condition (a non-positive tps.max_steps disables the cap)
        if step >= tps.max_steps and tps.max_steps > 0:
            print('Run %d steps of %d steps - stopping now' %
                  (step, tps.max_steps))
            break
        step += 1