def evaluate(hps): """Eval loop.""" images, labels = hwdb_input.build_input( FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode) model = resnet_model.ResNet(hps, images, labels, FLAGS.mode) model.build_graph() saver = tf.train.Saver() summary_writer = tf.summary.FileWriter(FLAGS.eval_dir) config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.allow_growth=True sess = tf.Session(config=config) tf.train.start_queue_runners(sess) best_precision = 0.0 while True: try: ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root) except tf.errors.OutOfRangeError as e: tf.logging.error('Cannot restore checkpoint: %s', e) continue if not (ckpt_state and ckpt_state.model_checkpoint_path): tf.logging.info('No model to eval yet at %s', FLAGS.log_root) time.sleep(60) continue tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path) saver.restore(sess, ckpt_state.model_checkpoint_path) total_prediction, correct_prediction = 0, 0 for iter in six.moves.range(FLAGS.eval_batch_count): (summaries, loss, predictions, truth, train_step) = sess.run( [model.summaries, model.cost, model.predictions, model.labels, model.global_step]) truth = np.argmax(truth, axis=1) predictions = np.argmax(predictions, axis=1) correct_prediction += np.sum(truth == predictions) total_prediction += predictions.shape[0] #tf.logging.info('iter: %d' % (iter)) precision = 1.0 * correct_prediction / total_prediction best_precision = max(precision, best_precision) precision_summ = tf.Summary() precision_summ.value.add( tag='Precision', simple_value=precision) summary_writer.add_summary(precision_summ, train_step) best_precision_summ = tf.Summary() best_precision_summ.value.add( tag='Best Precision', simple_value=best_precision) summary_writer.add_summary(best_precision_summ, train_step) summary_writer.add_summary(summaries, train_step) tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' % (loss, precision, best_precision)) summary_writer.flush() if FLAGS.eval_once: break time.sleep(60)
def evaluate(hps, num_iterations, dataset):
    total_acc = 0.0
    print('Loading trained network, please wait......')

    # input data
    images, labels = resnet_input.input(dataset, hps.batch_size, 'eval')

    # resnet model
    model = resnet_model.ResNet(hps, images, labels, 'eval')
    model.build_graph()

    # run session
    coord = tf.train.Coordinator()
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    config.gpu_options.allow_growth = True
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        queue_runner = tf.train.start_queue_runners(sess=sess, coord=coord)
        saver.restore(sess, './model/model.ckpt')
        for i in range(num_iterations):
            acc = sess.run(model.accuracy)
            total_acc += acc
        total_acc /= num_iterations
        print('Total accuracy on test set is %.2f' % total_acc)
        coord.request_stop()
        coord.join(queue_runner)
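# Hypothetical driver for the evaluate() above, shown only to illustrate the
# call pattern. The HParams constructor arguments below are an assumption: the
# only fields this eval loop itself touches are batch_size (here) and init_lr
# (in the matching train() loop later in this collection), and the real field
# list depends on how resnet_model.HParams is defined in that repository.
hps = resnet_model.HParams(batch_size=128, init_lr=0.1)
evaluate(hps, num_iterations=100, dataset='cifar10')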
def __init__(self, data, eval_batch_count):
    hps = resnet_model.HParams(batch_size=100,
                               num_classes=10,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.1,
                               num_residual_units=5,
                               use_bottleneck=False,
                               weight_decay_rate=0.0002,
                               relu_leakiness=0.1,
                               optimizer='mom',
                               num_gpus=0)
    data = ray.get(data)
    total_images = np.concatenate([data[0], data[1], data[2]])
    with tf.Graph().as_default():
        with tf.device('/cpu:0'):
            images, labels = cifar_input.build_input(
                [total_images, data[3]], hps.batch_size, False)
            self.model = resnet_model.ResNet(hps, images, labels, 'eval')
            self.model.build_graph()
            config = tf.ConfigProto(allow_soft_placement=True)
            sess = tf.Session(config=config)
            self.model.variables.set_session(sess)
            self.coord = tf.train.Coordinator()
            tf.train.start_queue_runners(sess, coord=self.coord)
            init = tf.global_variables_initializer()
            sess.run(init)
            self.best_precision = 0.0
            self.eval_batch_count = eval_batch_count
def __init__(self, data, dataset, eval_batch_count, eval_dir):
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    hps = resnet_model.HParams(
        batch_size=100,
        num_classes=100 if dataset == "cifar100" else 10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer="mom",
        num_gpus=0)
    with tf.device("/cpu:0"):
        # Builds the testing network.
        images, labels = cifar_input.build_input(data, hps.batch_size,
                                                 dataset, False)
        self.model = resnet_model.ResNet(hps, images, labels, "eval")
        self.model.build_graph()

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        self.model.variables.set_session(sess)
        init = tf.global_variables_initializer()
        sess.run(init)

        # Initializing parameters for tensorboard.
        self.best_precision = 0.0
        self.eval_batch_count = eval_batch_count
        self.summary_writer = tf.summary.FileWriter(eval_dir, sess.graph)
        # The IP address where tensorboard logs will be on.
        self.ip_addr = ray.services.get_node_ip_address()
def __init__(self, data, num_gpus):
    if num_gpus > 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
            [str(i) for i in ray.get_gpu_ids()])
    hps = resnet_model.HParams(batch_size=128,
                               num_classes=10,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.1,
                               num_residual_units=5,
                               use_bottleneck=False,
                               weight_decay_rate=0.0002,
                               relu_leakiness=0.1,
                               optimizer='mom',
                               num_gpus=num_gpus)
    data = ray.get(data)
    total_images = np.concatenate([data[0], data[1], data[2]])
    with tf.Graph().as_default():
        if num_gpus > 0:
            tf.set_random_seed(ray.get_gpu_ids()[0] + 1)
        else:
            tf.set_random_seed(1)
        with tf.device('/gpu:0' if num_gpus > 0 else '/cpu:0'):
            images, labels = cifar_input.build_input(
                [total_images, data[3]], hps.batch_size, True)
            self.model = resnet_model.ResNet(hps, images, labels, 'train')
            self.model.build_graph()
            config = tf.ConfigProto(allow_soft_placement=True)
            sess = tf.Session(config=config)
            self.model.variables.set_session(sess)
            self.coord = tf.train.Coordinator()
            tf.train.start_queue_runners(sess, coord=self.coord)
            init = tf.global_variables_initializer()
            sess.run(init)
def evaluate(hps): """Eval loop.""" images, labels = cifar_input.build_input('cifar10', FLAGS.eval_data_path, hps.batch_size, 'eval') model = resnet_model.ResNet(hps, images, labels, 'eval') model.build_graph() sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) saver = tf.train.Saver() ################################## ## FIXME: Make a summary writer ## ################################## summary_writer = tf.summary.FileWriter(FLAGS.eval_dir) try: ckpt_state = tf.train.get_checkpoint_state(FLAGS.ckpt_dir) except tf.errors.OutOfRangeError as e: tf.logging.error('Cannot restore checkpoint: %s', e) if not (ckpt_state): tf.logging.info('No model to eval yet at %s', FLAGS.ckpt_dir) best_precision = 0. for i in range(len(ckpt_state.all_model_checkpoint_paths)): tf.logging.info('Loading checkpoint %s', ckpt_state.all_model_checkpoint_paths[i]) saver.restore(sess, ckpt_state.all_model_checkpoint_paths[i]) total_prediction, correct_prediction = 0, 0 for _ in six.moves.range(FLAGS.eval_batch_count): (summaries, loss, predictions, truth, train_step) = sess.run([ model.summaries, model.cost, model.predictions, model.labels, model.global_step ]) truth = np.argmax(truth, axis=1) predictions = np.argmax(predictions, axis=1) correct_prediction += np.sum(truth == predictions) total_prediction += predictions.shape[0] precision = 1.0 * correct_prediction / total_prediction best_precision = max(precision, best_precision) ######################################################## ## FIXME: Add summary of precision and best precision ## ######################################################## summ_precision = tf.Summary() summ_precision.value.add(tag='precision', simple_value=precision) summary_writer.add_summary(summ_precision, train_step) summ_best_precision = tf.Summary() summ_best_precision.value.add(tag='best_precision', simple_value=best_precision) summary_writer.add_summary(summ_best_precision, train_step) tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' % (loss, precision, best_precision)) summary_writer.flush()
def train(hps): """Training loop.""" images, labels = cifar_input.build_input( FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode) model = resnet_model.ResNet(hps, images, labels, FLAGS.mode) model.build_graph() summary_writer = tf.train.SummaryWriter(FLAGS.train_dir) sv = tf.train.Supervisor(logdir=FLAGS.log_root, is_chief=True, summary_op=None, save_summaries_secs=60, save_model_secs=300, global_step=model.global_step) sess = sv.prepare_or_wait_for_session() step = 0 total_prediction = 0 correct_prediction = 0 precision = 0.0 lrn_rate = 0.1 while not sv.should_stop(): (_, summaries, loss, predictions, truth, train_step) = sess.run( [model.train_op, model.summaries, model.cost, model.predictions, model.labels, model.global_step], feed_dict={model.lrn_rate: lrn_rate}) if train_step < 40000: lrn_rate = 0.1 elif train_step < 60000: lrn_rate = 0.01 elif train_step < 80000: lrn_rate = 0.001 else: lrn_rate = 0.0001 predictions = np.argmax(predictions, axis=1) truth = np.argmax(truth, axis=1) for (t, p) in zip(truth, predictions): if t == p: correct_prediction += 1 total_prediction += 1 precision = float(correct_prediction) / total_prediction correct_prediction = total_prediction = 0 step += 1 if step % 100 == 0: precision_summ = tf.Summary() precision_summ.value.add( tag='Precision', simple_value=precision) summary_writer.add_summary(precision_summ, train_step) summary_writer.add_summary(summaries, train_step) tf.logging.info('loss: %.3f, precision: %.3f\n' % (loss, precision)) summary_writer.flush() sv.Stop()
def train(hps):
    trainset = input.ImageSet(FLAGS.train_data)
    images, labels, _ = trainset.next_batch(FLAGS.batch_size)
    # images, labels = cifar_input.build_input(
    #     FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    truth = model.labels
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=10,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries, tf.summary.scalar('Precision', precision)]))

    logging_hook = tf.train.LoggingTensorHook(
        tensors={'step': model.global_step,
                 'loss': model.cost,
                 'precision': precision},
        every_n_iter=10)

    class _LearningRateSetterHook(tf.train.SessionRunHook):

        def begin(self):
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,
                feed_dict={model.lrn_rate: self._lrn_rate})

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            if train_step < 40000:
                self._lrn_rate = 0.1
            elif train_step < 60000:
                self._lrn_rate = 0.01
            elif train_step < 80000:
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook],
            save_summaries_steps=0,
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(model.train_op)
def getQfeature(filepath):
    image = read(filepath)
    labels = [3]
    hps = resnet_model.HParams(batch_size=1,
                               num_classes=4,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.1,
                               num_residual_units=5,
                               use_bottleneck=False,
                               weight_decay_rate=0.0002,
                               relu_leakiness=0.1,
                               optimizer='mom')
    model = resnet_model.ResNet(hps, image, labels, FLAGS.mode)
    model.build_graph()
    logits = tf.get_default_graph().get_tensor_by_name("logit/xw_plus_b:0")
    print(logits)
    logits_norm = tf.nn.l2_normalize(logits, 1)

    # Run our model
    steps = 1
    # *** Duplicate image features may exist; a later dict operation removes them.
    # Restoring the moving-average version of the learned variables gives better results.
    # for name in variables_to_restore:
    #     print(name)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Restore the model from a checkpoint.
        # Note: a checkpoint is not a single file, so do NOT restore like this:
        #     saver.restore(sess, '/path/to/model.ckpt-1000.index')
        # Don't forget to launch the input queues; using a coordinator avoids the
        # harmless 'Enqueue operation was cancelled' error (you could also just
        # call start_queue_runners directly).
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # 'ckpt' corresponds to the 'checkpoint' file.
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        # model_checkpoint_path looks something like: /path/to/model.ckpt-1000
        print(ckpt.model_checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        # fc1_list = fc2_list = fc3_list = []  # the same object!
        logits_list = []
        _logits = sess.run([logits_norm])  # returns an nd-array
        print('................')
        print(_logits)
        print('................')
        put_2darray(_logits, logits_list)
        return logits_list
def train(hps, num_iterations, dataset):
    with tf.Graph().as_default():
        # input data
        images, labels = resnet_input.input(dataset, hps.batch_size, 'train')

        # resnet model
        model = resnet_model.ResNet(hps, images, labels, 'train')
        model.build_graph()

        # summary hook
        merged_summary_op = tf.summary.merge_all()

        # run session
        coord = tf.train.Coordinator()
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
        config.gpu_options.allow_growth = True
        saver = tf.train.Saver()
        start_time = time.time()
        with tf.Session(config=config) as sess:
            # summaries
            train_writer = tf.summary.FileWriter('./train', graph=sess.graph)
            sess.run(tf.global_variables_initializer())
            queue_runner = tf.train.start_queue_runners(sess=sess, coord=coord)

            # train
            for i in range(num_iterations):
                # learning rate decay
                if i < 32000:
                    model.learning_rate = model.hps.init_lr
                elif i == 32000:
                    model.learning_rate /= 10
                elif i == 48000:
                    model.learning_rate /= 10

                _, acc, loss = sess.run(
                    [model.train_op, model.accuracy, model.cross_entropy])
                summary = sess.run(merged_summary_op)

                if i % 100 == 0:
                    train_writer.add_summary(summary, i)
                    print('iter %d, the loss is %.3f, accuracy on train set is %.2f'
                          % (i, loss, acc))
                if i % 1000 == 0:
                    saver.save(sess, 'model/model.ckpt')
                    print('learning rate -> %f' % model.learning_rate)

            coord.request_stop()
            coord.join(queue_runner)
            train_writer.close()

        stop_time = time.time()
        print('%d iterations takes %.2f seconds' %
              (num_iterations, stop_time - start_time))
def train(hps): """Training loop.""" images, labels = synthetic_data(hps.batch_size) model = resnet_model.ResNet(hps, images, labels, FLAGS.mode) model.build_graph() summary_writer = tf.train.SummaryWriter(FLAGS.train_dir) sv = tf.train.Supervisor(logdir=FLAGS.log_root, is_chief=True, summary_op=None, save_summaries_secs=60, save_model_secs=300, global_step=model.global_step) sess = sv.prepare_or_wait_for_session(config=tf.ConfigProto( allow_soft_placement=True)) step = 0 lrn_rate = 0.1 while not sv.should_stop(): (_, summaries, loss, predictions, truth, train_step) = sess.run([ model.train_op, model.summaries, model.cost, model.predictions, model.labels, model.global_step ], feed_dict={model.lrn_rate: lrn_rate}) if train_step < 40000: lrn_rate = 0.1 elif train_step < 60000: lrn_rate = 0.01 elif train_step < 80000: lrn_rate = 0.001 else: lrn_rate = 0.0001 truth = np.argmax(truth, axis=1) predictions = np.argmax(predictions, axis=1) precision = np.mean(truth == predictions) step += 1 if step % 100 == 0: precision_summ = tf.Summary() precision_summ.value.add(tag='Precision', simple_value=precision) summary_writer.add_summary(precision_summ, train_step) summary_writer.add_summary(summaries, train_step) tf.logging.info('loss: %.3f, precision: %.3f\n' % (loss, precision)) summary_writer.flush() sv.Stop()
def gene_prob(hps):
    """Generating loop."""
    images, labels, prob = cifar_input_v2.build_input(
        FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, 'eval')
    model = resnet_model.ResNet(hps, images, labels, prob, 'eval')
    model.build_graph()
    saver = tf.train.Saver()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    tf.train.start_queue_runners(sess)

    while True:
        try:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root_expert)
        except tf.errors.OutOfRangeError as e:
            tf.logging.error('Cannot restore checkpoint: %s', e)
            continue
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            tf.logging.info('No model to eval yet at %s',
                            FLAGS.log_root_expert)
            continue
        tf.logging.info('Loading checkpoint %s',
                        ckpt_state.model_checkpoint_path)
        saver.restore(sess, ckpt_state.model_checkpoint_path)

        obj_ = []
        i = 1
        count = 0
        f = open('prob.pkl', 'wb')
        for _ in six.moves.range(40000 // hps.batch_size):
            (predictions, truth) = sess.run([model.predictions, model.labels])
            pred_probability = np.sum(truth * predictions, axis=1)  # shape [128]
            if pred_probability[0] >= 0.90:
                count = count + 1
            obj_.append(pred_probability)
            if i % 100 == 0:
                print(i, '----->', pred_probability)
            i = i + 1
        print('the ratio is:', 1.0 * count / 40000)
        cPickle.dump(obj=obj_, file=f, protocol=0)
        f.close()

        # create bin file
        """
        file = open('prob.pkl', 'rb')
        data = cPickle.load(file)
        arr = np.array(data)
        arr.tofile("prob_pkl.bin")
        """
        break
def train(hps): """Training loop.""" single_gpu_graph = tf.Graph() with single_gpu_graph.as_default(): images, labels = cifar_input.build_input('cifar10', FLAGS.train_data_path, hps.batch_size, 'train') model = resnet_model.ResNet(hps, images, labels, 'train') model.build_graph() truth = tf.argmax(model.labels, axis=1) predictions = tf.argmax(model.predictions, axis=1) precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth))) ######################################################################## #### FIXME: Get session for distributed environments using Parallax #### #### Pass parallax_config as an argument #### ######################################################################## parallax_sess, num_workers, worker_id, num_replicas_per_worker = \ parallax.parallel_run(single_gpu_graph, FLAGS.resource_info_file, sync=FLAGS.sync, parallax_config=parallax_config.build_config()) for i in range(350000): _, global_step, cost, precision_ = \ parallax_sess.run([model.train_op, model.global_step, model.cost, precision]) if i % 10 == 0: print('step: %d, loss: %.3f, precision: %.3f' % (global_step[0], cost[0], precision_[0])) # Tuning learning rate train_step = global_step[0] if train_step < 10000: lrn_rate = 0.1 elif train_step < 15000: lrn_rate = 0.01 elif train_step < 20000: lrn_rate = 0.001 else: lrn_rate = 0.0001 feed_dict = {model.lrn_rate: []} for worker in range(num_replicas_per_worker): feed_dict[model.lrn_rate].append(lrn_rate) parallax_sess.run(model.global_step, feed_dict=feed_dict)
def test(hps):
    images, labels = resnet_model.inputs(FLAGS.eval_data_path,
                                         FLAGS.eval_batch_size,
                                         eval_data=True)
    model = resnet_model.ResNet(hps, images, labels, 'eval')
    model.build_graph()
    saver = tf.train.Saver(tf.global_variables())
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    # while True:
    #     try:
    #         ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    #     except tf.errors.OutOfRangeError as e:
    #         tf.logging.error('Cannot restore checkpoint: %s', e)
    #         continue
    #     if not (ckpt_state and ckpt_state.model_checkpoint_path):
    #         tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
    #         continue
    ckpt = tf.train.get_checkpoint_state(FLAGS.log_root)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        tf.logging.info('Loading checkpoint %s', ckpt.model_checkpoint_path)
    else:
        tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
        return

    tf.train.start_queue_runners(sess)

    test_acc, test_loss = 0, 0
    print('start')
    for _ in range(FLAGS.eval_batch_count):
        (summaries, loss, predictions, acc, train_step) = sess.run([
            model.summaries, model.costs, model.labels, model.acc,
            model.global_step
        ])
        test_acc += acc
        test_loss += loss
        print(acc)

    precision = 1.0 * test_acc / FLAGS.eval_batch_count
    total_loss = 1.0 * test_loss / FLAGS.eval_batch_count

    precision_summ = tf.Summary()
    precision_summ.value.add(tag='Precision', simple_value=precision)
    summary_writer.add_summary(precision_summ, train_step)
    summary_writer.add_summary(summaries, train_step)
    tf.logging.info('loss: %.3f, precision: %.3f' % (total_loss, precision))
    summary_writer.flush()
def train(hps):
    # resnet_model.ResNet calls tf.contrib.framework.get_or_create_global_step()
    # and _build_model(), which builds the resnet graph.
    model = resnet_model.ResNet(hps, train_features, train_labels_1, '')
    model._build_model()

    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(model.cost, trainable_variables)
    apply_op = optimizer.apply_gradients(zip(grads, trainable_variables),
                                         global_step=global_step,
                                         name='train_step')
    train_ops = [apply_op]
    model.train_op = tf.group(*train_ops)
    model.summaries = tf.summary.merge_all()

    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries, tf.summary.scalar('Precision', precision)]))

    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': global_step,
        'loss': model.cost,
        'precision': precision
    }, every_n_iter=100)

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook],
            chief_only_hooks=[summary_hook],
            save_summaries_steps=0,  # disable the default summary
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(model.train_op)
def __init__(self, data, dataset, num_gpus):
    if num_gpus > 0:
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            [str(i) for i in ray.get_gpu_ids()])
    hps = resnet_model.HParams(
        batch_size=128,
        num_classes=100 if dataset == "cifar100" else 10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer="mom",
        num_gpus=num_gpus)

    # We seed each actor differently so that each actor operates on a
    # different subset of data.
    if num_gpus > 0:
        tf.set_random_seed(ray.get_gpu_ids()[0] + 1)
    else:
        # Only a single actor in this case.
        tf.set_random_seed(1)

    input_images = data[0]
    input_labels = data[1]
    with tf.device("/gpu:0" if num_gpus > 0 else "/cpu:0"):
        # Build the model.
        images, labels = cifar_input.build_input(
            [input_images, input_labels], hps.batch_size, dataset, False)
        self.model = resnet_model.ResNet(hps, images, labels, "train")
        self.model.build_graph()
        config = tf.ConfigProto(allow_soft_placement=True)
        sess = tf.Session(config=config)
        self.model.variables.set_session(sess)
        self.coord = tf.train.Coordinator()
        tf.train.start_queue_runners(sess, coord=self.coord)
        init = tf.global_variables_initializer()
        sess.run(init)
        self.steps = 10
def _my_model_fn(features, labels, mode, params):
    del params  # unused, but needed for TPU training

    #
    # Model - Here we use the pre-built 'resnet_model'
    #
    model_params = resnet_model.HParams(
        batch_size=int(batch_size / FLAGS.num_replica),  # because batch is divided by TPU replicas
        num_classes=10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,  # 5 x (3 x sub 2) + 2 = 32 layers
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer='mom')
    train_model = resnet_model.ResNet(model_params, features, labels, 'train')
    train_model.build_graph(tpu_opt=True)

    # create evaluation metrics
    # truth = tf.argmax(train_model.labels, axis=1)
    # predictions = tf.argmax(train_model.predictions, axis=1)
    # precision = tf.reduce_mean(
    #     tf.to_float(tf.equal(predictions, truth)),
    #     name="precision")
    # accuracy = tf.metrics.accuracy(truth, predictions)
    # tf.summary.scalar('accuracy', accuracy[1])  # output to TensorBoard

    # define operations (Here we assume only the training operation!)
    # prediction_outputs = {
    #     "precision": precision
    # }
    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=train_model.cost,
        train_op=train_model.train_op,
        # predictions=prediction_outputs,
        eval_metrics=(metric_fn,
                      [train_model.labels, train_model.predictions]))
def infer(hps, X_infer, y_infer):
    """Inference process.

    Args:
        hps: Hyperparameters.
        X_infer: Paths of images. A 1-D numpy array of shape (N_infer, ).
        y_infer: Labels. A 1-D numpy array of shape (N_infer, ).
            Note that there are no labels at inference time, so 'y_infer' is
            never used here; it only acts as a placeholder because labels must
            be provided to build the model.
    """
    images, labels = cifar_input.build_infer_input(X_infer, y_infer, hps)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()
    saver = tf.train.Saver()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    tf.train.start_queue_runners(sess)

    try:
        ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    except tf.errors.OutOfRangeError as e:
        tf.logging.error('Cannot restore checkpoint: %s', e)
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
        tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    for _ in six.moves.range(FLAGS.infer_batch_count):
        predictions = sess.run(model.predictions)
        predictions = np.argmax(predictions, axis=1)
        # Store the predictions in a .txt file
        with open('./predict.txt', 'a') as f:
            for item in predictions.tolist():
                f.write(str(item) + '\n')
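# Hypothetical call site for the infer() above. The image paths and the HParams
# values are illustrative only; the field names mirror the other CIFAR examples
# in this collection, and y_infer is a dummy array because labels are unused at
# inference time (see the docstring above).
X_infer = np.array(['./images/img_0001.png', './images/img_0002.png'])
y_infer = np.zeros(X_infer.shape[0], dtype=np.int64)
hps = resnet_model.HParams(batch_size=2,
                           num_classes=10,
                           min_lrn_rate=0.0001,
                           lrn_rate=0.1,
                           num_residual_units=5,
                           use_bottleneck=False,
                           weight_decay_rate=0.0002,
                           relu_leakiness=0.1,
                           optimizer='mom')
infer(hps, X_infer, y_infer)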
def __init__(self, loader, args=None, curv=False):

    class Args():
        num_classes = 10
        resnet_width = 1
        num_resunits = 3
        nohess = False
        randvec = False
        poison = False
        n_grads_spec = 1
        batch_size = 128
        specreg_bn = False
        normalizer = 'filtnorm'
        bin_path = '/root/bin'
        weight_decay = 0.0

    self.args = Args() if args == None else args
    if curv:
        self.args.randvec = True
    self.home = os.environ['HOME']

    # model and data loader
    self.model = resnet_model.ResNet(self.args,
                                     mode='eval' if not curv else 'curv')
    self.loader = loader

    # session
    self.sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True)))
    self.sess.run(tf.global_variables_initializer())

    # build assign op to prevent memory leak from creating the node on each iteration
    self.inputweights = [
        tf.zeros_like(t) for t in tf.trainable_variables()
    ]
    self.assignop = [
        tf.assign(t, w)
        for t, w in zip(tf.trainable_variables(), self.inputweights)
    ]
def get_model_config(model):
    """Map model name to model network configuration."""
    if model == 'deep_mnist':
        mc = deepmnist_model.DeepMNISTModel()
    elif model == 'eng_acoustic_model':
        mc = engacoustic_model.EngAcousticModel()
    elif model == 'sensor_net':
        mc = sensornet_model.SensorNetModel()
    elif model == 'vgg11':
        mc = vgg_model.Vgg11Model()
    elif model == 'vgg13':
        mc = vgg_model.Vgg13Model()
    elif model == 'vgg16':
        mc = vgg_model.Vgg16Model()
    elif model == 'vgg19':
        mc = vgg_model.Vgg19Model()
    elif model == 'lenet':
        mc = lenet_model.Lenet5Model()
    elif model == 'googlenet':
        mc = googlenet_model.GooglenetModel()
    elif model == 'overfeat':
        mc = overfeat_model.OverfeatModel()
    elif model == 'alexnet':
        mc = alexnet_model.AlexnetModel()
    elif model == 'trivial':
        mc = trivial_model.TrivialModel()
    elif model == 'inception3':
        mc = inception_model.Inceptionv3Model()
    elif model == 'inception4':
        mc = inception_model.Inceptionv4Model()
    elif model in ('resnet18', 'resnet34', 'resnet50', 'resnet101',
                   'resnet152', 'resnet200', 'resnet269'):
        mc = resnet_model.ResNet(model)
    else:
        raise KeyError('Invalid model name \'%s\'' % model)
    return mc
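# Minimal usage sketch for get_model_config() above: resolve a model name to
# its configuration object and surface unknown names as errors. The name
# 'resnet50' is just one of the strings the function accepts.
try:
    mc = get_model_config('resnet50')
except KeyError as e:
    print('Unsupported model: %s' % e)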
def get_model(hps, dataset, train_data_path, mode='train'):
    images, labels = cifar_input.build_input(dataset, train_data_path,
                                             hps.batch_size, mode)
    model = resnet_model.ResNet(hps, images, labels, mode)
    model.build_graph()
    return model
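# Hypothetical usage of get_model() above. The HParams fields are copied from
# the CIFAR-10 training examples elsewhere in this collection (the exact field
# set, e.g. whether num_gpus is present, depends on the resnet_model in use),
# and the data path is a placeholder.
hps = resnet_model.HParams(batch_size=128,
                           num_classes=10,
                           min_lrn_rate=0.0001,
                           lrn_rate=0.1,
                           num_residual_units=5,
                           use_bottleneck=False,
                           weight_decay_rate=0.0002,
                           relu_leakiness=0.1,
                           optimizer='mom')
model = get_model(hps, 'cifar10', './data/cifar-10-batches-bin/data_batch*',
                  mode='train')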
def train_resnet_mentornet(max_step_run):
    """Trains the mentornet with the student resnet model.

    Args:
      max_step_run: The maximum number of gradient steps.
    """
    if not os.path.exists(FLAGS.train_log_dir):
        os.makedirs(FLAGS.train_log_dir)
    g = tf.Graph()

    with g.as_default():
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            tf_global_step = tf.train.get_or_create_global_step()

            # pylint: disable=line-too-long
            images, one_hot_labels, clean_images, clean_one_hot_labels, num_samples_per_epoch, num_of_classes = cifar_data_provider.my_provide_resnet_data(
                FLAGS.dataset_name,
                'train',
                FLAGS.batch_size,
                dataset_dir=FLAGS.data_dir)

            hps = resnet_model.HParams(
                batch_size=FLAGS.batch_size,
                num_classes=num_of_classes,
                min_lrn_rate=0.0001,
                lrn_rate=FLAGS.learning_rate,
                num_residual_units=9,
                use_bottleneck=False,
                weight_decay_rate=0.0002,
                relu_leakiness=0.1,
                optimizer='mom')

            images.set_shape([FLAGS.batch_size, 32, 32, 3])

            tf.logging.info('num_of_example=%s', num_samples_per_epoch)

            # Define the model:
            resnet = resnet_model.ResNet(hps, images, one_hot_labels,
                                         mode='train')
            logits = resnet.build_model()

            # Specify the loss function:
            loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=one_hot_labels, logits=logits)

            dropout_rates = utils.parse_dropout_rate_list(
                FLAGS.example_dropout_rates)
            example_dropout_rates = tf.convert_to_tensor(
                dropout_rates, np.float32, name='example_dropout_rates')

            loss_p_percentile = tf.convert_to_tensor(
                np.array([FLAGS.loss_p_percentile] * 100),
                np.float32,
                name='loss_p_percentile')

            loss = tf.reshape(loss, [-1, 1])

            epoch_step = tf.to_int32(
                tf.floor(tf.divide(tf_global_step, max_step_run) * 100))

            zero_labels = tf.zeros([tf.shape(loss)[0], 1], tf.float32)

            v = utils.mentornet(
                epoch_step,
                loss,
                zero_labels,
                loss_p_percentile,
                example_dropout_rates,
                burn_in_epoch=FLAGS.burn_in_epoch,
                fixed_epoch_after_burn_in=FLAGS.fixed_epoch_after_burn_in,
                loss_moving_average_decay=FLAGS.loss_moving_average_decay)

            tf.stop_gradient(v)

            # Split v into clean data & noise data part
            is_clean = tf.reshape(
                tf.reduce_all(tf.equal(one_hot_labels, clean_one_hot_labels),
                              axis=1), [-1, 1])
            clean_v = tf.boolean_mask(v, is_clean)
            noise_v = tf.boolean_mask(v, ~is_clean)
            tf.add_to_collection('v', v)
            tf.add_to_collection('v', clean_v)
            tf.add_to_collection('v', noise_v)

            slim.summaries.add_histogram_summary(
                tf.boolean_mask(v, is_clean), 'clean_v')
            slim.summaries.add_histogram_summary(
                tf.boolean_mask(v, ~is_clean), 'noisy_v')

            # Log data utilization
            data_util = utils.summarize_data_utilization(
                v, tf_global_step, FLAGS.batch_size)

            decay_loss = resnet.decay()
            weighted_loss_vector = tf.multiply(loss, v)
            weighted_loss = tf.reduce_mean(weighted_loss_vector)

            slim.summaries.add_scalar_summary(
                tf.reduce_mean(loss), 'mentornet/orig_loss')
            slim.summaries.add_scalar_summary(weighted_loss,
                                              'mentornet/weighted_loss')

            # Normalize the decay loss based on v
            weighed_decay_loss = decay_loss * (
                tf.reduce_sum(v) / FLAGS.batch_size)
            weighted_total_loss = weighted_loss + weighed_decay_loss

            slim.summaries.add_scalar_summary(weighted_total_loss,
                                              'mentornet/total_loss')
            slim.summaries.add_scalar_summary(weighted_total_loss,
                                              'total_loss')
            tf.add_to_collection('total_loss', weighted_total_loss)

            boundaries = [19531, 25000, 30000]
            values = [FLAGS.learning_rate * t for t in [1, 0.1, 0.01, 0.001]]
            lr = tf.train.piecewise_constant(tf_global_step, boundaries,
                                             values)
            slim.summaries.add_scalar_summary(lr, 'learning_rate')

            # Specify the optimization scheme:
            with tf.control_dependencies([weighted_total_loss, data_util]):
                # Set up training.
                trainable_variables = tf.trainable_variables()
                trainable_variables = tf.contrib.framework.filter_variables(
                    trainable_variables, exclude_patterns=['mentornet'])

                grads = tf.gradients(weighted_total_loss, trainable_variables)
                optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)

                apply_op = optimizer.apply_gradients(
                    zip(grads, trainable_variables),
                    global_step=tf_global_step,
                    name='train_step')

                train_ops = [apply_op] + resnet.extra_train_ops
                train_op = tf.group(*train_ops)

        # Parameter restore setup
        if FLAGS.trained_mentornet_dir is not None:
            ckpt_model = FLAGS.trained_mentornet_dir
            if os.path.isdir(FLAGS.trained_mentornet_dir):
                ckpt_model = tf.train.latest_checkpoint(ckpt_model)

            # Fix the mentornet parameters
            variables_to_restore = slim.get_variables_to_restore(
                # TODO(lujiang): mentornet_inputs or mentor_inputs?
                include=['mentornet', 'mentornet_inputs'])
            iassign_op1, ifeed_dict1 = tf.contrib.framework.assign_from_checkpoint(
                ckpt_model, variables_to_restore)

            # Create an initial assignment function.
            def init_assign_fn(sess):
                tf.logging.info('Restore using customer initializer %s',
                                '.' * 10)
                sess.run(iassign_op1, ifeed_dict1)
        else:
            init_assign_fn = None

        tf.logging.info('-' * 20 + 'MentorNet' + '-' * 20)
        tf.logging.info('loaded pretrained mentornet from %s', ckpt_model)
        tf.logging.info('loss_p_percentile=%3f', FLAGS.loss_p_percentile)
        tf.logging.info('burn_in_epoch=%d', FLAGS.burn_in_epoch)
        tf.logging.info('fixed_epoch_after_burn_in=%s',
                        FLAGS.fixed_epoch_after_burn_in)
        tf.logging.info('loss_moving_average_decay=%3f',
                        FLAGS.loss_moving_average_decay)
        tf.logging.info('example_dropout_rates %s',
                        ','.join(str(t) for t in dropout_rates))
        tf.logging.info('-' * 20)

        saver = tf.train.Saver(max_to_keep=10,
                               keep_checkpoint_every_n_hours=24)

        # Run training.
        slim.learning.train(
            train_op=train_op,
            train_step_fn=resnet_train_step,
            logdir=FLAGS.train_log_dir,
            master=FLAGS.master,
            is_chief=FLAGS.task == 0,
            saver=saver,
            number_of_steps=max_step_run,
            init_fn=init_assign_fn,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs)
def train(hps):
    # Build the input data (queue runners for reading).
    images, labels = cifar_input.build_input(
        FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
    # Build the residual network model.
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    # Compute the prediction precision.
    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    # Summary saver hook: writes summaries every 100 steps.
    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries, tf.summary.scalar('Precision', precision)]))

    # Logging hook: prints every 100 steps.
    logging_hook = tf.train.LoggingTensorHook(
        tensors={'step': model.global_step,
                 'loss': model.cost,
                 'precision': precision},
        every_n_iter=100)

    # Learning rate scheduler, driven by the global step.
    class _LearningRateSetterHook(tf.train.SessionRunHook):

        def begin(self):
            # Initial learning rate.
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                # Fetch the global step.
                model.global_step,
                # Set the learning rate.
                feed_dict={model.lrn_rate: self._lrn_rate})

        def after_run(self, run_context, run_values):
            # Update the learning rate based on the global step.
            train_step = run_values.results
            if train_step < 40000:
                self._lrn_rate = 0.1
            elif train_step < 60000:
                self._lrn_rate = 0.01
            elif train_step < 80000:
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    # Create the monitored training session.
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook],
            # Disable the default SummarySaverHook by setting
            # save_summaries_steps to 0.
            save_summaries_steps=0,
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            # Run the training op.
            mon_sess.run(model.train_op)
def evaluate(hps):
    # Build the input data (queue runners for reading).
    images, labels = cifar_input.build_input(
        FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode)
    # Build the residual network model.
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()
    # Saver for the model variables.
    saver = tf.train.Saver()
    # Summary file writer.
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

    # Create the session.
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # Start all queue runners.
    tf.train.start_queue_runners(sess)

    best_precision = 0.0
    while True:
        # Check for a checkpoint file.
        try:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
        except tf.errors.OutOfRangeError as e:
            tf.logging.error('Cannot restore checkpoint: %s', e)
            continue
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
            continue

        # Load the model parameters produced during training.
        tf.logging.info('Loading checkpoint %s',
                        ckpt_state.model_checkpoint_path)
        saver.restore(sess, ckpt_state.model_checkpoint_path)

        # Run the test set batch by batch.
        total_prediction, correct_prediction = 0, 0
        for _ in six.moves.range(FLAGS.eval_batch_count):
            # Run the predictions.
            (loss, predictions, truth, train_step) = sess.run(
                [model.cost, model.predictions, model.labels,
                 model.global_step])
            # Accumulate the prediction results.
            truth = np.argmax(truth, axis=1)
            predictions = np.argmax(predictions, axis=1)
            correct_prediction += np.sum(truth == predictions)
            total_prediction += predictions.shape[0]

        # Compute the precision.
        precision = 1.0 * correct_prediction / total_prediction
        best_precision = max(precision, best_precision)

        # Add the precision summary.
        precision_summ = tf.Summary()
        precision_summ.value.add(tag='Precision', simple_value=precision)
        summary_writer.add_summary(precision_summ, train_step)
        # Add the best precision summary.
        best_precision_summ = tf.Summary()
        best_precision_summ.value.add(
            tag='Best Precision', simple_value=best_precision)
        summary_writer.add_summary(best_precision_summ, train_step)
        # Add the eval summaries.
        # summary_writer.add_summary(summaries, train_step)
        # Print the log.
        tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                        (loss, precision, best_precision))
        # Flush the summary writer.
        summary_writer.flush()

        if FLAGS.eval_once:
            break
        time.sleep(60)
def train(hps): """Training loop.""" images, labels = cifar_input.build_input(FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode, hps.data_format) model = resnet_model.ResNet(hps, images, labels, FLAGS.mode) model.build_graph() param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis( tf.get_default_graph(), tfprof_options=tf.contrib.tfprof.model_analyzer. TRAINABLE_VARS_PARAMS_STAT_OPTIONS) sys.stdout.write('total_params: %d\n' % param_stats.total_parameters) tf.contrib.tfprof.model_analyzer.print_model_analysis( tf.get_default_graph(), tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS) truth = tf.argmax(model.labels, axis=1) predictions = tf.argmax(model.predictions, axis=1) precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth))) summary_hook = tf.train.SummarySaverHook( save_steps=100, output_dir=FLAGS.train_dir, summary_op=tf.summary.merge( [model.summaries, tf.summary.scalar('Precision', precision)])) num_steps_per_epoch = 391 # TODO: Don't hardcode this. logging_hook = tf.train.LoggingTensorHook(tensors={ 'step': model.global_step, 'loss': model.cost, 'precision': precision }, every_n_iter=100) class _LearningRateSetterHook(tf.train.SessionRunHook): """Sets learning_rate based on global step.""" def begin(self): self._lrn_rate = 0.01 def before_run(self, run_context): return tf.train.SessionRunArgs( model.global_step, # Asks for global step value. feed_dict={model.lrn_rate: self._lrn_rate}) # Sets learning rate def after_run(self, run_context, run_values): train_step = run_values.results if train_step < num_steps_per_epoch: self._lrn_rate = 0.01 elif train_step < (91 * num_steps_per_epoch): self._lrn_rate = 0.1 elif train_step < (136 * num_steps_per_epoch): self._lrn_rate = 0.01 elif train_step < (181 * num_steps_per_epoch): self._lrn_rate = 0.001 else: self._lrn_rate = 0.0001 class _SaverHook(tf.train.SessionRunHook): """Sets learning_rate based on global step.""" def begin(self): self.saver = tf.train.Saver(max_to_keep=10000) subprocess.call("rm -rf %s; mkdir -p %s" % (FLAGS.checkpoint_dir, FLAGS.checkpoint_dir), shell=True) self.f = open(os.path.join(FLAGS.checkpoint_dir, "times.log"), 'w') def after_create_session(self, sess, coord): self.sess = sess self.start_time = time.time() def before_run(self, run_context): return tf.train.SessionRunArgs( model.global_step # Asks for global step value. ) def after_run(self, run_context, run_values): train_step = run_values.results epoch = train_step / num_steps_per_epoch if train_step % num_steps_per_epoch == 0: end_time = time.time() directory = os.path.join(FLAGS.checkpoint_dir, ("%5d" % epoch).replace(' ', '0')) subprocess.call("mkdir -p %s" % directory, shell=True) ckpt_name = 'model.ckpt' self.saver.save(self.sess, os.path.join(directory, ckpt_name), global_step=train_step) self.f.write("Step: %d\tTime: %s\n" % (train_step, end_time - self.start_time)) print("Saved checkpoint after %d epoch(s) to %s..." % (epoch, directory)) sys.stdout.flush() self.start_time = time.time() def end(self, sess): self.f.close() with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.log_root, hooks=[logging_hook, _LearningRateSetterHook()], chief_only_hooks=[summary_hook, _SaverHook()], save_checkpoint_secs=None, # Since we provide a SummarySaverHook, we need to disable default # SummarySaverHook. To do that we set save_summaries_steps to 0. 
save_summaries_steps=None, save_summaries_secs=None, config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess: for i in range(num_steps_per_epoch * 181): mon_sess.run(model.train_op)
def evaluate(hps): """Eval loop.""" images, labels = cifar_input.build_input(FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode, hps.data_format) model = resnet_model.ResNet(hps, images, labels, FLAGS.mode) model.build_graph() saver = tf.train.Saver() summary_writer = tf.summary.FileWriter(FLAGS.eval_dir) sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) tf.train.start_queue_runners(sess) best_precision = 0.0 while True: try: ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root) except tf.errors.OutOfRangeError as e: tf.logging.error('Cannot restore checkpoint: %s', e) continue if not (ckpt_state and ckpt_state.model_checkpoint_path): tf.logging.info('No model to eval yet at %s', FLAGS.log_root) break tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path) saver.restore(sess, ckpt_state.model_checkpoint_path) global_step = ckpt_state.model_checkpoint_path.split('/')[-1].split( '-')[-1] if not global_step.isdigit(): global_step = 0 else: global_step = int(global_step) total_prediction, correct_prediction, correct_prediction_top5 = 0, 0, 0 start_time = time.time() for _ in six.moves.range(FLAGS.eval_batch_count): (summaries, loss, predictions, truth, train_step) = sess.run([ model.summaries, model.cost, model.predictions, model.labels, model.global_step ]) if not FLAGS.time_inference: for (indiv_truth, indiv_prediction) in zip(truth, predictions): indiv_truth = np.argmax(indiv_truth) top5_prediction = np.argsort(indiv_prediction)[-5:] top1_prediction = np.argsort(indiv_prediction)[-1] correct_prediction += (indiv_truth == top1_prediction) if indiv_truth in top5_prediction: correct_prediction_top5 += 1 total_prediction += 1 if FLAGS.time_inference: print("Time for inference: %.4f" % (time.time() - start_time)) else: precision = 1.0 * correct_prediction / total_prediction precision_top5 = 1.0 * correct_prediction_top5 / total_prediction best_precision = max(precision, best_precision) precision_summ = tf.Summary() precision_summ.value.add(tag='Precision', simple_value=precision) summary_writer.add_summary(precision_summ, train_step) best_precision_summ = tf.Summary() best_precision_summ.value.add(tag='Best Precision', simple_value=best_precision) summary_writer.add_summary(best_precision_summ, train_step) summary_writer.add_summary(summaries, train_step) print('Precision @ 1 = %.4f, Recall @ 5 = %.4f, Global step = %d' % (precision, precision_top5, global_step)) summary_writer.flush() if FLAGS.eval_once: break time.sleep(60)
def train_resnet_baseline(max_step_run):
    """Trains the resnet baseline model.

    Args:
      max_step_run: The maximum number of gradient steps.
    """
    if not os.path.exists(FLAGS.train_log_dir):
        os.makedirs(FLAGS.train_log_dir)
    g = tf.Graph()

    with g.as_default():
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            tf_global_step = tf.train.get_or_create_global_step()

            # pylint: disable=line-too-long
            images, one_hot_labels, num_samples_per_epoch, num_of_classes = cifar_data_provider.provide_resnet_data(
                FLAGS.dataset_name,
                'train',
                FLAGS.batch_size,
                dataset_dir=FLAGS.data_dir)

            hps = resnet_model.HParams(batch_size=FLAGS.batch_size,
                                       num_classes=num_of_classes,
                                       min_lrn_rate=0.0001,
                                       lrn_rate=FLAGS.learning_rate,
                                       num_residual_units=9,
                                       use_bottleneck=False,
                                       weight_decay_rate=0.0002,
                                       relu_leakiness=0.1,
                                       optimizer='mom')

            images.set_shape([FLAGS.batch_size, 32, 32, 3])

            tf.logging.info('num_of_example={}'.format(num_samples_per_epoch))

            # Define the model:
            resnet = resnet_model.ResNet(hps, images, one_hot_labels,
                                         mode='train')
            logits = resnet.build_model()

            # Specify the loss function:
            total_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=one_hot_labels, logits=logits)
            total_loss = tf.reduce_mean(total_loss, name='xent')
            total_loss += resnet.decay()  # decay
            tf.add_to_collection('total_loss', total_loss)

            decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                              FLAGS.num_epochs_per_decay)

            boundaries = [19531, 25000, 30000]
            values = [FLAGS.learning_rate * t for t in [1, 0.1, 0.01, 0.001]]
            lr = tf.train.piecewise_constant(tf_global_step, boundaries,
                                             values)
            slim.summaries.add_scalar_summary(lr, 'learning_rate',
                                              print_summary=True)

            lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                            tf_global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)

            slim.summaries.add_scalar_summary(total_loss, 'total_loss',
                                              print_summary=True)

            # Set up training.
            trainable_variables = tf.trainable_variables()
            grads = tf.gradients(total_loss, trainable_variables)
            optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)

            apply_op = optimizer.apply_gradients(
                zip(grads, trainable_variables),
                global_step=tf_global_step,
                name='train_step')

            train_ops = [apply_op] + resnet.extra_train_ops
            train_op = tf.group(*train_ops)

            saver = tf.train.Saver(max_to_keep=10,
                                   keep_checkpoint_every_n_hours=24)

            # Run training.
            slim.learning.train(train_op=train_op,
                                train_step_fn=resnet_train_step,
                                logdir=FLAGS.train_log_dir,
                                master=FLAGS.master,
                                saver=saver,
                                is_chief=FLAGS.task == 0,
                                number_of_steps=max_step_run,
                                save_summaries_secs=FLAGS.save_summaries_secs,
                                save_interval_secs=FLAGS.save_interval_secs)
def train_resnet_mentormix(max_step_run):
    """Trains the mentornet with the student resnet model.

    Args:
      max_step_run: The maximum number of gradient steps.
    """
    if not os.path.exists(FLAGS.train_log_dir):
        os.makedirs(FLAGS.train_log_dir)
    g = tf.Graph()

    with g.as_default():
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            tf_global_step = tf.train.get_or_create_global_step()

            (images, one_hot_labels, num_samples_per_epoch,
             num_of_classes) = cifar_data_provider.provide_resnet_data(
                 FLAGS.dataset_name,
                 'train',
                 FLAGS.batch_size,
                 dataset_dir=FLAGS.data_dir)

            hps = resnet_model.HParams(batch_size=FLAGS.batch_size,
                                       num_classes=num_of_classes,
                                       min_lrn_rate=0.0001,
                                       lrn_rate=FLAGS.learning_rate,
                                       num_residual_units=5,
                                       use_bottleneck=False,
                                       weight_decay_rate=0.0002,
                                       relu_leakiness=0.1,
                                       optimizer='mom')

            images.set_shape([FLAGS.batch_size, 32, 32, 3])

            # Define the model:
            resnet = resnet_model.ResNet(hps, images, one_hot_labels,
                                         mode='train')
            with tf.variable_scope('ResNet32'):
                logits = resnet.build_model()

            # Specify the loss function:
            loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=one_hot_labels, logits=logits)

            dropout_rates = utils.parse_dropout_rate_list(
                FLAGS.example_dropout_rates)
            example_dropout_rates = tf.convert_to_tensor(
                dropout_rates, np.float32, name='example_dropout_rates')

            loss_p_percentile = tf.convert_to_tensor(
                np.array([FLAGS.loss_p_percentile] * 100),
                np.float32,
                name='loss_p_percentile')

            loss = tf.reshape(loss, [-1, 1])

            epoch_step = tf.to_int32(
                tf.floor(tf.divide(tf_global_step, max_step_run) * 100))

            zero_labels = tf.zeros([tf.shape(loss)[0], 1], tf.float32)

            mentornet_net_hparams = utils.get_mentornet_network_hyperparameter(
                FLAGS.trained_mentornet_dir)

            # In the simplest case, this function can be replaced with a
            # thresholding function. See loss_thresholding_function in utils.py.
            v = utils.mentornet(epoch_step,
                                loss,
                                zero_labels,
                                loss_p_percentile,
                                example_dropout_rates,
                                burn_in_epoch=FLAGS.burn_in_epoch,
                                mentornet_net_hparams=mentornet_net_hparams,
                                avg_name='individual')

            v = tf.stop_gradient(v)
            loss = tf.stop_gradient(tf.identity(loss))
            logits = tf.stop_gradient(tf.identity(logits))

            # Perform MentorMix
            images_mix, labels_mix = utils.mentor_mix_up(
                images, one_hot_labels, v, FLAGS.mixup_alpha)
            resnet = resnet_model.ResNet(hps, images_mix, labels_mix,
                                         mode='train')
            with tf.variable_scope('ResNet32', reuse=True):
                logits_mix = resnet.build_model()

            loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels_mix,
                                                           logits=logits_mix)
            decay_loss = resnet.decay()

            # second weighting
            if FLAGS.second_reweight:
                loss = tf.reshape(loss, [-1, 1])
                v = utils.mentornet(
                    epoch_step,
                    loss,
                    zero_labels,
                    loss_p_percentile,
                    example_dropout_rates,
                    burn_in_epoch=FLAGS.burn_in_epoch,
                    mentornet_net_hparams=mentornet_net_hparams,
                    avg_name='mixed')
                v = tf.stop_gradient(v)
                weighted_loss_vector = tf.multiply(loss, v)
                loss = tf.reduce_mean(weighted_loss_vector)
                # reproduced with the following decay loss which should be 0.
                decay_loss = tf.losses.get_regularization_loss()
                decay_loss = decay_loss * (tf.reduce_sum(v) / FLAGS.batch_size)

            # Log data utilization
            data_util = utils.summarize_data_utilization(
                v, tf_global_step, FLAGS.batch_size)

            loss = tf.reduce_mean(loss)
            slim.summaries.add_scalar_summary(tf.reduce_mean(loss),
                                              'mentormix/mix_loss')

            weighted_total_loss = loss + decay_loss

            slim.summaries.add_scalar_summary(weighted_total_loss,
                                              'total_loss')
            tf.add_to_collection('total_loss', weighted_total_loss)

            # Set up the moving averages:
            moving_average_variables = tf.trainable_variables()
            moving_average_variables = tf.contrib.framework.filter_variables(
                moving_average_variables, exclude_patterns=['mentornet'])

            variable_averages = tf.train.ExponentialMovingAverage(
                0.9999, tf_global_step)
            tf.add_to_collection(
                tf.GraphKeys.UPDATE_OPS,
                variable_averages.apply(moving_average_variables))

            decay_steps = FLAGS.num_epochs_per_decay * num_samples_per_epoch / FLAGS.batch_size
            lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                            tf_global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            lr = tf.squeeze(lr)
            slim.summaries.add_scalar_summary(lr, 'learning_rate')

            # Specify the optimization scheme:
            with tf.control_dependencies([weighted_total_loss, data_util]):
                # Set up training.
                trainable_variables = tf.trainable_variables()
                trainable_variables = tf.contrib.framework.filter_variables(
                    trainable_variables, exclude_patterns=['mentornet'])

                grads = tf.gradients(weighted_total_loss, trainable_variables)
                optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)

                apply_op = optimizer.apply_gradients(
                    zip(grads, trainable_variables),
                    global_step=tf_global_step,
                    name='train_step')

                train_ops = [apply_op
                            ] + resnet.extra_train_ops + tf.get_collection(
                                tf.GraphKeys.UPDATE_OPS)
                train_op = tf.group(*train_ops)

        # Parameter restore setup
        if FLAGS.trained_mentornet_dir is not None:
            ckpt_model = FLAGS.trained_mentornet_dir
            if os.path.isdir(FLAGS.trained_mentornet_dir):
                ckpt_model = tf.train.latest_checkpoint(ckpt_model)

            # Fix the mentornet parameters
            variables_to_restore = slim.get_variables_to_restore(
                include=['mentornet', 'mentornet_inputs'])
            iassign_op1, ifeed_dict1 = tf.contrib.framework.assign_from_checkpoint(
                ckpt_model, variables_to_restore)

            # Create an initial assignment function.
            def init_assign_fn(sess):
                tf.logging.info('Restore using customer initializer %s',
                                '.' * 10)
                sess.run(iassign_op1, ifeed_dict1)
        else:
            init_assign_fn = None

        tf.logging.info('-' * 20 + 'MentorMix' + '-' * 20)
        tf.logging.info('loss_p_percentile=%3f', FLAGS.loss_p_percentile)
        tf.logging.info('mixup_alpha=%d', FLAGS.mixup_alpha)
        tf.logging.info('-' * 20)

        saver = tf.train.Saver(max_to_keep=10,
                               keep_checkpoint_every_n_hours=24)

        # Run training.
        slim.learning.train(train_op=train_op,
                            train_step_fn=resnet_train_step,
                            logdir=FLAGS.train_log_dir,
                            master=FLAGS.master,
                            is_chief=FLAGS.task == 0,
                            saver=saver,
                            number_of_steps=max_step_run,
                            init_fn=init_assign_fn,
                            save_summaries_secs=FLAGS.save_summaries_secs,
                            save_interval_secs=FLAGS.save_interval_secs)
def train(hps): """Training loop.""" images, labels = cifar_input.build_input(FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode) model = resnet_model.ResNet(hps, images, labels, FLAGS.mode) model.build_graph() param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis( tf.get_default_graph(), tfprof_options=tf.contrib.tfprof.model_analyzer. TRAINABLE_VARS_PARAMS_STAT_OPTIONS) sys.stdout.write('total_params: %d\n' % param_stats.total_parameters) tf.contrib.tfprof.model_analyzer.print_model_analysis( tf.get_default_graph(), tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS) truth = tf.argmax(model.labels, axis=1) predictions = tf.argmax(model.predictions, axis=1) precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth))) summary_hook = tf.train.SummarySaverHook( save_steps=100, output_dir=FLAGS.train_dir, summary_op=tf.summary.merge( [model.summaries, tf.summary.scalar('Precision', precision)])) logging_hook = tf.train.LoggingTensorHook(tensors={ 'step': model.global_step, 'loss': model.cost, 'precision': precision }, every_n_iter=100) class _LearningRateSetterHook(tf.train.SessionRunHook): """Sets learning_rate based on global step.""" def begin(self): self._lrn_rate = 0.1 def before_run(self, run_context): return tf.train.SessionRunArgs( model.global_step, # Asks for global step value. feed_dict={model.lrn_rate: self._lrn_rate}) # Sets learning rate def after_run(self, run_context, run_values): train_step = run_values.results if train_step < 40000: self._lrn_rate = 0.1 elif train_step < 60000: self._lrn_rate = 0.01 elif train_step < 80000: self._lrn_rate = 0.001 else: self._lrn_rate = 0.0001 with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.log_root, hooks=[logging_hook, _LearningRateSetterHook()], chief_only_hooks=[summary_hook], # Since we provide a SummarySaverHook, we need to disable default # SummarySaverHook. To do that we set save_summaries_steps to 0. save_summaries_steps=0, config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess: while not mon_sess.should_stop(): mon_sess.run(model.train_op)
def train(self, hps):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    img = tf.placeholder(tf.float32, shape=[self.batch_num, 32, 32, 3])
    labels = tf.placeholder(tf.int32, shape=[self.batch_num, ])
    model = resnet_model.ResNet(hps, img, labels, 'train')
    model.build_graph()
    merged = model.summaries
    train_writer = tf.summary.FileWriter("/tmp/train_log", sess.graph)
    sess.run(tf.global_variables_initializer())
    print('Done initializing variables')
    print('Running model...')

    # Set default learning rate for scheduling
    lr = args.lr

    for j in range(self.num_epoch):
        print('Epoch {}'.format(j + 1))

        # Decrease learning rate every args.lr_schedule epochs
        # by args.lr_factor
        if (j + 1) % args.lr_schedule == 0:
            lr *= args.lr_factor

        for i in range(self.num_iter):
            batch = self.next_batch(self.batch_num)
            feed_dict = {img: batch[0],
                         labels: batch[1],
                         model.lrn_rate: lr}
            _, l, ac, summary, lr = sess.run(
                [model.train_op, model.cost, model.acc, merged,
                 model.lrn_rate],
                feed_dict=feed_dict)
            train_writer.add_summary(summary, i)

            # if i % 200 == 0:
            print('step', i + 1)
            print('Training loss', l)
            print('Training accuracy', ac)
            print('Learning rate', lr)

        print('Running evaluation...')
        test_loss, test_acc, n_batch = 0, 0, 0
        for batch in tl.iterate.minibatches(inputs=self.x_valid,
                                            targets=self.y_valid,
                                            batch_size=self.batch_num,
                                            shuffle=False):
            feed_dict_eval = {img: batch[0], labels: batch[1]}
            loss, ac = sess.run([model.cost, model.acc],
                                feed_dict=feed_dict_eval)
            test_loss += loss
            test_acc += ac
            n_batch += 1

        tot_test_loss = test_loss / n_batch
        tot_test_acc = test_acc / n_batch
        print(' Test loss: {}'.format(tot_test_loss))
        print(' Test accuracy: {}'.format(tot_test_acc))

    print('Completed training and evaluation.')

    test_predicted = []
    for batch in tl.iterate.minibatches(inputs=self.x_test,
                                        targets=self.y_test,
                                        batch_size=50,
                                        shuffle=False):
        feed_dict_eval = {img: batch[0]}
        preds = sess.run(model.predict, feed_dict=feed_dict_eval)
        for pred in preds:
            test_predicted.append(pred)

    csv_content = [["ID", "Label"]]
    for ind, data in enumerate(test_predicted):
        csv_content.append([ind + 1, data + 1])
    with open("cifar_prediction.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerows(csv_content)