def train():
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()
    print("Global step:", global_step)
    images, labels = model.distorted_inputs()
    logits = model.inference(images)
    loss = model.loss(logits, labels)
    train_op = model.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)
          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print(format_str % (datetime.now(), self._step, loss_value,
                              examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
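# The functions in this section read FLAGS (train_dir, max_steps, batch_size,
# log_device_placement, ...) without defining them. A minimal sketch of the
# flag setup they assume, using the TF 1.x tf.app.flags API; the default
# values below are illustrative, not taken from the original code.
import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('train_dir', '/tmp/train',
                           'Directory for checkpoints and summaries.')
tf.app.flags.DEFINE_integer('max_steps', 100000, 'Number of batches to run.')
tf.app.flags.DEFINE_integer('batch_size', 128, 'Examples per batch.')
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            'Whether to log op-to-device placement.')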
def tower_loss(scope): """Calculate the total loss on a single tower running the cnn model. Args: scope: unique prefix string identifying the cnn tower, e.g. 'tower_0' Returns: Tensor of shape [] containing the total loss for a batch of data """ images, labels = model.distorted_inputs(args.m, train_chunk_name, args.bs, number_of_labels, NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN, args.d) labels['observed'] = 1 - labels['censored'] # images, labels = calc_at_risk(images, labels, args.m) if args.m == 'image_genome': labels['genomics'] = tf.cast(labels['genomics'], tf.float32) logits = model.inference(images, labels, args.kp, args.m, args.bs) _, risk_diff = model.log_sigmoid_loss(logits, labels, args.bs) losses = tf.get_collection('losses', scope) total_loss = tf.add_n(losses, name='total_loss') for l in losses + [total_loss]: # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training # session. This helps the clarity of presentation on tensorboard. loss_name = re.sub('%s_[0-9]*/' % model.TOWER_NAME, '', l.op.name) tf.summary.scalar(loss_name, l) # "labels" is a dictionary in which the keywords are among: 'idx' for # patients' indexes, 'survival', 'censored', 'observed', 'idh', 'codel', # 'copynum' for copy numbers, 'at_risk' return total_loss, logits, labels, images, risk_diff
def train(): """Train datasets for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for model. images, labels = model.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = model.inference(images) # Calculate loss. loss = model.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = model.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.global_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.summary.merge_all() # Build an initialization operation to run below. init = tf.global_variables_initializer() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement))# log_device_placement=True,该参数表示程序会将运行每一个操作的设备输出到屏幕 sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.summary.FileWriter(FLAGS.train_dir, graph_def=sess.graph_def) for step in range(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train():
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)
    images, labels = model.distorted_inputs()
    logits = model.inference(images)
    loss = model.loss(logits, labels)
    train_op = model.train(loss, global_step)
    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_all_summaries()

    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))

    # Initialize variables first, then (optionally) overwrite them from the
    # latest checkpoint; restoring before running the initializer would be
    # clobbered by it.
    init = tf.initialize_all_variables()
    sess.run(init)

    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    if FLAGS.resume_training and ckpt and ckpt.model_checkpoint_path:
      saver.restore(sess, ckpt.model_checkpoint_path)
      # Recover the step count from the checkpoint filename,
      # e.g. 'model.ckpt-1000' -> 1000.
      current_step = int(
          ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
    else:
      current_step = 0

    tf.train.start_queue_runners(sess=sess)
    summary_writer = tf.train.SummaryWriter(SUMMARY_DIR,
                                            graph_def=sess.graph_def)

    for step in xrange(current_step, FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f '
                      '(%.1f examples/sec; %.3f sec/batch)')
        print(format_str % (datetime.now(), step, loss_value,
                            examples_per_sec, sec_per_batch))

      if step % 50 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
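# The resume logic above parses the step count out of the checkpoint file
# name via string splitting. An equivalent sketch of the same block using
# tf.train.latest_checkpoint, which returns the newest checkpoint path (or
# None) directly (saver, sess, and FLAGS as in the snippet above):
ckpt_path = tf.train.latest_checkpoint(FLAGS.train_dir)
if FLAGS.resume_training and ckpt_path:
  saver.restore(sess, ckpt_path)
  current_step = int(ckpt_path.split('-')[-1])  # 'model.ckpt-1000' -> 1000
else:
  current_step = 0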
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.contrib.framework.get_or_create_global_step() # Get images and labels for CIFAR-10. # Force input pipeline to CPU:0 to avoid operations sometimes ending up on # GPU and resulting in a slow down. with tf.device('/cpu:0'): lefts, disps, confs = model.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. predicted = model.inference(lefts, disps) # Calculate loss. loss = model.loss(predicted, confs) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = model.train(loss, global_step) class _LoggerHook(tf.train.SessionRunHook): """Logs loss and runtime.""" def begin(self): self._step = -1 self._start_time = time.time() def before_run(self, run_context): self._step += 1 return tf.train.SessionRunArgs(loss) # Asks for loss value. def after_run(self, run_context, run_values): if self._step % FLAGS.log_frequency == 0: current_time = time.time() duration = current_time - self._start_time self._start_time = current_time loss_value = run_values.results examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration sec_per_batch = float(duration / FLAGS.log_frequency) format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch)) with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.train_dir, hooks=[ tf.train.StopAtStepHook(last_step=FLAGS.max_steps), tf.train.NanTensorHook(loss), _LoggerHook() ], config=tf.ConfigProto(log_device_placement=FLAGS. log_device_placement)) as mon_sess: while not mon_sess.should_stop(): mon_sess.run(train_op)
def train():
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images, train=True)
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, global_step)
    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_all_summaries()
    init = tf.initialize_all_variables()

    sess = tf.Session()
    sess.run(init)
    tf.train.start_queue_runners(sess=sess)
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    # Resume from the latest checkpoint if one exists.
    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    if ckpt and ckpt.model_checkpoint_path:
      saver.restore(sess, ckpt.model_checkpoint_path)
      print('Restoring from checkpoint')

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss=NaN'

      if step % 100 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        print('step %d loss = %f (%.1f examples/sec, %.3f sec/batch)'
              % (step, loss_value, examples_per_sec, sec_per_batch))
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def tower_loss(scope):
  """Calculate the total loss on a single tower running the model.

  Args:
    scope: unique prefix string identifying the tower, e.g. 'tower_0'

  Returns:
    Tensor of shape [] containing the total loss for a batch of data.
  """
  images, labels = model.distorted_inputs()
  logits = model.inference(images)
  _ = model.loss(logits, labels)
  # model.loss() registers its terms in the 'losses' collection; sum the
  # ones created within this tower's scope.
  losses = tf.get_collection('losses', scope)
  total_loss = tf.add_n(losses, name='total_loss')
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name for clearer TensorBoard display.
    loss_name = re.sub('%s_[0-9]*/' % model.TOWER_NAME, '', l.op.name)
    tf.summary.scalar(loss_name, l)
  return total_loss
def train(use_vgg=False):
  with tf.Graph().as_default():
    global_step = tf.train.get_or_create_global_step()
    images, labels = model.distorted_inputs()
    if use_vgg:
      logits = model_vgg16.inference(images, FLAGS.vgg_model)
      loss = model_vgg16.loss(logits, labels)
    else:
      logits = model.inference(images)
      loss = model.loss(logits, labels)
    train_op = tf.train.MomentumOptimizer(1e-3, momentum=0.9).minimize(
        loss, global_step=global_step)
    saver = tf.train.Saver(tf.all_variables())
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start queue runners.
    tf.train.start_queue_runners(sess=sess)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      if step % 10 == 0:
        examples_per_sec = FLAGS.batch_size / duration
        sec_per_batch = float(duration)
        format_str = ('step %d, loss = %.2f '
                      '(%.1f examples/sec; %.3f sec/batch)')
        print(format_str % (step, loss_value, examples_per_sec,
                            sec_per_batch))

      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        if use_vgg:
          checkpoint_path = os.path.join(FLAGS.train_dir, 'vgg_model.ckpt')
        else:
          checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def evaluate(use_vgg=False):
  with tf.Graph().as_default():
    eval_data = FLAGS.eval_data == 'test'
    # NOTE: this reuses the training input pipeline (with distortions) for
    # evaluation; eval_data is computed but never passed to the input op.
    images, labels = model.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = model.inference(images)

    # Calculate predictions.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    # Restore the moving average version of the learned variables for eval.
    variable_averages = tf.train.ExponentialMovingAverage(
        model.MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)

    while True:
      eval_once(saver, top_k_op, use_vgg)
      if FLAGS.run_once:
        break
      time.sleep(FLAGS.eval_interval_secs)
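# evaluate() above loops over eval_once(), which is not shown in this
# section. A minimal sketch of what it presumably does, following the
# CIFAR-10 tutorial pattern; the signature, FLAGS.num_examples, and the
# unused use_vgg argument are assumptions:
import math
import numpy as np
import tensorflow as tf

def eval_once(saver, top_k_op, use_vgg=False):
  """Restores the latest checkpoint and computes precision@1 once."""
  with tf.Session() as sess:
    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
      print('No checkpoint file found')
      return
    saver.restore(sess, ckpt.model_checkpoint_path)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
      num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
      true_count = 0
      total_sample_count = num_iter * FLAGS.batch_size
      for _ in range(num_iter):
        # top_k_op yields a bool per example; sum counts the correct ones.
        true_count += np.sum(sess.run([top_k_op]))
      print('precision @ 1 = %.3f' % (float(true_count) / total_sample_count))
    finally:
      coord.request_stop()
      coord.join(threads)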
def train():
  """Train aPascal for a number of steps."""
  print('[Training Configuration]')
  print('\tTrain dir: %s' % FLAGS.train_dir)
  print('\tTraining max steps: %d' % FLAGS.max_steps)
  print('\tSteps per displaying info: %d' % FLAGS.display)
  print('\tSteps per testing: %d' % FLAGS.test_interval)
  print('\tSteps per saving checkpoints: %d' % FLAGS.checkpoint_interval)
  print('\tGPU memory fraction: %f' % FLAGS.gpu_fraction)

  with tf.Graph().as_default():
    init_step = 0
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for aPascal.
    train_images, train_labels = model.distorted_inputs('train')
    test_images, test_labels = model.inputs('eval')

    # Build a Graph that computes the predictions from the inference model.
    images = tf.placeholder(tf.float32, [FLAGS.batch_size, model.IMAGE_WIDTH,
                                         model.IMAGE_WIDTH, 3])
    labels = tf.placeholder(tf.int32, [FLAGS.batch_size, model.NUM_ATTRS])
    probs = model.inference(images)

    # Calculate loss (cross-entropy) and accuracy.
    loss, acc = model.loss_acc(probs, labels)
    tf.scalar_summary("accuracy", acc)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op, lr = model.train(loss, global_step)

    # Build the summary operation based on the TF collection of Summaries.
    train_summary_op = tf.merge_all_summaries()

    # Loss and accuracy summaries used in the test phase.
    loss_summary = tf.scalar_summary("test/loss", loss)
    acc_summary = tf.scalar_summary("test/accuracy", acc)
    test_summary_op = tf.merge_summary([loss_summary, acc_summary])

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_fraction),
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables(), max_to_keep=10000)
    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    if ckpt and ckpt.model_checkpoint_path:
      print('\tRestore from %s' % ckpt.model_checkpoint_path)
      # Restores from checkpoint.
      saver.restore(sess, ckpt.model_checkpoint_path)
      init_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
    else:
      print('No checkpoint file found. Starting from scratch.')
      # If finetuning, load all variables except the final prediction
      # layers from the pretrained model.
      if FLAGS.finetune:
        base_variables = tf.trainable_variables()[:-2 * model.NUM_ATTRS]
        base_saver = tf.train.Saver(base_variables, max_to_keep=10000)
        ckpt = tf.train.get_checkpoint_state(FLAGS.pretrained_dir)
        print('Initial checkpoint: ' + ckpt.model_checkpoint_path)
        base_saver.restore(sess, ckpt.model_checkpoint_path)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    if not os.path.exists(FLAGS.train_dir):
      os.mkdir(FLAGS.train_dir)
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

    # Training!!
    for step in xrange(init_step, FLAGS.max_steps):
      start_time = time.time()
      try:
        train_images_val, train_labels_val = sess.run(
            [train_images, train_labels])
        _, lr_value, loss_value, acc_value, train_summary_str = sess.run(
            [train_op, lr, loss, acc, train_summary_op],
            feed_dict={images: train_images_val, labels: train_labels_val})
      except tf.errors.InvalidArgumentError:
        # Drop into an interactive shell to inspect the bad batch
        # (embed comes from IPython).
        embed()
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % FLAGS.display == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: (Training) step %d, loss=%.4f, acc=%.4f, lr=%f '
                      '(%.1f examples/sec; %.3f sec/batch)')
        print(format_str % (datetime.now(), step, loss_value, acc_value,
                            lr_value, examples_per_sec, sec_per_batch))
        summary_writer.add_summary(train_summary_str, step)

      if step % FLAGS.test_interval == 0:
        test_images_val, test_labels_val = sess.run(
            [test_images, test_labels])
        loss_value, acc_value, test_summary_str = sess.run(
            [loss, acc, test_summary_op],
            feed_dict={images: test_images_val, labels: test_labels_val})
        format_str = '%s: (Test) step %d, loss=%.4f, acc=%.4f'
        print(format_str % (datetime.now(), step, loss_value, acc_value))
        summary_writer.add_summary(test_summary_str, step)

      # Save the model checkpoint periodically.
      if step % FLAGS.checkpoint_interval == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
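# Design note on the loop above: each step first fetches a batch from the
# queue ops (train_images/train_labels) and then feeds it back in through
# placeholders. That costs an extra host round-trip per step, but it lets a
# single inference graph serve both the train and the test input pipelines
# through the same placeholders.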
def train(): """Train FSRCNN for a number of steps.""" os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to count the number of train() calls. This equals the # number of batches processed * FLAGS.num_gpus. global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) # Create an optimizer that performs gradient descent. opt = tf.train.MomentumOptimizer(FLAGS.lr, 0.9) # Determine number of GPUs to use. num_gpus = len(FLAGS.gpu.split(',')) # Get images and labels for FSRCNN. images, labels = model.distorted_inputs() batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue( [images, labels], capacity=2 * num_gpus) # Calculate the gradients for each model tower. tower_grads = [] with tf.variable_scope(tf.get_variable_scope()): for i in xrange(num_gpus): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % (model.TOWER_NAME, i)) as scope: # Dequeues one batch for the GPU image_batch, label_batch = batch_queue.dequeue() # Calculate the loss for one tower of the model. This function # constructs the entire model but shares the variables across # all towers. loss = tower_loss(scope, image_batch, label_batch) # Reuse variables for the next tower. tf.get_variable_scope().reuse_variables() # Retain the summaries from the final tower. summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) # Calculate the gradients for the batch of data on this tower. if FLAGS.quantize: tensor_list = tf.get_collection('quantize', scope) var_list = tf.trainable_variables() grads = zip(tf.gradients(loss, tensor_list), var_list) else: grads = opt.compute_gradients(loss) # Keep track of the gradients across all towers. tower_grads.append(grads) # We must calculate the mean of each gradient. Note that this is the # synchronization point across all towers. grads = average_gradients(tower_grads) # Add a summary to track the learning rate. summaries.append(tf.summary.scalar('learning_rate', FLAGS.lr)) # Apply the gradients to adjust the shared variables. apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) # Add histograms for trainable variables. for var in tf.trainable_variables(): summaries.append(tf.summary.histogram(var.op.name, var)) # Group all updates to into a single train op. train_op = tf.group(apply_gradient_op) # Create a saver. saver = tf.train.Saver(tf.global_variables()) # Build the summary operation from the last tower summaries. summary_op = tf.summary.merge(summaries) # summary_op = tf.summary.merge_all() # Build an initialization operation to run below. init = tf.global_variables_initializer() global_step = 0 # Start running operations on the Graph. allow_soft_placement must be set to # True to build towers on GPU, as some of the ops do not have GPU # implementations. config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement) config.gpu_options.per_process_gpu_memory_fraction = 0.8 # config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(init) # Restore the model if reload is True. if FLAGS.reload: ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) if ckpt and ckpt.model_checkpoint_path: # Restores from checkpoint saver.restore(sess, ckpt.model_checkpoint_path) # Assuming model_checkpoint_path looks something like: # /my-favorite-path/cifar10_train/model.ckpt-0, # extract global_step from it. global_step = ckpt.model_checkpoint_path.split('/')[-1].split( '-')[-1] # Start the queue runners. 
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(int(global_step), FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size * num_gpus
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = duration / num_gpus

        format_str = ('%s: step %d, loss = %.4f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), step, loss_value,
                            examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
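# The multi-GPU train() above calls average_gradients(), which is not shown
# in this section. A minimal sketch of the usual implementation (the
# standard CIFAR-10 multi-tower helper; treat it as an assumption about
# what the missing function does):
import tensorflow as tf

def average_gradients(tower_grads):
  """Averages gradients across towers.

  Args:
    tower_grads: list over towers of lists of (gradient, variable) pairs.

  Returns:
    List of (gradient, variable) pairs where each gradient is the mean of
    the corresponding per-tower gradients.
  """
  average_grads = []
  for grad_and_vars in zip(*tower_grads):
    # grad_and_vars is ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN)).
    grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
    grad = tf.reduce_mean(tf.concat(grads, 0), 0)
    # Variables are shared across towers, so the first tower's variable
    # stands in for all of them.
    average_grads.append((grad, grad_and_vars[0][1]))
  return average_grads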