def tower_loss(scope):
    """Build one CIFAR tower and return its total loss.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'.

    Returns:
      Tensor of shape [] holding the total loss for a batch of data. The
      returned tensor carries a control dependency on the moving-average
      update, so evaluating it also refreshes the averaged losses.
    """
    # Input pipeline and forward pass for this tower.
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)

    # The return value is discarded on purpose: the per-tower total is
    # re-assembled below from the 'losses' collection (presumably populated
    # by cifar10.loss -- verify against the cifar10 module).
    cifar10.loss(logits, labels)

    tower_losses = tf.get_collection('losses', scope)
    summed = tf.add_n(tower_losses, name='total_loss')
    tracked = tower_losses + [summed]

    # Moving averages of every individual loss and of the total.
    ema = tf.train.ExponentialMovingAverage(0.9, name='avg')
    ema_update = ema.apply(tracked)

    # Strip 'tower_[0-9]/' from summary names so that multi-GPU runs share
    # one set of tags on TensorBoard.
    tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
    for tensor in tracked:
        tag = re.sub(tower_prefix, '', tensor.op.name)
        # The raw value gets the ' (raw)' suffix; the averaged value keeps
        # the original loss name.
        tf.scalar_summary(tag +' (raw)', tensor)
        tf.scalar_summary(tag, ema.average(tensor))

    with tf.control_dependencies([ema_update]):
        return tf.identity(summed)
def train():
    """Train CIFAR-10 for a number of steps.

    Every 10 steps a checkpoint is written to FLAGS.train_dir and the loss,
    the top-1 accuracy over the batches seen since the previous report, and
    the duration of the current step are printed.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        saver = tf.train.Saver(tf.all_variables())
        init = tf.initialize_all_variables()
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)

        true_count = 0  # correct predictions since the last report
        seen_count = 0  # examples evaluated since the last report
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value, precisions = sess.run([train_op, loss, top_k_op])
            true_count += np.sum(precisions)
            seen_count += np.size(precisions)

            if step % 10 == 0:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
                duration = time.time() - start_time
                # Divide by the number of examples actually counted (only one
                # batch at step 0) and force float division so the ratio is
                # not truncated to 0 under Python 2.
                accuracy = float(true_count) / max(seen_count, 1)
                print(' step %d, loss = %.3f, acc = %.3f, dur = %.2f' %
                      (step, loss_value, accuracy, duration))
                true_count = 0
                seen_count = 0
def tower_loss(scope, images, labels):
    """Build one CIFAR tower on the given batch and return its total loss.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'.
      images: Images. 4D tensor of shape [batch_size, height, width, 3].
      labels: Labels. 1D tensor of shape [batch_size].

    Returns:
      Tensor of shape [] containing the total loss for a batch of data.
    """
    logits = cifar10.inference(images)

    # Return value intentionally ignored; the tower total is rebuilt from the
    # 'losses' collection filtered by this tower's scope.
    cifar10.loss(logits, labels)

    tower_losses = tf.get_collection('losses', scope)
    summed = tf.add_n(tower_losses, name='total_loss')

    # One scalar summary per individual loss plus the total. The
    # 'tower_[0-9]/' prefix is removed so multi-GPU runs share tags.
    tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
    for tensor in tower_losses + [summed]:
        tf.summary.scalar(re.sub(tower_prefix, '', tensor.op.name), tensor)

    return summed
def train():
    """Train CIFAR-10 for FLAGS.max_steps steps with periodic reporting.

    Every 10 steps the loss is printed, a checkpoint is saved and
    cifar10.accuracy is invoked on the test predictions; every 100 steps the
    merged summaries are written to FLAGS.train_dir.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Training and evaluation input pipelines.
        images, labels = cifar10.distorted_inputs()
        testImg, testlabels = cifar10.inputs(eval_data=True)

        # Forward passes: one for training, one for the test inputs.
        logits = cifar10.inference(images)
        test_pre = cifar10.inference(testImg,test=True)

        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()
        init = tf.initialize_all_variables()

        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_config)
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time  # measured but not reported

            on_tenth_step = step % 10 == 0
            is_final_step = (step + 1) == FLAGS.max_steps

            if on_tenth_step:
                print ('loss '+str(loss_value))

            if step % 100 == 0:
                summary_writer.add_summary(sess.run(summary_op), step)

            # Save the model checkpoint periodically.
            if on_tenth_step or is_final_step:
                saver.save(sess,
                           os.path.join(FLAGS.train_dir, 'model.ckpt'),
                           global_step=step)

            # Evaluation hook.
            # NOTE(review): the result of cifar10.accuracy is discarded and the
            # call happens inside the loop -- confirm it reports by itself and
            # does not add new ops to the graph on every call.
            if on_tenth_step:
                cifar10.accuracy(test_pre,testlabels)
def train():
    """Train CIFAR-10 for a number of steps.

    Prints throughput every 10 steps, writes summaries every 100 steps and
    saves a checkpoint every 1000 steps and on the final step.

    Raises:
      AssertionError: if the loss becomes NaN.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        # Parenthesised so both literals are unambiguously one format string,
        # however the statement is wrapped.
        format_str = ("%s: step %d, loss = %.2f (%.1f examples/sec; %.3f "
                      "sec/batch)")

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), "Model diverged with loss = NaN"

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)
def train():
    """Train CIFAR-10 until the global step reaches FLAGS.max_steps.

    Runs inside a MonitoredTrainingSession that checkpoints to
    FLAGS.train_dir, stops on a NaN loss, and logs loss and throughput every
    FLAGS.log_frequency steps.
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Pin the input pipeline to CPU:0 so its ops never land on the GPU,
        # which would slow training down.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Request the loss value alongside whatever is being run.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return

                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)

                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print (template % (datetime.now(), self._step, loss_value,
                                   examples_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    The network is MyModel when tfFLAGS.network == 1 and MyModel2 otherwise.
    An L2 penalty (weight 5e-4) on the two fully connected layers is
    registered in the 'losses' collection so that it is part of the total.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'.

    Returns:
      Tensor of shape [] containing the total loss for a batch of data.
    """
    # Both networks share the same input pipeline; only the model differs.
    model = MyModel if tfFLAGS.network == 1 else MyModel2
    images, labels = cifar10.distorted_inputs()
    logits, fc1_w, fc1_b, fc2_w, fc2_b = model.inference(images)

    # Build the data loss exactly once. The return value is ignored because
    # the total is assembled from the 'losses' collection below (presumably
    # filled by cifar10.loss -- a second call would register it twice).
    _ = cifar10.loss(logits, labels)

    # L2 regularization for the fully connected parameters. It has to be put
    # into the 'losses' collection; adding it to a local tensor that is never
    # returned would leave it out of total_loss.
    regularizers = (tf.nn.l2_loss(fc1_w) + tf.nn.l2_loss(fc1_b) +
                    tf.nn.l2_loss(fc2_w) + tf.nn.l2_loss(fc2_b))
    fc_weight_decay = tf.identity(5e-4 * regularizers, name='fc_weight_decay')
    # NOTE(review): the scope filter below matches on op names, so this
    # assumes tower_loss is called inside the tower's name scope -- confirm.
    tf.add_to_collection('losses', fc_weight_decay)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss.
    # 'tower_[0-9]/' is removed from the name so that multi-GPU sessions
    # share tags on TensorBoard.
    tower_prefix = '%s_[0-9]*/' % tfFLAGS.TOWER_NAME
    for l in losses + [total_loss]:
        loss_name = re.sub(tower_prefix, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss
def tower_loss(scope, images, labels):
    """Build one CIFAR tower, add accuracy summaries and return its loss.

    Besides the loss summaries, two scalar summaries are attached:
    "acc_batch" (top-1 hits divided by FLAGS.batch_size) and "acc_1off"
    (predictions within one class index of the label, divided by
    FLAGS.batch_size).

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'.
      images: Images. 4D tensor of shape [batch_size, height, width, 3].
      labels: Labels. 1D tensor of shape [batch_size].

    Returns:
      Tensor of shape [] containing the total loss for a batch of data.
    """
    # Debug output emitted while the graph is being built.
    print('>>>>> input original = ', images)

    # Forward pass; the meaning of the second argument (0.5) is defined by
    # cifar10.inference -- presumably a dropout keep probability, TODO confirm.
    logits = cifar10.inference(images, 0.5)

    # Return value ignored; the tower total is rebuilt from the collection.
    cifar10.loss(logits, labels)
    tower_losses = tf.get_collection('losses', scope)

    # Top-1 accuracy for the batch.
    top1_hits = tf.nn.in_top_k(logits, labels, 1)
    top1_hits = tf.cast(top1_hits, tf.int32)
    hit_count = tf.reduce_sum(top1_hits)
    tf.summary.scalar(name="acc_batch", tensor=hit_count / FLAGS.batch_size)

    # 1-off accuracy for the batch: the predicted class index may differ from
    # the label by at most one.
    print('labels = ', labels)
    labels = tf.cast(labels, tf.int64)
    predicted = tf.argmax(logits, axis=-1)
    print('>> argmaxlogits = ', predicted)
    within_one = (tf.abs(labels - predicted) <= 1)
    within_one_count = tf.reduce_sum(tf.cast(within_one, tf.int64))
    tf.summary.scalar(name="acc_1off",
                      tensor=within_one_count / FLAGS.batch_size)

    # Total loss for the current tower.
    summed = tf.add_n(tower_losses, name='total_loss')

    # One scalar summary per individual loss plus the total, with the
    # 'tower_[0-9]/' prefix removed for readability on TensorBoard.
    tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
    for tensor in tower_losses + [summed]:
        tf.summary.scalar(re.sub(tower_prefix, '', tensor.op.name), tensor)

    return summed
def train():
    """Train CIFAR-10 until the global step reaches FLAGS.max_steps.

    Training runs in a MonitoredTrainingSession that checkpoints to
    FLAGS.train_dir, aborts on a NaN loss and logs loss and throughput every
    FLAGS.log_frequency steps.
    """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Inputs, forward pass, loss and training op.
        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Ask the session to also return the loss value.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return

                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)

                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print(template % (datetime.now(), self._step, loss_value,
                                  examples_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """Train the CIFAR-10 model built with the module-level `low_ranks`.

    Prints every trainable variable while building the graph, then trains in
    a MonitoredTrainingSession that checkpoints to FLAGS.train_dir, stops at
    FLAGS.max_steps or on a NaN loss, and logs every FLAGS.log_frequency
    steps.
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Keep the input pipeline on the CPU.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Forward pass and loss.
        logits = cifar10.inference(images=images,r=low_ranks)
        loss = cifar10.loss(logits=logits, labels=labels)

        train_op = cifar10.train(loss, global_step)

        for variable in tf.trainable_variables():
            print(variable)

        # Count of non-zero entries in the last 'sparse_components' tensor.
        # The op is created here but not fetched anywhere in this function.
        last_sparse = tf.get_collection('sparse_components')[-1]
        nonzero = tf.count_nonzero(last_sparse)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return

                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                example_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)

                template = ('%s: step %d, loss = %.6f (%.1f examples/sec;'
                            '%.3f sec/batch)')
                print(template % (datetime.now(), self._step, loss_value,
                                  example_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """Train CIFAR-10 for a number of steps.

    Trains in a MonitoredTrainingSession (4 inter-op threads) that
    checkpoints to FLAGS.train_dir and stops at FLAGS.max_steps or on a NaN
    loss. The per-step duration is logged every 10 steps and the current
    step is mirrored into the module-level global `step_no`.
    """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                global step_no

                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print (format_str % (datetime.now(), self._step, loss_value,
                                         examples_per_sec, sec_per_batch))

                # Publish the step counter for code outside this hook.
                # NOTE(review): assumed to be updated on every step rather
                # than only on logged steps -- confirm against the callers.
                step_no = self._step

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                       tf.train.NanTensorHook(loss),
                       _LoggerHook()],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement,
                    inter_op_parallelism_threads=4,
                    intra_op_parallelism_threads=0)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)

    # Disabled tracing experiment. It used to sit in a triple-quoted string
    # that was never closed, which made the module unparseable.
    # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
def train():
    """Train CIFAR-10 with gradient computation and application split in two.

    The gradients produced by cifar10.train_part1 are fetched with a partial
    run, then fed back through placeholders into the update op produced by
    cifar10.train_part2. Ten such iterations are attempted.
    """
    g1 = tf.Graph()
    with g1.as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()
        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()
        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)
        # Calculate loss.
        loss = cifar10.loss(logits, labels)
        # grads is iterated as (gradient, variable) pairs below.
        grads = cifar10.train_part1(loss, global_step)
        only_gradients = [g for g,_ in grads]
        only_vars = [v for _,v in grads]  # not used further in this function
        # One placeholder per gradient, paired with the variable it updates.
        placeholder_gradients = []
        #with tf.device("/gpu:0"):
        for grad_var in grads :
            placeholder_gradients.append((tf.placeholder('float', shape=grad_var[0].get_shape()) ,grad_var[1]))
        # Initial feed: an all-zero array for every gradient placeholder.
        feed_dict = {}
        for i,grad_var in enumerate(grads):
            feed_dict[placeholder_gradients[i][0]] = np.zeros(placeholder_gradients[i][0].shape)
        train_op = cifar10.train_part2(global_step,placeholder_gradients)
        # NOTE(review): no queue runners are started in this function; if
        # cifar10.distorted_inputs is queue-based the first run may block --
        # confirm. The session is also never closed.
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        # Collect the placeholders that the partial run is allowed to feed.
        feeds = []
        print("Reached here")
        for i,grad_var in enumerate(grads):
            feeds.append(placeholder_gradients[i][0])
        # Partial Run
        print("Reached here", len(feeds))
        for x in feeds:
            print(x,)
        h = sess.partial_run_setup([only_gradients, train_op], feeds)
        print("Reached here")
        # NOTE(review): the single handle `h` is reused for all 10 iterations
        # and the placeholders are fed in both partial_run calls -- confirm
        # that partial_run allows a handle's feeds/fetches to be used more
        # than once. The inner loop below also rebinds `i`.
        for i in xrange(10):
            res_grads = sess.partial_run(h, only_gradients, feed_dict = feed_dict)
            # Feed the freshly computed gradients into the update step.
            feed_dict = {}
            for i,grad_var in enumerate(res_grads):
                feed_dict[placeholder_gradients[i][0]] = res_grads[i]
            res_train_op = sess.partial_run(h, train_op, feed_dict=feed_dict)
def train():
    """Train CIFAR-10 in chunks, evaluating after every chunk.

    For each milestone in range(0, FLAGS.max_steps + 1, FLAGS.log_frequency)
    a MonitoredTrainingSession trains up to that step, after which
    cifar10_eval.evaluate() and evaluate() are called.
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Pin the input pipeline to CPU:0 so its ops never land on the GPU,
        # which would slow training down.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Requests the loss on each run and tracks the interval start."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Request the loss value alongside whatever is being run.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                # Only the interval start is refreshed; nothing is printed.
                if self._step % FLAGS.log_frequency == 0:
                    self._start_time = time.time()

        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        milestones = range(0, FLAGS.max_steps + 1, FLAGS.log_frequency)

        for step in milestones:
            print(str(step))
            session_hooks = [
                tf.train.StopAtStepHook(last_step=step),
                tf.train.NanTensorHook(loss),
                _LoggerHook(),
            ]
            with tf.train.MonitoredTrainingSession(
                    checkpoint_dir=FLAGS.train_dir,
                    hooks=session_hooks,
                    config=session_config) as mon_sess:
                while not mon_sess.should_stop():
                    mon_sess.run(train_op)

            # evaluate test data
            cifar10_eval.evaluate()
            # evaluate train data
            evaluate()
def train():
    """Train CIFAR-10 until the global step reaches FLAGS.max_steps.

    Training runs in a MonitoredTrainingSession that checkpoints to
    FLAGS.train_dir, aborts on a NaN loss and logs loss and throughput every
    FLAGS.log_frequency steps.
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Fetch the CIFAR-10 images and labels. The input pipeline is kept on
        # the CPU so the GPU is not left waiting for data.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Predictions and loss.
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)

        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return

                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)

                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print(template % (datetime.now(), self._step, loss_value,
                                  examples_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def tower_loss(scope):
    """Build one CIFAR tower and return its total loss.

    A separate network with an identical structure is built for every GPU;
    `scope` identifies the tower being built.

    Args:
      scope: unique prefix string identifying the tower.

    Returns:
      Tensor holding the sum of all entries of the 'losses' collection that
      belong to `scope`.
    """
    # `distorted_inputs` is the name used everywhere else in this file; the
    # former `distored_inputs` spelling would raise AttributeError.
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)

    # Return value ignored; the tower total is rebuilt from the collection.
    _ = cifar10.loss(logits, labels)

    losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(losses, name='total_loss')
    return total_loss
def train():
    """Train CIFAR-10 until the global step reaches FLAGS.max_steps.

    Training runs in a MonitoredTrainingSession that checkpoints to
    FLAGS.train_dir, aborts on a NaN loss and logs loss and throughput every
    FLAGS.log_frequency steps.
    """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Read images and labels from the data set.
        images, labels = cifar10.distorted_inputs()

        # Graph that computes the model's predictions.
        logits = cifar10.inference(images)

        # Loss.
        loss = cifar10.loss(logits, labels)

        # Graph that trains the model on one batch and updates the parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Ask the session to also return the loss value.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return

                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)

                template = (
                    '%s: step %d, loss = %.5f (%.1f examples/sec; %.3f sec/batch)'
                )
                print(template % (datetime.now(), self._step, loss_value,
                                  examples_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def run_training():
    """Train the CIFAR-10 model with the customized (L-BFGS) optimizer.

    The model is built on placeholders; the optimizer interface receives the
    queue-based data tensors and the placeholders they are fed into. Any
    exception raised while training is handed to the coordinator and
    re-raised when the queue-runner threads are joined.
    """
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        global_step = tf.Variable(0, trainable=False)

        # Keep the input pipeline on the CPU.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size, cifar10.IMAGE_SIZE)

        logits = cifar10.inference(images_placeholder)
        losses_dict = cifar10.loss(logits, labels_placeholder)
        moving_averages_op = cifar10.add_summaries_and_moving_avgs(
            losses_dict, global_step)

        lbfgs_optimizer = customized_optimizer.CustomizedOptimizerInterface(
            global_step=global_step,
            loss_dict=losses_dict,
            data_fetches=[images, labels],
            data_placeholders=(images_placeholder, labels_placeholder),
            maxiter=FLAGS.max_steps)

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=25)
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()

        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.4))

        with tf.Session(config=session_config) as sess:
            sess.run(init)

            coordinator = tf.train.Coordinator()
            # Pre-initialised so the join below cannot hit an unbound name if
            # starting the queue runners itself fails.
            threads = []
            try:
                threads = tf.train.start_queue_runners(sess=sess,
                                                       coord=coordinator)
                lbfgs_optimizer.minimize(session=sess,
                                         moving_averages_op=moving_averages_op,
                                         summary_op=summary_op,
                                         saver=saver,
                                         step_callback=step_callback)
            except Exception as e:
                # Recorded on the coordinator; join() re-raises it.
                coordinator.request_stop(e)
            finally:
                # Always stop and reap the input threads, also on interrupts.
                coordinator.request_stop()
                coordinator.join(threads, stop_grace_period_secs=10)
def train():
    """Train CIFAR-10 until the global step reaches FLAGS.max_steps.

    Training runs in a MonitoredTrainingSession that checkpoints to
    FLAGS.train_dir, aborts on a NaN loss and logs loss and throughput every
    FLAGS.log_frequency steps.
    """
    # Build everything in a fresh graph that is the default inside the block.
    with tf.Graph().as_default():
        # Step counter that is fetched from, or added to, the current graph.
        global_step = tf.train.get_or_create_global_step()

        # Keep the input pipeline on the CPU.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Inference model.
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(
                    self,
                    run_context,  # pylint: disable=unused-argument
                    run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    example_per_sec = (
                        FLAGS.log_frequency * FLAGS.batch_size / duration)
                    sec_per_batch = float(duration / FLAGS.log_frequency)
                    format_str = (
                        '%s: step %d,loss=%.2f (%.1f example/sec);%.3f sec/batch'
                    )
                    # Interpolate the values; printing the bare template would
                    # only emit the literal '%s: step %d,...' text.
                    print(format_str % (datetime.now(), self._step,
                                        loss_value, example_per_sec,
                                        sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement)
        ) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """Train the fixed-point CIFAR-10 model for FLAGS.max_steps steps.

    Summaries are written on every step to FLAGS.log_dir + '/train', a
    checkpoint is saved every 10 steps, and every 5 steps the fixed-point
    conversion parameters are updated and the loss is printed. Both summary
    writers are closed when training ends, including on error.
    """
    sess = tf.InteractiveSession()
    #sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    # Get images and labels for CIFAR-10.
    with tf.device('/cpu:0'):
        images, labels = cifar10.distorted_inputs()

    # Define all the fixed point variables we will be using later
    cifar10.initialize_fix_point_variables()

    # Build a Graph that computes the logits predictions from the inference model
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, 0.05)

    # Update fixed point conversion parameters when needed
    update_fix_pt_ops = cifar10.update_fix_point_accuracy()

    # Merge all the summaries and write them out to FLAGS.log_dir
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')

    try:
        # init all variables
        tf.global_variables_initializer().run()

        # create a saver for checkpoints
        saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        # needed on interactive session so it doesn't hang
        tf.train.start_queue_runners()

        for i in range(FLAGS.max_steps):
            summary, _ = sess.run([merged_summary, train_op])
            train_writer.add_summary(summary, i)

            if i % 10 == 0:
                saver.save(sess, FLAGS.log_dir + '/checkpoint', global_step=i)

            if i % 5 == 0:
                sess.run([update_fix_pt_ops])
                # loss.eval() runs the graph again on the default session.
                print('Step: %s, Loss: %s' % (i, loss.eval()))
    finally:
        # Flush and release both event files; test_writer was previously
        # never closed and train_writer only on the success path.
        train_writer.close()
        test_writer.close()
def train():
    """Train CIFAR-10 for a number of steps.

    Prints throughput every 10 steps, writes summaries every 100 steps and
    saves a checkpoint every 1000 steps and on the final step.

    Raises:
      AssertionError: if the loss becomes NaN.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below. This is the
        # non-deprecated counterpart of initialize_all_variables, matching
        # the tf.global_variables / tf.summary API used in this function.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        # Pass the Graph itself; the graph_def keyword is deprecated.
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=sess.graph)

        format_str = (
            '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
            'sec/batch)')

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
      Tensor of shape [] containing the total loss for a batch of data. It
      carries a control dependency on the moving-average update, so
      evaluating it also refreshes the averaged losses.
    """
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build inference Graph.
    logits = cifar10.inference(images)

    # Build the portion of the Graph calculating the losses. The return value
    # is ignored because the total is assembled from the collection below.
    _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Compute the moving average of all individual losses and the total loss.
    # apply() is called exactly once: the earlier extra calls were debugging
    # leftovers whose update ops were discarded.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    loss_averages_op = loss_averages.apply(losses + [total_loss])

    # Attach a scalar summary to all individual losses and the total loss; do
    # the same for the averaged version of the losses.
    tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
        # training session. This helps the clarity of presentation on
        # tensorboard.
        loss_name = re.sub(tower_prefix, '', l.op.name)
        # Name each loss as '(raw)' and name the moving average version of
        # the loss as the original loss name.
        tf.summary.scalar(loss_name + ' (raw)', l)
        tf.summary.scalar(loss_name, loss_averages.average(l))

    with tf.control_dependencies([loss_averages_op]):
        total_loss = tf.identity(total_loss)
    return total_loss
def train():
    """Run the CIFAR-10 training loop.

    Builds inputs, model, loss and train op in a new graph, then executes
    ``FLAGS.max_steps`` steps: a console log every 10 steps, summaries every
    100 steps, and a checkpoint every 1000 steps and on the last step.

    Raises:
        AssertionError: if the fetched loss value is NaN.
    """
    with tf.Graph().as_default():
        step_var = tf.Variable(0, trainable=False)

        batch_images, batch_labels = cifar10.distorted_inputs()
        batch_logits = cifar10.inference(batch_images)
        loss_op = cifar10.loss(batch_logits, batch_labels)
        update_op = cifar10.train(loss_op, step_var)

        checkpointer = tf.train.Saver(tf.all_variables())
        merged_summaries = tf.merge_all_summaries()
        init_op = tf.initialize_all_variables()

        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        session = tf.Session(config=session_config)
        session.run(init_op)

        tf.train.start_queue_runners(sess=session)
        writer = tf.train.SummaryWriter(FLAGS.train_dir, session.graph)

        log_template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')

        for step in xrange(FLAGS.max_steps):
            tic = time.time()
            _, loss_value = session.run([update_op, loss_op])
            elapsed = time.time() - tic

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                throughput = FLAGS.batch_size / elapsed
                print(log_template % (datetime.now(), step, loss_value,
                                      throughput, float(elapsed)))

            if step % 100 == 0:
                writer.add_summary(session.run(merged_summaries), step)

            # Save the model checkpoint periodically.
            # NOTE(review): checkpoints go under FLAGS.train_model while the
            # summaries above go under FLAGS.train_dir -- confirm that the
            # train_model flag exists and that the split is intentional.
            on_final_step = (step + 1) == FLAGS.max_steps
            if step % 1000 == 0 or on_final_step:
                target = os.path.join(FLAGS.train_model, 'model.ckpt')
                checkpointer.save(session, target, global_step=step)
def evaluate():
    """Evaluate CIFAR-10, either once or repeatedly.

    Builds the evaluation graph (inputs, logits, top-1 op and loss) and calls
    ``eval_once`` in a loop. The loop ends after the first pass when
    ``FLAGS.run_once`` is set; otherwise it sleeps
    ``FLAGS.eval_interval_secs`` seconds between passes.
    """
    with tf.Graph().as_default() as g:
        # The 'test' split is selected by the eval_data flag.
        use_test_split = FLAGS.eval_data == 'test'
        images, labels = cifar10.inputs(eval_data=use_test_split)

        logits = cifar10.inference(images)
        top_k_op = tf.nn.in_top_k(logits, labels, 1)
        loss = cifar10.loss(logits, labels)

        saver = tf.train.Saver(tf.all_variables())

        keep_evaluating = True
        while keep_evaluating:
            eval_once(saver, top_k_op, loss)
            keep_evaluating = not FLAGS.run_once
            if keep_evaluating:
                time.sleep(FLAGS.eval_interval_secs)
def __init__(self):
    """Load the training arrays and build the training graph and session.

    Changes the working directory to the module-level ``pwd``, reads
    ``images`` and ``labels`` from ``trainingdata.npz``, builds placeholders,
    model, loss and a gradient-descent train op with an exponentially decayed
    learning rate, then creates and initializes a session.
    """
    global pwd
    os.chdir(pwd)

    archive = np.load('trainingdata.npz')
    self.images = archive['images']
    self.labels = archive['labels']
    self.counter = 0
    self.session = None

    with tf.Graph().as_default():
        step_var = tf.contrib.framework.get_or_create_global_step()

        # Placeholders are generated from the first image / label entry.
        self.images_ph = gen_ph(self.images[0], name='images_ph')
        self.labels_ph = gen_ph(self.labels[0], name='labels_ph')

        logits = cifar10.inference_divided(self.images_ph)
        self.loss = cifar10.loss(logits, self.labels_ph)

        # Learning-rate schedule: staircase exponential decay.
        batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                             FLAGS.batch_size)
        steps_per_decay = int(batches_per_epoch *
                              cifar10.NUM_EPOCHS_PER_DECAY)
        learning_rate = tf.train.exponential_decay(
            cifar10.INITIAL_LEARNING_RATE,
            step_var,
            steps_per_decay,
            cifar10.LEARNING_RATE_DECAY_FACTOR,
            staircase=True)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        grads_and_vars = optimizer.compute_gradients(self.loss)
        self.train_op = optimizer.apply_gradients(grads_and_vars,
                                                  global_step=step_var)

        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        self.mon_sess = tf.Session(config=session_config)
        self.mon_sess.run(tf.global_variables_initializer())

        # Writes the graph under "tb"; the writer object is not retained.
        tf.summary.FileWriter("tb", self.mon_sess.graph)

        self.variable = ray.experimental.TensorFlowVariables(
            self.loss, self.mon_sess)
def train():
    """Train the CIFAR-10 model for ``FLAGS.max_steps`` steps.

    Logs to the console every 10 steps, writes summaries every 100 steps and
    saves a checkpoint every 1000 steps and on the final step, all under
    ``FLAGS.train_dir``.

    Raises:
        AssertionError: if the fetched loss value is NaN.
    """
    with tf.Graph().as_default():
        step_counter = tf.Variable(0, trainable=False)

        inputs, targets = cifar10.distorted_inputs()
        outputs = cifar10.inference(inputs)
        loss_tensor = cifar10.loss(outputs, targets)
        optimize_op = cifar10.train(loss_tensor, step_counter)

        model_saver = tf.train.Saver(tf.all_variables())
        all_summaries = tf.merge_all_summaries()
        init_op = tf.initialize_all_variables()

        run_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        session = tf.Session(config=run_config)
        session.run(init_op)

        tf.train.start_queue_runners(sess=session)

        event_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                              graph_def=session.graph_def)

        report = ("%s: step %d, loss = %.2f "
                  "(%.1f examples/sec; %.3f sec/batch)")

        for step in xrange(FLAGS.max_steps):
            began = time.time()
            _, loss_value = session.run([optimize_op, loss_tensor])
            took = time.time() - began

            assert not np.isnan(loss_value), "Model diverged with loss = NaN"

            if step % 10 == 0:
                rate = FLAGS.batch_size / took
                print(report % (datetime.now(), step, loss_value, rate,
                                float(took)))

            if step % 100 == 0:
                event_writer.add_summary(session.run(all_summaries), step)

            is_last = (step + 1) == FLAGS.max_steps
            if step % 1000 == 0 or is_last:
                ckpt_file = os.path.join(FLAGS.train_dir, "model.ckpt")
                model_saver.save(session, ckpt_file, global_step=step)
def tower_loss(scope):
    """Build one tower of the CIFAR model and return its summed loss.

    Args:
        scope: name-scope prefix used to filter the 'losses' collection.

    Returns:
        The tensor produced by ``tf.add_n`` over the losses collected under
        ``scope``.
    """
    # Fetch the augmented (distorted) images and their labels.
    images, labels = cifar10.distorted_inputs()

    # Build the convolutional network (per the original note, the model
    # parameters are shared across GPUs).
    logits = cifar10.inference(images)

    # Build the loss ops; the return value is not needed because the losses
    # are read back from the graph collection below.
    cifar10.loss(logits, labels)

    # Gather the losses that belong to the current tower and sum them.
    tower_losses = tf.get_collection('losses', scope)
    return tf.add_n(tower_losses, name='total_loss')
def train():
    """Build the CIFAR-10 loss graph and print the loss of successive batches.

    Only the loss tensor is fetched; no training op is built here. The loop
    runs ``FLAGS.max_steps`` times.
    """
    with tf.Graph().as_default():
        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)

        init_op = tf.global_variables_initializer()
        session = tf.Session()
        session.run(init_op)

        tf.train.start_queue_runners(sess=session)

        for _ in xrange(FLAGS.max_steps):
            # run() on a one-element fetch list returns a one-element list.
            fetched = session.run([loss])
            print(fetched[0])
def train():
    """Train CIFAR-10 for ``FLAGS.max_steps`` steps with GPU memory growth.

    Progress is printed every 5 steps and summaries are written to
    ``FLAGS.train_dir`` every 100 steps. No checkpoint is saved by this
    function.

    Raises:
        AssertionError: if the fetched loss value is NaN.
    """
    with tf.Graph().as_default():
        images, labels = cifar10.distorted_inputs()
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)

        global_step = tf.Variable(0, trainable=False)
        train_op = cifar10.train(loss, global_step=global_step)

        # Use the tf.summary.* names: this function already relies on
        # tf.global_variables_initializer, and the older
        # tf.merge_all_summaries / tf.train.SummaryWriter names were removed
        # in TensorFlow 1.0.
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()

        # Allocate GPU memory on demand instead of reserving it all up front.
        memlim = tf.ConfigProto()
        memlim.gpu_options.allow_growth = True
        sess = tf.Session(config=memlim)
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value)

            if step % 5 == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                )
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
def tower_loss(scope, images, labels):
    """Compute the total loss for a single tower running the CIFAR model.

    Args:
        scope: unique prefix string identifying the CIFAR tower,
            e.g. 'tower_0'.
        images: image batch passed to ``cifar10.inference``.
        labels: label batch passed to ``cifar10.loss``.

    Returns:
        The total loss for one batch of data.
    """
    logits = cifar10.inference(images)
    cifar10.loss(logits, labels)

    # Collect every entry of the 'losses' collection that belongs to the
    # current tower, as a list.
    tower_losses = tf.get_collection('losses', scope)

    # tf.add_n sums the elements of a list of tensors.
    total_loss = tf.add_n(tower_losses, name='total_loss')

    # One scalar summary per loss, with the tower prefix stripped from the
    # op name.
    tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
    for loss_tensor in tower_losses + [total_loss]:
        summary_name = re.sub(tower_prefix, '', loss_tensor.op.name)
        tf.summary.scalar(summary_name, loss_tensor)

    return total_loss
def train():
    """Train CIFAR-10 for ``FLAGS.max_steps`` steps in an interactive session.

    Prints the loss and the average seconds per batch every
    ``FLAGS.log_frequency`` steps and saves a checkpoint, named after the
    step and the current loss, every 100 steps.

    NOTE(review): block extents were reconstructed from whitespace-collapsed
    source. The timer reset is assumed to sit inside the logging branch and
    the checkpoint branch is assumed to be its sibling -- confirm.
    """
    with tf.Graph().as_default():
        step_tensor = tf.train.get_or_create_global_step()

        # Keep the input pipeline on CPU:0 so that its ops do not sometimes
        # end up on the GPU and slow training down.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Forward pass, loss, and the op that updates the parameters.
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, step_tensor)

        # InteractiveSession installs itself as the default session, which
        # the initializer and the queue runners below rely on.
        sess = tf.InteractiveSession()
        saver = tf.train.Saver()
        tf.global_variables_initializer().run()

        # Start the queue threads that feed the data augmentation pipeline.
        tf.train.start_queue_runners()

        window_start = time.time()
        for step in range(FLAGS.max_steps):
            loss_value, _ = sess.run([loss, train_op])

            if step % FLAGS.log_frequency == 0:
                window = time.time() - window_start
                per_batch = float(window) / FLAGS.log_frequency
                message = '{}: step:{:6d} loss:{:.2f} {:.2f} sec/batch'.format(
                    datetime.now(), step, loss_value, per_batch)
                print(message)
                window_start = time.time()

            if step % 100 == 0:
                ckpt_name = 'model_step_{}_loss_{:.1f}'.format(step,
                                                               loss_value)
                saver.save(sess, os.path.join(FLAGS.train_dir, ckpt_name))
def evaluate():
    """Evaluate the model, either once or repeatedly.

    Builds the evaluation graph, restores the moving-average versions of the
    variables through the saver, and calls ``eval_once`` in a loop. The loop
    ends after the first pass when ``FLAGS.run_once`` is set; otherwise it
    sleeps ``FLAGS.eval_interval_secs`` seconds between passes.
    """
    with tf.Graph().as_default() as g:
        # Get images and labels; the 'test' split is selected by flag.
        eval_data = FLAGS.eval_data == "test"
        print(eval_data)
        images, labels, ground_truth = cifar10.inputs(eval_data=eval_data)

        # Build the graph that computes the logits predictions.
        logits, _ = cifar10.inference(images)
        print(logits)
        print(logits.get_shape())
        print("after inference node creation")

        loss = cifar10.loss(logits, labels)
        accuracy, precision, accuracies = cifar10.accuracy(logits,
                                                           ground_truth)

        # Flattened views of labels and logits. They are built but not used
        # by the evaluation call below.
        labels = tf.cast(labels, tf.int64)
        label_dims = labels.get_shape().as_list()
        flat_label_count = label_dims[0] * label_dims[1] * label_dims[2]
        reshaped_labels = tf.reshape(labels, [flat_label_count])

        logit_dims = logits.get_shape().as_list()
        flat_logit_count = logit_dims[0] * logit_dims[1] * logit_dims[2]
        reshaped_logits = tf.reshape(logits,
                                     [flat_logit_count, logit_dims[3]])

        # Restore the moving average version of the learned variables.
        ema = tf.train.ExponentialMovingAverage(cifar10.MOVING_AVERAGE_DECAY)
        saver = tf.train.Saver(ema.variables_to_restore())

        # Build the summary operation from the collection of summaries.
        summary_op = tf.merge_all_summaries()
        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g)

        keep_evaluating = True
        while keep_evaluating:
            print("evaluate:")
            eval_once(saver, summary_writer, summary_op, accuracy, precision,
                      accuracies)
            keep_evaluating = not FLAGS.run_once
            if keep_evaluating:
                time.sleep(FLAGS.eval_interval_secs)
def tower_loss(scope, model):
    """Build the CIFAR tower graph and return the loss held by ``model``.

    Args:
        scope: unique prefix string identifying the CIFAR tower,
            e.g. 'tower_0'. It is not read by this function.
        model: object whose ``loss`` attribute is returned unchanged.

    Returns:
        ``model.loss``.
    """
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build the inference graph and the loss ops. Their outputs are not
    # returned; the value handed back comes from ``model`` instead.
    logits = cifar10.inference(images)
    cifar10.loss(logits, labels)

    return model.loss
def tower_loss(scope):
    """Compute the total loss of one CIFAR tower.

    Args:
        scope: unique prefix string identifying a CIFAR tower,
            e.g. 'tower_0'.

    Returns:
        Tensor holding the total loss for a batch of data; evaluating it
        also runs the moving-average update of the losses.
    """
    # Inputs, inference graph and loss ops for this tower.
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)
    cifar10.loss(logits, labels)

    # Losses registered under this tower's scope, and their sum.
    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')
    tracked = tower_losses + [total_loss]

    # Moving averages of every individual loss and of the total.
    averager = tf.train.ExponentialMovingAverage(0.9, name='avg')
    update_averages = averager.apply(tracked)

    # Strip 'tower_[0-9]/' from the op names so that multi-GPU runs share
    # one set of names in tensorboard.
    tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
    for loss_tensor in tracked:
        base_name = re.sub(tower_prefix, '', loss_tensor.op.name)
        # The raw value gets the '(raw)' suffix; the averaged value keeps
        # the plain loss name.
        tf.scalar_summary(base_name + '(raw)', loss_tensor)
        tf.scalar_summary(base_name, averager.average(loss_tensor))

    with tf.control_dependencies([update_averages]):
        total_loss = tf.identity(total_loss)
    return total_loss
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.train.get_or_create_global_step() # Get images and labels for CIFAR-10. # Force input pipeline to CPU:0 to avoid operations sometimes ending up on # GPU and resulting in a slow down. with tf.device('/cpu:0'): # images:128x24x24x3 float32 labels:128 int32 images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. # logits:128x10 logits = cifar10.inference(images) # Calculate loss. loss = cifar10.loss(logits, labels) """
def setup_model(self):
    """Create placeholders, model, loss, metrics, optimizer and transforms.

    Everything is stored on ``self``; nothing is returned.
    """
    # --- inputs ---------------------------------------------------------
    images_ph = tf.placeholder(tf.float32,
                               shape=(None, 24, 24, 3),
                               name='input_x')
    self.x_pl = images_ph
    targets_ph = tf.placeholder(tf.int32, shape=(None, ), name='output_y')
    self.y_pl = targets_ph
    self.lr_pl = tf.placeholder(tf.float32, shape=(), name='learning_rate')

    # --- model and loss -------------------------------------------------
    self.y_logits = cifar10.inference(images_ph)
    self.loss = cifar10.loss(self.y_logits, targets_ph)

    # --- predictions and accuracy metrics -------------------------------
    self.y_pred = tf.cast(tf.argmax(self.y_logits, 1), tf.int32)
    hits = tf.equal(self.y_pred, targets_ph)
    self.correct_prediction = hits
    self.num_correct = tf.reduce_sum(tf.cast(hits, tf.int64))
    self.accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))

    # --- optimizer ------------------------------------------------------
    # The learning rate is fed through a placeholder on every run.
    self.optimizer = tf.train.GradientDescentOptimizer(self.lr_pl)
    self.train_op = self.optimizer.minimize(self.loss)
    # Op that re-initializes the optimizer's own variables.
    self.opt_reset_op = tf.variables_initializer(
        self.optimizer.variables())

    # Tensors fetched by self.eval(), per the original comment; extend as
    # needed.
    self.metrics = {
        'loss': self.loss,
        'correct': self.num_correct,
    }

    # --- input transforms on 32x32 images -------------------------------
    self.x_tr_pl = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    self.train_transform_op = train_transform(self.x_tr_pl)
    self.test_transform_op = test_transform(self.x_tr_pl)
def tower_loss(scope, images, labels):
    """Compute the total loss of one tower from a given batch.

    Args:
        scope: name-scope prefix used to filter the 'losses' collection.
        images: image batch passed to ``cifar10.inference`` with
            ``train=True``.
        labels: label batch passed to ``cifar10.loss``.

    Returns:
        Tensor holding the sum of the losses collected under ``scope``.
    """
    # Build the inference graph in training mode, then the loss ops. The
    # total is assembled from the collection below, so the return value of
    # cifar10.loss is not needed.
    logits = cifar10.inference(images, train=True)
    cifar10.loss(logits, labels)

    # Losses of the current tower only, and their sum.
    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')

    # One scalar summary per loss. 'tower_[0-9]/' is stripped from the name
    # so that multi-GPU runs stay readable on tensorboard.
    tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
    for loss_tensor in tower_losses + [total_loss]:
        tf.summary.scalar(re.sub(tower_prefix, '', loss_tensor.op.name),
                          loss_tensor)

    return total_loss
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
        scope: unique prefix string identifying the CIFAR tower,
            e.g. 'tower_0'

    Returns:
        Tensor of shape [] containing the total loss for a batch of data
    """
    # ref: https://github.com/tensorflow/models/issues/1264
    # FIXME: TRICK -- the input pipeline is pinned to the CPU.
    # NOTE(review): the extent of this device scope was reconstructed from
    # whitespace-collapsed source; it is assumed to cover only the input
    # pipeline -- confirm.
    with tf.device('/cpu:0'):
        images, labels = cifar10.distorted_inputs()

    # Inference graph and loss ops. The total is assembled from the
    # collection below, so the return value of cifar10.loss is not needed.
    logits = cifar10.inference(images)
    cifar10.loss(logits, labels)

    # Losses of the current tower only, and their sum.
    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')

    # One scalar summary per loss. 'tower_[0-9]/' is stripped from the name
    # so that multi-GPU runs stay readable on tensorboard.
    tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
    for loss_tensor in tower_losses + [total_loss]:
        summary_name = re.sub(tower_prefix, '', loss_tensor.op.name)
        tf.summary.scalar(summary_name, loss_tensor)

    return total_loss
def train():
    """Train CIFAR-10 for a number of steps.

    Builds a training tower and, reusing the same "model" variable scope, a
    test tower. Every 10 steps it prints progress and measures train and
    test accuracy via ``evaluate_set``; every 100 steps it writes those
    accuracies as summaries; every 1000 steps and on the final step it saves
    a checkpoint under ``FLAGS.train_dir``.

    NOTE(review): block extents were reconstructed from whitespace-collapsed
    source; the extent of each ``with`` scope should be confirmed.

    Raises:
        AssertionError: if the fetched loss value is NaN.
    """
    with tf.Graph().as_default():
        with tf.variable_scope("model") as scope:
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            # Force input pipeline to CPU:0 to avoid operations sometimes
            # ending up on GPU and resulting in a slow down.
            with tf.device('/cpu:0'):
                images_train, labels_train = cifar10.distorted_inputs()
                images_test, labels_test = cifar10.inputs(eval_data=True)

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits_train = cifar10.inference(images_train)

            # Calculate loss.
            loss = cifar10.loss(logits_train, labels_train)

            # Build a Graph that trains the model with one batch of examples
            # and updates the model parameters.
            train_op = cifar10.train(loss, global_step)

        # Second pass over the same scope with reuse=True, so the test tower
        # is built against the already-created "model" variables.
        with tf.variable_scope("model", reuse=True):
            logits_test = cifar10.inference(images_test)

        # For evaluation
        top_k = tf.nn.in_top_k(logits_train, labels_train, 1)
        top_k_test = tf.nn.in_top_k(logits_test, labels_test, 1)

        # Placeholders through which the accuracies measured in Python are
        # fed back into the graph so they can be written as summaries.
        summary_train_prec = tf.placeholder(
            tf.float32)  # fed with the training accuracy
        summary_test_prec = tf.placeholder(
            tf.float32)  # fed with the test accuracy
        tf.summary.scalar('accuracy/train',
                          summary_train_prec)  # train accuracy
        tf.summary.scalar('accuracy/test', summary_test_prec)  # test accuracy

        model_saver = tf.train.Saver(
            tf.all_variables())  # save the model by creating checkpoint
        summary_op = tf.summary.merge_all()  # merge all the summaries
        init = tf.initialize_all_variables()  # init the variables

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        train_summary_writer = tf.summary.FileWriter(
            FLAGS.train_dir, sess.graph)

        for step in range(FLAGS.max_steps):  # iterate through no of steps
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(
                loss_value), 'Model diverged with loss = NaN'

            # output the step loss after 10 batches
            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

                prec_train = evaluate_set(sess, top_k,
                                          1024)  # get train accuracy
                prec_test = evaluate_set(sess, top_k_test,
                                         1024)  # get test accuracy
                print('%s: accuracy train = %.5f' %
                      (datetime.now(), prec_train))
                print('%s: accuracy test = %.5f' %
                      (datetime.now(), prec_test))
                print(
                    "---------------------------------------------------------------------------------"
                )

            # log the summary after every 100 steps
            # prec_train / prec_test come from the step % 10 branch above,
            # which also runs on every multiple of 100.
            if step % 100 == 0:
                summary = sess.run(summary_op,
                                   feed_dict={
                                       summary_train_prec: prec_train,
                                       summary_test_prec: prec_test
                                   })
                train_summary_writer.add_summary(
                    summary, step
                )  # create summary for testing and training accuracy

            # save the model after 1000 steps
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                create_checkpoint = os.path.join(FLAGS.train_dir,
                                                 'model.ckpt')
                model_saver.save(sess, create_checkpoint, global_step=step)
def train():
    """Run one task of a distributed, synchronous CIFAR-10 training job.

    Parameter-server tasks join the server. Worker tasks build the model
    with a ``SyncReplicasOptimizer``, obtain a session through a
    ``tf.train.Supervisor`` and run ``FLAGS.max_steps`` training steps,
    logging every step.

    NOTE(review): block extents were reconstructed from whitespace-collapsed
    source; the extent of the two ``tf.device`` scopes should be confirmed.
    """
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print ('PS hosts are: %s' % ps_hosts)
    print ('Worker hosts are: %s' % worker_hosts)

    server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
                             job_name = FLAGS.job_name,
                             task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        # Hide all GPUs from the parameter server before serving.
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        server.join()

    # Workers see GPU 0 only.
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    # Task 0 is the chief; it recreates the training directory from scratch.
    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)

    device_setter = tf.train.replica_device_setter(ps_tasks=len(ps_hosts))
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.device(device_setter):
            global_step = tf.Variable(0, trainable=False)

            decay_steps = 50000*350.0/FLAGS.batch_size
            # The batch size is a graph input, fed on every step below.
            batch_size = tf.placeholder(dtype=tf.int32, shape=(),
                                        name='batch_size')
            images, labels = cifar10.distorted_inputs(batch_size)
            logits = cifar10.inference(images, batch_size)
            loss = cifar10.loss(logits, labels, batch_size)

            lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                            global_step,
                                            decay_steps,
                                            LEARNING_RATE_DECAY_FACTOR,
                                            staircase=True)
            opt = tf.train.GradientDescentOptimizer(lr)

            exp_moving_averager = tf.train.ExponentialMovingAverage(
                MOVING_AVERAGE_DECAY, global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())

            # Wrap the optimizer so that gradients from all workers are
            # aggregated before being applied.
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=len(worker_hosts),
                total_num_replicas=len(worker_hosts),
                variable_averages=exp_moving_averager,
                variables_to_average=variables_to_average)

            # Scale every gradient by the ratio of the fed batch size to
            # FLAGS.batch_size.
            naive_grads = opt.compute_gradients(loss)
            grads = [(tf.scalar_mul(tf.cast(batch_size/FLAGS.batch_size, tf.float32), grad), var) for grad, var in naive_grads]
            apply_gradients_op = opt.apply_gradients(grads,
                                                     global_step=global_step)

            # train_op returns the loss but only after the gradients have
            # been applied.
            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(loss, name='train_op')

            chief_queue_runners = [opt.get_chief_queue_runner()]
            init_tokens_op = opt.get_init_tokens_op()

            saver = tf.train.Saver()

            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()),
                                     summary_op=None,
                                     global_step=global_step,
                                     saver=saver,
                                     recovery_wait_secs=1,
                                     save_model_secs=60)

            tf.logging.info('%s Supervisor' % datetime.now())
            sess_config = tf.ConfigProto(allow_soft_placement=True,
                                         log_device_placement=False)

            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)

            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)

            # Only the chief starts the sync queue runner and runs the op
            # that creates the initial sync tokens.
            if is_chief:
                sv.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)

            # Mid-function string literal kept from the original; it is a
            # no-op expression statement, not the function docstring.
            """Train CIFAR-10 for a number of steps."""
            batch_size_num = FLAGS.batch_size
            for step in range(FLAGS.max_steps):
                start_time = time.time()

                # Full tracing is requested on every step; run_metadata is
                # filled in by the run call but not read in this function.
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()

                _, loss_value, gs = sess.run(
                    [train_op, loss, global_step],
                    feed_dict={batch_size: batch_size_num},
                    options=run_options,
                    run_metadata=run_metadata)

                duration = time.time() - start_time
                num_examples_per_step = batch_size_num
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                tf.logging.info(format_str % (datetime.now(), step, gs,
                                              loss_value, examples_per_sec,
                                              sec_per_batch))
def train():
    """Train a model for a number of steps.

    Builds the graph, restores the latest checkpoint from
    ``FLAGS.checkpoint_dir`` when one exists (otherwise initializes a new
    model), and trains up to ``FLAGS.max_steps``. It prints progress every
    10 steps, writes summaries every 100 steps and saves a checkpoint every
    1000 steps and on the final step.

    NOTE(review): block extents were reconstructed from whitespace-collapsed
    source; confirm the nesting of the branches.

    Raises:
        AssertionError: if the fetched loss value is NaN.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for a segmentation model.
        images, labels, ground_truth = cifar10.distorted_inputs()
        tf.histogram_summary('label_hist/with_ignore', labels)
        tf.histogram_summary('label_hist/ground_truth', ground_truth)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        print("before inference")
        print(images.get_shape())
        logits, nr_params = cifar10.inference(images)
        print("nr_params: "+str(nr_params) )
        print("after inference")

        # Calculate loss.
        loss = cifar10.loss(logits, labels)
        accuracy, precision, cat_accs = cifar10.accuracy(logits, ground_truth)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # tf.image_summary('images2', images)
        print (logits)
        # tf.image_summary('predictions', logits)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        # From here on the name global_step is rebound from the tf.Variable
        # to a plain Python int (the step to resume from). train_op above
        # was already built with the Variable.
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Assuming model_checkpoint_path looks something like:
            #   /my-favorite-path/cifar10_train/model.ckpt-0,
            # extract global_step from it.
            global_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
        else:
            print('No checkpoint file found')
            print('Initializing new model')
            sess.run(init)
            global_step = 0

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        # The loop starts at the restored step, or at 0 for a new model.
        for step in xrange(global_step, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value, accuracy_value, precision_value, cat_accs_val = sess.run([train_op, loss, accuracy, precision, cat_accs])
            duration = time.time() - start_time
            print (precision_value)
            print (cat_accs_val)

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            #precision_value = [0 if np.isnan(p) else p for p in precision_value]
            #print (precision_value)

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)\n Accuracy = %.4f, mean average precision = %.4f')
                print (format_str % (datetime.now(), step, loss_value,
                                     examples_per_sec, sec_per_batch, accuracy_value, np.mean(precision_value)))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                # Hand-built summary holding the values fetched during the
                # training step above, one precision and one accuracy entry
                # per name in CLASSES.
                summary = tf.Summary()
                summary.value.add(tag='Accuracy (raw)', simple_value=float(accuracy_value))
                for i,s in enumerate(CLASSES):
                    summary.value.add(tag="precision/"+s+" (raw)",simple_value=float(precision_value[i]))
                    summary.value.add(tag="accs/"+s+" (raw)",simple_value=float(cat_accs_val[i]))
                # summary.value.add(tag='Human precision (raw)', simple_value=float(precision_value))
                summary_writer.add_summary(summary, step)
                print("hundred steps")

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                print("thousand steps")
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def train(): """Train CIFAR-10 for a number of steps.""" #print '---------' ps_spec = FLAGS.ps_hosts.split(',') worker_spec = FLAGS.worker_hosts.split(',') issync = FLAGS.sync num_worker = len(worker_spec) cluster = tf.train.ClusterSpec({'ps': ps_spec, 'worker': worker_spec}) server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index) print("number of workers:%d" % num_worker) if FLAGS.job_name == 'ps': server.join() elif FLAGS.job_name == "worker": time.sleep(10) is_chief = (FLAGS.task_index == 0) # worker_device = '/job:worker/task%d/cpu:0' % FLAGS.task_index with tf.device( tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % FLAGS.task_index, cluster=cluster)): #with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to count the number of train() calls. This equals the # number of batches processed * FLAGS.num_gpus. global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) # Calculate the learning rate schedule. num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size) decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY) # Decay the learning rate exponentially based on the number of steps. lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE, global_step, decay_steps, cifar10.LEARNING_RATE_DECAY_FACTOR, staircase=True) # Create an optimizer that performs gradient descent. opt = tf.train.GradientDescentOptimizer(lr) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue( [images, labels], capacity=2 * num_worker) # Calculate the gradients for each model tower. 
## tower_grads = [] ## with tf.variable_scope(tf.get_variable_scope()): # for i in xrange(FLAGS.num_gpus): # with tf.device('/gpu:%d' % i): ## with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, FLAGS.task_index)) as scope: # Dequeues one batch for the GPU image_batch, label_batch = batch_queue.dequeue() # Calculate the loss for one tower of the CIFAR model. This function # constructs the entire CIFAR model but shares the variables across # all towers. ## loss = tower_loss(scope, image_batch, label_batch,tower_grads) logits = cifar10.inference(image_batch) loss = cifar10.loss(logits, label_batch) # Reuse variables for the next tower. ## tf.get_variable_scope().reuse_variables() # Retain the summaries from the final tower. ## summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) summaries = tf.get_collection(tf.GraphKeys.SUMMARIES) # Calculate the gradients for the batch of data on this CIFAR tower. grads = opt.compute_gradients(loss) # Keep track of the gradients across all towers. ## tower_grads.append(grads) # We must calculate the mean of each gradient. Note that this is the # synchronization point across all towers. ## grads = average_gradients(tower_grads) # Add a summary to track the learning rate. summaries.append(tf.summary.scalar('learning_rate', lr)) # Add histograms for gradients. ## for grad, var in grads: ## if grad is not None: ## summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad)) # added by faye if issync == 1: syn_opt = tf.train.SyncReplicasOptimizer( opt, replicas_to_aggregate=num_worker, #replica_id=FLAGS.task_index, total_num_replicas=num_worker) #use_locking=True) # Apply the gradients to adjust the shared variables. 
apply_gradient_op = syn_opt.apply_gradients( grads, global_step=global_step) # Newly added if is_chief: local_init_op = syn_opt.chief_init_op else: local_init_op = syn_opt.local_step_init_op ready_for_local_init_op = syn_opt.ready_for_local_init_op init_token_op = syn_opt.get_init_tokens_op() chief_queue_runner = syn_opt.get_chief_queue_runner() else: apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) # Add histograms for trainable variables. for var in tf.trainable_variables(): summaries.append(tf.summary.histogram(var.op.name, var)) # Track the moving averages of all trainable variables. variable_averages = tf.train.ExponentialMovingAverage( cifar10.MOVING_AVERAGE_DECAY, global_step) variables_averages_op = variable_averages.apply( tf.trainable_variables()) # Group all updates to into a single train op. train_op = tf.group(apply_gradient_op, variables_averages_op) # Create a saver. saver = tf.train.Saver(tf.global_variables()) # Build the summary operation from the last tower summaries. summary_op = tf.summary.merge(summaries) # Build an initialization operation to run below. # init = tf.global_variables_initializer() init_op = tf.initialize_all_variables() # Start running operations on the Graph. allow_soft_placement must be set to # True to build towers on GPU, as some of the ops do not have GPU # implementations. 
#sess = tf.Session(config=tf.ConfigProto( # allow_soft_placement=True, # log_device_placement=FLAGS.log_device_placement)) #sess = tf.Session("grpc://%s" % FLAGS.ps_hosts) #sess.run(init) if issync == 1: sv = tf.train.Supervisor( is_chief=is_chief, logdir=FLAGS.train_dir, init_op=init_op, local_init_op=local_init_op, ready_for_local_init_op=ready_for_local_init_op, summary_op=summary_op, saver=saver, global_step=global_step, save_model_secs=600) else: sv = tf.train.Supervisor(is_chief=is_chief, logdir=FLAGS.train_dir, init_op=init_op, summary_op=summary_op, saver=saver, global_step=global_step, save_model_secs=600) sess = sv.prepare_or_wait_for_session(server.target) # sess = tf.train.MonitoredTrainingSession(master=server.target, # is_chief=is_chief) # Start the queue runners. Modified by faye if is_chief and issync == 1: sess.run(init_token_op) # tf.train.start_queue_runners(sess, [chief_queue_runner]) sv.start_queue_runners(sess, [chief_queue_runner]) else: sv.start_queue_runners(sess=sess) # tf.train.start_queue_runners(sess=sess) summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) step = 0 g_step = 0 while g_step <= FLAGS.max_steps: # for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time g_step = int(tf.train.global_step(sess, global_step)) print('worker %d: step %d, global step %d: loss = %.2f' % (FLAGS.task_index, step, g_step, loss_value)) assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size * num_worker examples_per_sec = num_examples_per_step / duration sec_per_batch = duration / num_worker format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch, global_step %d)') print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch, g_step)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # 
Save the model checkpoint periodically. #if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: # checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') # saver.save(sess, checkpoint_path, global_step=step) step += 1 sv.stop()
def train():
  """Train the CIFAR-10 model for FLAGS.max_steps steps.

  Builds the input pipeline, model, loss and training op in a fresh graph and
  then runs the training op in a loop.  Every 10 steps a progress line is
  printed, every 100 steps the merged summaries are written to
  FLAGS.train_dir, and every 1000 steps (plus the final step) a checkpoint is
  saved under FLAGS.train_dir.

  Raises:
    AssertionError: if a training step returns a NaN loss.
  """
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Inputs -> logits -> loss -> parameter-update op.
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, global_step)

    # The saver covers every variable that exists in the graph at this point.
    saver = tf.train.Saver(tf.all_variables())

    # Single op that evaluates all summaries registered so far.
    summary_op = tf.merge_all_summaries()

    # Disabled: visualize conv1 features.
    # with tf.variable_scope('conv1') as scope_conv:
    #   #tf.get_variable_scope().reuse_variables()
    #   scope_conv.reuse_variables()
    #   weights = tf.get_variable('weights')
    #   grid_x = grid_y = 8  # to get a square grid for 64 conv1 features
    #   grid = put_kernels_on_grid (weights, (grid_y, grid_x))
    #   tf.image_summary('conv1/features', grid, max_images=1)

    init = tf.initialize_all_variables()

    session_config = tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_config)
    sess.run(init)

    # The input pipeline is queue based, so its runners must be started
    # before the first training step.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)

    report_fmt = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                  'sec/batch)')

    for step in xrange(FLAGS.max_steps):
      tick = time.time()
      _, loss_value = sess.run([train_op, loss])
      elapsed = time.time() - tick

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      # Console progress report.
      if step % 10 == 0:
        seconds = float(elapsed)
        throughput = FLAGS.batch_size / seconds
        print(report_fmt % (datetime.now(), step, loss_value,
                            throughput, seconds))

      # Summaries for the event log.
      if step % 100 == 0:
        summary_writer.add_summary(sess.run(summary_op), step)

      # Periodic checkpoint, and always one on the last step.
      is_last_step = (step + 1) == FLAGS.max_steps
      if step % 1000 == 0 or is_last_step:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=step)
def main_fun(argv, ctx):
  """Define the training flags, start the cluster server and train CIFAR-10.

  Args:
    argv: argument list installed as sys.argv before the flags are read.
    ctx: context object forwarded unchanged to TFNode.start_cluster_server.
  """
  import tensorflow as tf
  import cifar10

  sys.argv = argv

  flags = tf.app.flags
  FLAGS = flags.FLAGS
  flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                      'Directory where to write event logs and checkpoint.')
  flags.DEFINE_integer('max_steps', 1000000, 'Number of batches to run.')
  flags.DEFINE_boolean('log_device_placement', False,
                       'Whether to log device placement.')
  flags.DEFINE_boolean('rdma', False, 'Whether to use rdma.')

  # cifar10.maybe_download_and_extract()

  # Always start from an empty training directory.
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)

  cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Inputs -> logits -> loss -> parameter-update op.
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Prints the loss and per-step timing every 10th run call."""

      def begin(self):
        # Incremented before each run, so the first run is step 0.
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        # Request the loss value alongside whatever the caller runs.
        return tf.train.SessionRunArgs(loss)

      def after_run(self, run_context, run_values):
        elapsed = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 != 0:
          return
        report_fmt = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        throughput = FLAGS.batch_size / elapsed
        print(report_fmt % (datetime.now(), self._step, loss_value,
                            throughput, float(elapsed)))

    session_hooks = [
        tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
        tf.train.NanTensorHook(loss),
        _LoggerHook(),
    ]
    session_config = tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement)

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=session_hooks,
        config=session_config) as mon_sess:
      # The hooks decide when training is finished.
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
def train():
  """Train CIFAR-10 for FLAGS.max_steps steps (debug variant).

  Same overall loop as the plain trainer, except that summaries are written
  and flushed together with the 10-step progress line instead of every 100
  steps.  A checkpoint is saved every 1000 steps and on the final step.

  Raises:
    AssertionError: if a training step returns a NaN loss.
  """
  # Disabled experiment with a candidate sampler, kept for reference:
  # # debug
  # true_classes = np.ndarray(shape=(FLAGS.batch_size, 1), dtype=int)
  # true_classes.fill(2)
  # # Create a pair of constant ops, add the numpy
  # # array matrices.
  # true_classes_tf_matrix = tf.constant(true_classes, dtype=tf.int64)
  # # playing with introducing the sampler
  # classes_sampler = tf.nn.learned_unigram_candidate_sampler(
  #   true_classes_tf_matrix,
  #   1, # true_classes
  #   5, # num_sampled
  #   False, # unique
  #   10, # range_max
  #   seed=None,
  #   name="my_classes_sampler")
  # # print(classes_sampler)
  # # print("debug")
  # # print(classes_sampler.set_sampler)
  # # exit()
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    images, labels = cifar10.distorted_inputs()
    # print("images")
    # print(images)
    # images = tf.Print(images, [images])
    # print()
    # print(images[1])

    # Debug output: show which cifar10 module file is actually in use.
    print("------------------- train calling interference ---------------------")
    print(cifar10.__file__)

    # Logits -> loss -> parameter-update op.
    logits = cifar10.inference(images)
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, global_step)

    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_all_summaries()
    init = tf.initialize_all_variables()

    session_config = tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_config)
    sess.run(init)

    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    report_fmt = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                  'sec/batch)')

    for step in xrange(FLAGS.max_steps):
      # Planned change (not implemented): manually load the contents of
      # images and labels before calling sess.run():
      #   1. have Cifar10 dataset in memory
      #   2. create a mini-batch
      #   3. set the placeholders/vars to the mini-batch data
      #   4. run one forward-backward step
      # print("training step: " + str(step))
      tick = time.time()
      _, loss_value = sess.run([train_op, loss])
      elapsed = time.time() - tick

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        throughput = FLAGS.batch_size / elapsed
        print(report_fmt % (datetime.now(), step, loss_value,
                            throughput, float(elapsed)))

        # Debug, temporary: summaries share the 10-step cadence of the log
        # line; the regular 100-step version is kept (disabled) below.
        # NOTE(review): confirm this write belongs on the 10-step cadence
        # rather than on every step.
        summary_str = sess.run(summary_op)
        # print("summary: " + summary_str)
        summary_writer.add_summary(summary_str, step)
        summary_writer.flush()

      # if step % 100 == 0:
      #   summary_str = sess.run(summary_op)
      #   # print("summary: " + summary_str)
      #   summary_writer.add_summary(summary_str, step)
      #   summary_writer.flush()

      # Periodic checkpoint, and always one on the last step.
      is_last_step = (step + 1) == FLAGS.max_steps
      if step % 1000 == 0 or is_last_step:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=step)
def train():
  """Train CIFAR-10 with MyModel or MyModel2 for tfFLAGS.max_steps steps.

  The model module is chosen by tfFLAGS.network (1 selects MyModel, anything
  else MyModel2).  An L2 penalty on the weights and biases returned for the
  two fully connected layers is added to the loss.  Before training starts
  the number of parameters of each trainable variable, and their total, is
  printed.  Training runs in a MonitoredTrainingSession that checkpoints to
  tfFLAGS.train_dir, stops at tfFLAGS.max_steps, aborts on a NaN loss and
  logs throughput every tfFLAGS.log_frequency steps.
  """
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Build the input pipeline exactly once.  Calling distorted_inputs()
    # again for each model branch would only add a second, unused pipeline
    # to the graph.
    images, labels = cifar10.distorted_inputs()

    # Both model modules expose the same inference() interface.
    model = MyModel if tfFLAGS.network == 1 else MyModel2
    logits, fc1_w, fc1_b, fc2_w, fc2_b = model.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_w) + tf.nn.l2_loss(fc1_b) +
                    tf.nn.l2_loss(fc2_w) + tf.nn.l2_loss(fc2_b))
    # Add the regularization term to the loss.
    loss += 5e-4 * regularizers

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime every tfFLAGS.log_frequency steps."""

      def begin(self):
        # Incremented before each run, so the first run is step 0.
        self._step = -1
        self._start_time = time.time()

      def before_run(self, run_context):
        self._step += 1
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        if self._step % tfFLAGS.log_frequency == 0:
          # The duration spans every step since the previous report, so the
          # rates below are averages over log_frequency steps.
          current_time = time.time()
          duration = current_time - self._start_time
          self._start_time = current_time

          loss_value = run_values.results
          examples_per_sec = (tfFLAGS.log_frequency * tfFLAGS.batch_size /
                              duration)
          sec_per_batch = float(duration / tfFLAGS.log_frequency)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print_(format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    # Report the parameter count of every trainable variable.
    # NOTE(review): the labels assume the variables are created in this
    # order (conv1, conv2, local3, local4, softmax) -- confirm against the
    # model modules.
    texts = ['conv1:', 'conv1Biases:', 'conv2:', 'conv2Biases:',
             'local3:', 'local3Biases:', 'local4:', 'local4Biases:',
             'softmax:', 'softmaxBiases:']
    total_parameters = 0
    for index, variable in enumerate(tf.trainable_variables()):
      variable_parameters = 1
      for dim in variable.get_shape():
        variable_parameters *= dim.value
      # Fall back to the variable's own name when the model has more
      # trainable variables than there are predefined labels, instead of
      # failing with an IndexError.
      if index < len(texts):
        label = texts[index]
      else:
        label = variable.op.name + ':'
      print('Number of hidden parameters of ' + label, variable_parameters)
      total_parameters += variable_parameters
    print('Total Number of hidden parameters:', total_parameters)

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=tfFLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=tfFLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            # Request zero GPU devices for this session.
            device_count={'GPU': 0},
            log_device_placement=tfFLAGS.log_device_placement)) as mon_sess:
      # The hooks decide when training is finished.
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
def main(_):
  """Run one task of a distributed CIFAR-10 training cluster.

  The cluster is described by FLAGS.ps_hosts and FLAGS.worker_hosts
  (comma-separated host lists).  A task with FLAGS.job_name "ps" only serves
  its tf.train.Server; a "worker" task builds the model and trains it in a
  MonitoredTrainingSession until FLAGS.max_steps is reached.  Any other job
  name does nothing after the server is created.

  Args:
    _: unused positional argument.
  """

  class _LoggerHook(tf.train.SessionRunHook):
    """Prints the loss and per-step timing every 10th run call."""

    def begin(self):
      # Incremented before each run, so the first run is step 0.
      self._step = -1

    def before_run(self, run_context):
      self._step += 1
      self._start_time = time.time()
      # `loss` is bound later in main(), before any session uses this hook.
      return tf.train.SessionRunArgs(loss)

    def after_run(self, run_context, run_values):
      elapsed = time.time() - self._start_time
      loss_value = run_values.results
      if self._step % 10 != 0:
        return
      report_fmt = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
      throughput = FLAGS.batch_size / elapsed
      print(report_fmt % (datetime.now(), self._step, loss_value,
                          throughput, float(elapsed)))

  # Describe the cluster and start the server for this task.
  cluster = tf.train.ClusterSpec({
      "ps": FLAGS.ps_hosts.split(","),
      "worker": FLAGS.worker_hosts.split(","),
  })
  server = tf.train.Server(cluster,
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_index)

  if FLAGS.job_name == "ps":
    # Parameter servers only serve requests.
    server.join()
    return
  if FLAGS.job_name != "worker":
    return

  # Ops default to this worker; the device setter decides variable placement.
  device_fn = tf.train.replica_device_setter(
      worker_device="/job:worker/task:%d" % FLAGS.task_index,
      cluster=cluster)
  with tf.device(device_fn):
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Inputs -> logits -> loss -> parameter-update op.
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, global_step)

  # StopAtStepHook ends training once the global step reaches max_steps.
  hooks = [tf.train.StopAtStepHook(last_step=FLAGS.max_steps), _LoggerHook()]

  # The MonitoredTrainingSession takes care of session initialization,
  # restoring from a checkpoint, saving to a checkpoint, and closing when
  # done or when an error occurs.  Task 0 acts as the chief.
  is_chief = (FLAGS.task_index == 0)
  with tf.train.MonitoredTrainingSession(master=server.target,
                                         is_chief=is_chief,
                                         checkpoint_dir=FLAGS.train_dir,
                                         save_checkpoint_secs=60,
                                         hooks=hooks) as mon_sess:
    while not mon_sess.should_stop():
      # Each worker runs its training step asynchronously.  See
      # `tf.train.SyncReplicasOptimizer` for *synchronous* training.
      # mon_sess.run handles AbortedError in case of preempted PS.
      mon_sess.run(train_op)
def train():
  """Train CIFAR-10 from in-memory data until the batch generator is done.

  Mini-batches come from a cifar10.Cifar10BatchGenerator and are fed to the
  graph through the placeholders inputs['images_pl'] / inputs['labels_pl'].
  Every 10 steps a progress line is printed, summaries are written and a
  checkpoint is saved; a checkpoint is also saved once the generator reports
  that it is done.

  Raises:
    AssertionError: if a training step returns a NaN loss.
  """
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Inputs held in RAM instead of the queue-based reader.
    # images, labels = cifar10.standard_distorted_inputs()
    inputs = cifar10.ram_inputs(unit_variance=True, is_train=True)
    images = inputs['images']
    labels = inputs['labels']

    # Produces the mini-batches that are fed in on every step.
    batcher = cifar10.Cifar10BatchGenerator(
        inputs['data_images'], inputs['data_labels'], True, FLAGS.max_epochs)

    # Logits -> loss -> parameter-update op.
    logits = cifar10.inference(images, 3,
                               use_batchnorm=True,
                               use_nrelu=False,
                               id_decay=False,
                               add_shortcuts=True,
                               is_train=True)
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, global_step)

    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_all_summaries()
    init = tf.initialize_all_variables()

    # Let this process claim at most half of the GPU memory.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    session_config = tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement,
        gpu_options=gpu_options)
    sess = tf.Session(config=session_config)
    sess.run(init)

    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    report_fmt = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                  'sec/batch)')
    checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')

    step = -1
    while not batcher.is_done():
      step += 1
      batch_im, batch_labs = batcher.next_batch()
      feed_dict = {
          inputs['images_pl']: batch_im,
          inputs['labels_pl']: batch_labs,
      }

      tick = time.time()
      _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
      elapsed = time.time() - tick

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      on_report_step = (step % 10 == 0)
      if on_report_step:
        # Console progress report.
        throughput = FLAGS.batch_size / elapsed
        print(report_fmt % (datetime.now(), step, loss_value,
                            throughput, float(elapsed)))

        # Summaries are evaluated on the same batch that was just trained on.
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, step)

      # Checkpoint on every report step and once the data is exhausted.
      if on_report_step or batcher.is_done():
        saver.save(sess, checkpoint_path, global_step=step)