def build_model(x, y_, n_workers, is_chief):
    regularizer = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)
    y = mnist_inference.inference(x, regularizer)
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Compute the loss function and define the backpropagation process.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.argmax(y_, 1), logits=y)
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    loss = cross_entropy_mean + tf.add_n(tf.get_collection("losses"))
    learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE, global_step,
                                               60000 / BATCH_SIZE,
                                               LEARNING_RATE_DECAY)

    # Use tf.train.SyncReplicasOptimizer to apply synchronous updates.
    opt = tf.train.SyncReplicasOptimizer(
        tf.train.GradientDescentOptimizer(learning_rate),
        replicas_to_aggregate=n_workers,
        total_num_replicas=n_workers)
    sync_replicas_hook = opt.make_session_run_hook(is_chief)
    train_op = opt.minimize(loss, global_step=global_step)

    # Define the operations that need to run in every training step.
    if is_chief:
        # Maintain exponential moving averages of the variables on the chief worker.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variable_averages_op = variable_averages.apply(
            tf.trainable_variables())
        with tf.control_dependencies([variable_averages_op, train_op]):
            train_op = tf.no_op()
    return global_step, loss, train_op, sync_replicas_hook
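# A minimal sketch (not part of the original program) of how the values returned
# by build_model might be consumed inside each worker process. It assumes that
# the x and y_ placeholders, the BATCH_SIZE and TRAINING_STEPS constants, and a
# `mnist` dataset object are available; the sync_replicas_hook takes care of
# initializing the SyncReplicasOptimizer queues when the session is created.
def run_training_loop(mnist, x, y_, global_step, loss, train_op,
                      sync_replicas_hook, is_chief, master=""):
    hooks = [sync_replicas_hook,
             tf.train.StopAtStepHook(last_step=TRAINING_STEPS)]
    with tf.train.MonitoredTrainingSession(master=master,
                                           is_chief=is_chief,
                                           hooks=hooks) as sess:
        while not sess.should_stop():
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            _, loss_value, step = sess.run([train_op, loss, global_step],
                                           feed_dict={x: xs, y_: ys})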
def train(mnist):
    # Put all computations that handle the input data under the "input" name scope.
    with tf.name_scope("input"):
        x = tf.placeholder(dtype=tf.float32,
                           shape=[None, mnist_inference.INPUT_NODES],
                           name="x-input")
        y_ = tf.placeholder(dtype=tf.float32,
                            shape=[None, mnist_inference.OUTPUT_NODES],
                            name="y-input")
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZER_RATE)
    y = mnist_inference.inference(x, regularizer)
    global_step = tf.Variable(0, trainable=False)

    # Put all computations related to the moving averages under the
    # "moving_average" name scope.
    with tf.name_scope("moving_average"):
        variable_average = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variable_average_op = variable_average.apply(tf.trainable_variables())

    # Put all computations related to the loss function under the
    # "loss_function" name scope.
    with tf.name_scope("loss_function"):
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=y, labels=tf.argmax(y_, 1))
        cross_entropy_mean = tf.reduce_mean(cross_entropy)
        loss = cross_entropy_mean + tf.add_n(tf.get_collection("losses"))

    # Put the learning rate, the optimizer and the operations executed in each
    # training step under the "train_step" name scope.
    with tf.name_scope("train_step"):
        learning_rate = tf.train.exponential_decay(
            LEARNING_RATE_BASE, global_step,
            mnist.train.num_examples / BATCH_SIZE, LEARNING_RATE_DECAY)
        train_steps = tf.train.GradientDescentOptimizer(
            learning_rate).minimize(loss, global_step=global_step)
        with tf.control_dependencies([train_steps, variable_average_op]):
            train_op = tf.no_op(name="train")

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for i in range(TRAINING_STEPS):
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            _, loss_value, step, learn = sess.run(
                [train_op, loss, global_step, learning_rate],
                feed_dict={x: xs, y_: ys})
            if i % 1000 == 0:
                print("After %d training step(s), loss on training "
                      "batch is %g, learning rate is %g." %
                      (step, loss_value, learn))

        # Compute the output-layer matrix for the MNIST test data.
        final_result = sess.run(y, feed_dict={x: mnist.test.images})

    # Return the output-layer matrix.
    return final_result
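# A small sketch (an assumption, not part of the original program) of one way
# the matrix returned by train() could be used: comparing the arg-max of the
# output layer against the one-hot MNIST test labels to compute test accuracy.
import numpy as np

def test_accuracy(final_result, mnist):
    predictions = np.argmax(final_result, axis=1)
    labels = np.argmax(mnist.test.labels, axis=1)  # assumes one_hot=True labels
    return np.mean(predictions == labels)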
def get_loss(x, y_, regularizer, scope, reuse_variable=None):
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variable):
        y = mnist_inference.inference(x, regularizer)
    # Compute the cross-entropy loss.
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=y_))
    # Compute the regularization loss produced on the current GPU.
    regularization_loss = tf.add_n(tf.get_collection("losses", scope))
    # Compute the final total loss.
    loss = cross_entropy + regularization_loss
    return loss
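# A hedged sketch of how get_loss is typically called once per GPU tower. The
# names N_GPU, opt, x, y_ and regularizer are assumed to be defined elsewhere;
# model variables are reused from the second tower onward, and the per-tower
# gradients are collected for later averaging.
tower_grads = []
reuse_variables = False
for i in range(N_GPU):
    with tf.device("/gpu:%d" % i):
        with tf.name_scope("GPU_%d" % i) as scope:
            cur_loss = get_loss(x, y_, regularizer, scope, reuse_variables)
            reuse_variables = True
            grads = opt.compute_gradients(cur_loss)
            tower_grads.append(grads)
# tower_grads would then be averaged across towers and applied with
# opt.apply_gradients(...).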
def train(mnist):
    x = tf.placeholder(dtype=tf.float32,
                       shape=[None, mnist_inference.INPUT_NODES],
                       name="x-input")
    y_ = tf.placeholder(dtype=tf.float32,
                        shape=[None, mnist_inference.OUTPUT_NODES],
                        name="y-input")
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZER_RATE)
    # Use the forward propagation defined in mnist_inference directly.
    y = mnist_inference.inference(x, regularizer)
    global_step = tf.Variable(0, trainable=False)

    # Define the loss function, the moving-average model and the exponentially
    # decaying learning rate.
    variable_average = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step)
    variable_average_op = variable_average.apply(tf.trainable_variables())
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y, labels=tf.argmax(y_, 1))
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    # The total loss is the mean cross-entropy plus the regularization losses.
    loss = cross_entropy_mean + tf.add_n(tf.get_collection("losses"))
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE, global_step,
        mnist.train.num_examples / BATCH_SIZE, LEARNING_RATE_DECAY)
    train_steps = tf.train.GradientDescentOptimizer(learning_rate).minimize(
        loss, global_step=global_step)
    with tf.control_dependencies([train_steps, variable_average_op]):
        train_op = tf.no_op(name="train")

    # Initialize the TensorFlow persistence class.
    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        # The model is no longer evaluated on validation data during training;
        # validation and testing are handled by a separate program.
        for i in range(TRAINING_STEPS):
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            _, loss_value, step, learn = sess.run(
                [train_op, loss, global_step, learning_rate],
                feed_dict={x: xs, y_: ys})
            # Save the model every 1000 steps.
            if i % 1000 == 0:
                # Print the current training status. Only the loss on the current
                # training batch is reported here, which gives a rough idea of how
                # training is progressing; accuracy on the validation set is
                # produced by a separate program.
                print("After %d training step(s), loss on training "
                      "batch is %g, learning rate is %g." %
                      (step, loss_value, learn))
                # Save the current model. The global_step argument appends the
                # number of training steps to the checkpoint file name, e.g.
                # "model.ckpt-1000" is the model obtained after 1000 steps.
                saver.save(sess,
                           os.path.join(MODEL_SAVE_PATH, MODEL_NAME),
                           global_step=global_step)
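# A minimal entry point for running train() as a standalone script (a sketch;
# the data path "/tmp/mnist_data" is an assumption), using the MNIST loader that
# ships with TensorFlow 1.x.
from tensorflow.examples.tutorials.mnist import input_data

def main(argv=None):
    mnist = input_data.read_data_sets("/tmp/mnist_data", one_hot=True)
    train(mnist)

if __name__ == "__main__":
    tf.app.run()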
def evaluate(mnist):
    with tf.Graph().as_default() as g:
        x = tf.placeholder(dtype=tf.float32,
                           shape=[None, mnist_inference.INPUT_NODES],
                           name="x-input")
        y_ = tf.placeholder(dtype=tf.float32,
                            shape=[None, mnist_inference.OUTPUT_NODES],
                            name="y-input")
        validation_feed = {
            x: mnist.validation.images,
            y_: mnist.validation.labels
        }

        # Compute the forward propagation result through the shared inference
        # function. The regularization loss is irrelevant during evaluation, so
        # the regularizer argument is set to None.
        y = mnist_inference.inference(x, None)

        # Compute the accuracy.
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        # Load the model through variable renaming, so the forward propagation
        # does not need to call the moving-average functions to obtain the
        # averaged values. This allows the inference code in mnist_inference to
        # be reused without modification.
        variable_average = tf.train.ExponentialMovingAverage(
            mnist_train.MOVING_AVERAGE_DECAY)
        variable_to_restore = variable_average.variables_to_restore()
        saver = tf.train.Saver(variable_to_restore)

        # Evaluate the accuracy every EVAL_INTERVAL_SIZE seconds to monitor how
        # it changes while training is running.
        while True:
            with tf.Session() as sess:
                # tf.train.get_checkpoint_state finds the latest model file in
                # the directory through the checkpoint file.
                ckpt = tf.train.get_checkpoint_state(
                    mnist_train.MODEL_SAVE_PATH)
                print("ckpt is", ckpt)
                if ckpt and ckpt.model_checkpoint_path:
                    # Load the model.
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    # Recover the number of training steps from the file name.
                    global_step = ckpt.model_checkpoint_path.split(
                        '/')[-1].split('-')[-1]
                    print("global_step", global_step)
                    accuracy_score = sess.run(accuracy,
                                              feed_dict=validation_feed)
                    print("After %s training step(s), validation accuracy "
                          "= %g" % (global_step, accuracy_score))
                else:
                    print("No checkpoint file found")
                    return
            time.sleep(EVAL_INTERVAL_SIZE)
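# A small self-contained illustration (separate from the program above) of what
# variables_to_restore() produces: each trainable variable is keyed by the name
# of its moving-average shadow variable, so a Saver built from this dictionary
# restores the averaged values from the checkpoint into the plain variables used
# by the forward propagation.
import tensorflow as tf

v = tf.Variable(0.0, name="v")
ema = tf.train.ExponentialMovingAverage(0.99)
print(ema.variables_to_restore())
# Roughly: {"v/ExponentialMovingAverage": <tf.Variable "v:0">}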
def train(mnist, path):
    # Put all computations that handle the input data under the "input" name scope.
    with tf.name_scope("input"):
        x = tf.placeholder(dtype=tf.float32,
                           shape=[None, mnist_inference.INPUT_NODES],
                           name="x-input")
        y_ = tf.placeholder(dtype=tf.float32,
                            shape=[None, mnist_inference.OUTPUT_NODES],
                            name="y-input")
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZER_RATE)
    y = mnist_inference.inference(x, regularizer)
    global_step = tf.Variable(0, trainable=False)

    # Put all computations related to the moving averages under the
    # "moving_average" name scope.
    with tf.name_scope("moving_average"):
        variable_average = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variable_average_op = variable_average.apply(tf.trainable_variables())

    # Put all computations related to the loss function under the
    # "loss_function" name scope.
    with tf.name_scope("loss_function"):
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=y, labels=tf.argmax(y_, 1))
        cross_entropy_mean = tf.reduce_mean(cross_entropy)
        loss = cross_entropy_mean + tf.add_n(tf.get_collection("losses"))

    # Put the learning rate, the optimizer and the operations executed in each
    # training step under the "train_step" name scope.
    with tf.name_scope("train_step"):
        learning_rate = tf.train.exponential_decay(
            LEARNING_RATE_BASE, global_step,
            mnist.train.num_examples / BATCH_SIZE, LEARNING_RATE_DECAY)
        train_steps = tf.train.GradientDescentOptimizer(
            learning_rate).minimize(loss, global_step=global_step)
        with tf.control_dependencies([train_steps, variable_average_op]):
            train_op = tf.no_op(name="train")

    writer = tf.summary.FileWriter(path, tf.get_default_graph())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for i in range(TRAINING_STEPS):
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            if i % 1000 == 0:
                # Configure which information to record at run time.
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                # Proto that receives the run-time metadata.
                run_metadata = tf.RunMetadata()
                _, loss_value, step, learn = sess.run(
                    [train_op, loss, global_step, learning_rate],
                    feed_dict={x: xs, y_: ys},
                    options=run_options,
                    run_metadata=run_metadata)
                # 'step%03d' % i appears under "Session runs" in TensorBoard.
                writer.add_run_metadata(run_metadata, 'step%03d' % i)
                print("After %d training step(s), loss on training "
                      "batch is %g, learning rate is %g." %
                      (step, loss_value, learn))
            else:
                _, loss_value, step, learn = sess.run(
                    [train_op, loss, global_step, learning_rate],
                    feed_dict={x: xs, y_: ys})
    writer.close()
img_data = preprocess_case.preprocess_for_train(img_data, 1, image_size, None)
img_data = tf.reshape(img_data, [1, 784, 3])

# 5. Assemble the examples into batches to feed the neural network.
batch_size = 3
min_after_dequeue = 10000
capacity = min_after_dequeue + 3 * batch_size
img_batch, label_batch = tf.train.shuffle_batch(
    [img_data, labels],
    batch_size=batch_size,
    min_after_dequeue=min_after_dequeue,
    capacity=capacity)

# Define the neural network structure.
img_batch = tf.reshape(img_batch, [9, 784])
y = mnist_inference.inference(img_batch, None)
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(0.8, global_step, 3, 0.99)
label_batch = tf.reshape(label_batch, [9, 1])
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=y, labels=tf.argmax(label_batch, 1))
loss = tf.reduce_mean(cross_entropy)
train_steps = tf.train.GradientDescentOptimizer(learning_rate).minimize(
    loss, global_step=global_step)

with tf.Session() as sess:
    tf.local_variables_initializer().run()
    tf.global_variables_initializer().run()
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for i in range(1000):
        # Run one training step and report the loss periodically.
        _, loss_value = sess.run([train_steps, loss])
        if i % 100 == 0:
            print("After %d training step(s), loss is %g." % (i, loss_value))

    # Stop the input-queue threads.
    coord.request_stop()
    coord.join(threads)