def train():
    """Train the MNIST model from a TFRecord file in a monitored session.

    The record file list is fed through a string placeholder when the
    iterator is initialised. Training runs until one of the session hooks
    requests a stop; the loss is printed whenever the global step is a
    multiple of 100.
    """
    record_files = tf.placeholder(tf.string, [None])
    pipeline = (
        tf.data.TFRecordDataset(record_files)
        .map(mnist.parse_data)
        .shuffle(buffer_size=50000)
        .batch(FLAGS.batch_size)
        .repeat()
    )
    iterator = pipeline.make_initializable_iterator()

    global_step = tf.train.get_or_create_global_step()
    images, labels = iterator.get_next()
    logits, pred = mnist.inference(images, training=True)
    loss = mnist.loss(logits, labels)
    train_op = mnist.train(loss, global_step)

    session = tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[
            tf.train.StopAtStepHook(last_step=FLAGS.max_step),
            tf.train.NanTensorHook(loss),
        ],
        save_checkpoint_steps=100,
    )
    with session as mon_sess:
        mon_sess.run(
            iterator.initializer,
            feed_dict={record_files: ['train_img.tfrecords']},
        )
        fetches = [train_op, loss, global_step, labels]
        while not mon_sess.should_stop():
            _, batch_loss, step, _ = mon_sess.run(fetches)
            if step % 100 != 0:
                continue
            print('step: {}, loss: {}'.format(step, batch_loss))
def go_svm(proc, pca_enabled, central):
    """Fit an SVM classifier on MNIST and print its test accuracy.

    Args:
        proc: when truthy, pass both image sets through ``process``
            (the original note describes this as "only 0 and 1").
        pca_enabled: when truthy, project the flattened images onto 50
            whitened PCA components fitted on the training set.
        central: when truthy, use the ``mnist.train_32`` / ``mnist.test_32``
            image sets (the original note says "32x32").
    """
    def _flatten(batch):
        # One row per sample, all remaining axes merged.
        return batch.reshape((batch.shape[0], -1))

    print("SVM")
    for caption, flag in (("0-1", proc), ("Central", central), ("PCA", pca_enabled)):
        print(caption, flag)

    train_x, train_y = mnist.train()
    test_x, test_y = mnist.test()
    if central:
        train_x, test_x = mnist.train_32(), mnist.test_32()

    if proc:
        with Timer("process"):
            train_x, test_x = process(train_x), process(test_x)

    train_x = _flatten(train_x)
    test_x = _flatten(test_x)

    if pca_enabled:
        with Timer("PCA"):
            reducer = PCA(n_components=50, whiten=True)
            train_x = reducer.fit_transform(train_x)
            test_x = reducer.transform(test_x)

    with Timer("train"):
        classifier = svm.SVC(cache_size=7000)
        classifier.fit(train_x, train_y)

    print("Accuracy:", classifier.score(test_x, test_y))
def main(num_epochs=NUM_EPOCHS):
    """Load the data, build the model and train it, printing per-epoch stats.

    Args:
        num_epochs: training stops once an epoch with this number (or a
            higher one) has been reported.

    Returns:
        The output layer returned by ``build_model``. It is returned even
        when training is interrupted with Ctrl-C.
    """
    print("Loading data...")
    dataset = load_data()

    print("Building model and compiling functions...")
    output_layer = build_model(
        input_height=dataset["input_height"],
        input_width=dataset["input_width"],
        output_dim=dataset["output_dim"],
    )
    iter_funcs = create_iter_functions(dataset, output_layer, X_tensor_type=T.tensor4)

    print("Starting training...")
    now = time.time()
    try:
        for epoch in train(iter_funcs, dataset):
            print("Epoch {} of {} took {:.3f}s".format(
                epoch["number"], num_epochs, time.time() - now))
            now = time.time()
            print(" training loss:\t\t{:.6f}".format(epoch["train_loss"]))
            print(" validation loss:\t\t{:.6f}".format(epoch["valid_loss"]))
            # str.format does not treat '%' as an escape character, so a
            # single '%' is required; '%%' would be printed verbatim.
            print(" validation accuracy:\t\t{:.2f} %".format(
                epoch["valid_accuracy"] * 100))
            if epoch["number"] >= num_epochs:
                break
    except KeyboardInterrupt:
        # Ctrl-C is a deliberate way to end training early.
        pass

    return output_layer
def train(baseModel, output_model_path, epochs=1):
    """Run local training from a base model and notify the edge operator.

    Weights are read from ``/repos/<baseModel.path>/weights.tar`` and written
    to ``/repos/<output_model_path>/weights.tar``. After a successful run a
    ``LocalTrainFinish`` message carrying the metrics is sent to
    ``OPERATOR_URI``.

    Errors are logged, never raised. If training itself fails, no finish
    message is sent because there are no metrics to report.

    Args:
        baseModel: object whose ``path`` attribute names the base model
            directory under ``/repos``.
        output_model_path: directory under ``/repos`` for the new weights.
        epochs: number of epochs forwarded to ``mnist.train``.
    """
    data = get_training_data()
    output = os.path.join("/repos", output_model_path, 'weights.tar')
    logging.info(f'input path: [{baseModel.path}]')
    logging.info(f'output path: [{output}]')
    logging.info(f'epochs: {epochs}')

    base_weight_path = os.path.join("/repos", baseModel.path, "weights.tar")
    try:
        metrics = mnist.train(data, output, epochs=epochs, resume=base_weight_path)
    except Exception:
        # Stop here: `metrics` is unbound, so building the result below would
        # only fail with a NameError that hides the real cause.
        logging.exception("local training failed")
        return

    # Send finish message
    logging.info(f"GRPC_CLIENT_URI: {OPERATOR_URI}")
    try:
        # The context manager closes the channel on both success and failure.
        with grpc.insecure_channel(OPERATOR_URI) as channel:
            stub = service_pb2_grpc.EdgeOperatorStub(channel)
            result = service_pb2.LocalTrainResult(
                error=0, datasetSize=2500, metrics=metrics)
            response = stub.LocalTrainFinish(result)
    except grpc.RpcError as rpc_error:
        logging.error("grpc error: {}".format(rpc_error))
    except Exception as err:
        logging.error('got error: {}'.format(err))
    else:
        # Only claim success (and touch `response`) when the call returned.
        logging.debug(
            "sending grpc message succeeds, response: {}".format(response))
def go(pca_enabled=False, centralize=False):
    """Fit an ``MLPClassifier`` on MNIST and print its test accuracy.

    Args:
        pca_enabled: when truthy, project the flattened images onto 50
            whitened PCA components fitted on the training set.
        centralize: when truthy, use the ``mnist.train_32`` /
            ``mnist.test_32`` image sets.
    """
    print("PCA:", pca_enabled)
    print("Centralize:", centralize)

    train_x, train_y = mnist.train()
    test_x, test_y = mnist.test()
    if centralize:
        train_x, test_x = mnist.train_32(), mnist.test_32()

    # Flatten every sample to a single feature vector.
    train_x = train_x.reshape((train_x.shape[0], -1))
    test_x = test_x.reshape((test_x.shape[0], -1))

    if pca_enabled:
        with Timer("PCA"):
            reducer = PCA(n_components=50, whiten=True)
            train_x = reducer.fit_transform(train_x)
            test_x = reducer.transform(test_x)

    with Timer("train"):
        # Either preprocessing option gets a larger iteration budget.
        if centralize or pca_enabled:
            iteration_cap = 1000
        else:
            iteration_cap = 200
        classifier = MLPClassifier(max_iter=iteration_cap, verbose=True)
        classifier.fit(train_x, train_y)

    print("Accuracy:", classifier.score(test_x, test_y))
def train():
    """Train the MNIST model with queue-runner inputs in a plain session.

    Runs ``FLAGS.max_step`` training steps. Every 100th step prints the loss
    and saves a checkpoint to ``<FLAGS.train_dir>/model.ckpt``.
    """
    images, labels = mnist.inputs(
        ['train_img.tfrecords'],
        mnist.TRAIN_EXAMPLES_NUM,
        FLAGS.batch_size,
        shuffle=True,
    )
    global_step = tf.train.get_or_create_global_step()
    logits, pred = mnist.inference(images, training=True)
    loss = mnist.loss(logits, labels)
    train_op = mnist.train(loss, global_step)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.group(
            tf.local_variables_initializer(),
            tf.global_variables_initializer(),
        ))
        checkpoint_prefix = os.path.join(FLAGS.train_dir, 'model.ckpt')

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess, coord=coord)

        fetches = [train_op, loss, pred, labels]
        for step in range(1, FLAGS.max_step + 1):
            # Predictions and labels are fetched too but not used here.
            _, batch_loss, _, _ = sess.run(fetches)
            if step % 100 != 0:
                continue
            print('step: {}, loss: {}'.format(step, batch_loss))
            saver.save(sess, checkpoint_prefix, global_step=step)

        coord.request_stop()
        coord.join(threads)
def main():
    """Train on ``mnist_data`` batches fed in by hand.

    Prints the loss after every step. After every 10th step it also prints
    the accuracy on one batch from the validation split.
    """
    images, labels = inputs()
    image_shape = [
        mnist.BATCH_SIZE,
        mnist.IMAGE_HEIGHT,
        mnist.IMAGE_WIDTH,
        mnist.IMAGE_DEPTH,
    ]
    logits = mnist.inference(tf.reshape(images, image_shape))
    loss = mnist.loss(logits, labels)
    accuracy = mnist.accuracy(logits, labels)
    train_op = mnist.train(loss)
    init = tf.initialize_all_variables()

    with tf.Session() as sess:
        sess.run(init)
        for step in range(1, NUM_STEPS + 1):
            train_x, train_y = mnist_data.train.next_batch(mnist.BATCH_SIZE)
            _, loss_value = sess.run(
                [train_op, loss],
                feed_dict={images: train_x, labels: train_y},
            )
            print("step:" + str(step) + " loss: " + str(loss_value))

            if step % 10 != 0:
                continue
            valid_x, valid_y = mnist_data.validation.next_batch(mnist.BATCH_SIZE)
            accuracy_score = sess.run(
                accuracy,
                feed_dict={images: valid_x, labels: valid_y},
            )
            print("accuracy : " + str(accuracy_score))
def main(num_epochs=NUM_EPOCHS):
    """Build the model from the loaded dataset and train it.

    Prints loss and accuracy for each epoch yielded by ``train`` and stops
    once the epoch number reaches ``num_epochs``.
    """
    dataset = load_data()
    model_kwargs = {
        key: dataset[key]
        for key in ('input_height', 'input_width', 'output_dim')
    }
    output_layer = build_model(**model_kwargs)
    iter_funcs = create_iter_functions(
        dataset, output_layer, X_tensor_type=T.tensor4)

    print("Starting training...")
    for epoch in train(iter_funcs, dataset):
        number = epoch['number']
        print("Epoch %d of %d" % (number, num_epochs))
        print(" training loss:\t\t%.6f" % epoch['train_loss'])
        print(" validation loss:\t\t%.6f" % epoch['valid_loss'])
        print(" validation accuracy:\t\t%.2f %%" % (epoch['valid_accuracy'] * 100))
        if number >= num_epochs:
            break
def go(proc, central):
    """Fit a 5-nearest-neighbour classifier on MNIST and print its accuracy.

    Args:
        proc: when truthy, pass both image sets through ``process``.
        central: when truthy, use the ``mnist.train_32`` / ``mnist.test_32``
            image sets.
    """
    def _flatten(batch):
        # One row per sample, all remaining axes merged.
        return batch.reshape((batch.shape[0], -1))

    print("kNN")
    print("0-1", proc)
    print("central", central)

    train_x, train_y = mnist.train()
    test_x, test_y = mnist.test()
    if central:
        train_x, test_x = mnist.train_32(), mnist.test_32()

    if proc:
        with Timer("process"):
            train_x, test_x = process(train_x), process(test_x)

    train_x = _flatten(train_x)
    test_x = _flatten(test_x)

    with Timer("kNN fit"):
        classifier = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
        classifier.fit(train_x, train_y)

    with Timer("kNN test"):
        print("Accuracy:", classifier.score(test_x, test_y))
def main():
    """Train with queue-based inputs and write loss/accuracy summaries.

    Training and validation graphs share variables through the "inference"
    variable scope. The merged summaries are written after every step and
    the loss is printed.
    """
    shuffle_fraction = 0.4
    images, labels = mnist_inputs(
        int(mnist.NUM_EXAMPLES_PER_EPOCH * shuffle_fraction))
    validation_images, validation_labels = mnist_inputs(
        2000, train=False, num_epochs=None)

    with tf.variable_scope("inference") as scope:
        logits = mnist.inference(images)
        # The second call must reuse the variables created by the first.
        scope.reuse_variables()
        validation_logits = mnist.inference(validation_images)

    loss = mnist.loss(logits, labels)
    tf.scalar_summary("cross_entropy", loss)
    accuracy = mnist.accuracy(validation_logits, validation_labels)
    tf.scalar_summary("validation_accuracy", accuracy)
    train_op = mnist.train(loss)

    sess = tf.Session()
    for initializer in (tf.initialize_local_variables, tf.initialize_all_variables):
        sess.run(initializer())
    tf.train.start_queue_runners(sess=sess)

    merge = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter(
        "/home/windows98/PycharmProjects/mnist/Summary/")

    fetches = [train_op, loss, merge]
    for step in range(1, NUM_STEPS + 1):
        _, loss_value, summary = sess.run(fetches)
        writer.add_summary(summary, step)
        print("step:" + str(step) + " loss: " + str(loss_value))
def main(num_epochs=NUM_EPOCHS):
    """Build the model from the loaded dataset, train it and return it.

    Args:
        num_epochs: training stops once an epoch with this number (or a
            higher one) has been reported.

    Returns:
        The output layer returned by ``build_model``.
    """
    dataset = load_data()
    output_layer = build_model(
        input_width=dataset['input_width'],
        # Previously this read dataset['input_width']; the height must come
        # from its own key so non-square inputs are built correctly.
        input_height=dataset['input_height'],
        output_dim=dataset['output_dim'],
    )
    iter_funcs = create_iter_functions(
        dataset,
        output_layer,
        X_tensor_type=T.tensor4,
    )

    print("Starting training...")
    for epoch in train(iter_funcs, dataset):
        print("Epoch %d of %d" % (epoch['number'], num_epochs))
        print(" training loss:\t\t%.6f" % epoch['train_loss'])
        print(" validation loss:\t\t%.6f" % epoch['valid_loss'])
        print(" validation accuracy:\t\t%.2f %%" % (epoch['valid_accuracy'] * 100))
        if epoch['number'] >= num_epochs:
            break

    return output_layer
def main(argv):
    """Train and evaluate the CNN estimator on MNIST, printing the results.

    Args:
        argv: full argument vector; ``argv[0]`` is skipped when parsing.
            With ``args.central`` set, the ``*_32`` image sets are used and
            the model is told the width is 32 instead of 45.
    """
    args = parser.parse_args(argv[1:])

    train_x, train_y = mnist.train()
    test_x, test_y = mnist.test()
    if args.central:
        print("Use centralize now")
        width = 32
        train_x = mnist.train_32()
        test_x = mnist.test_32()
    else:
        width = 45

    train_x = train_x / 256
    test_x = test_x / 256
    train_y = train_y.astype(np.int32)
    test_y = test_y.astype(np.int32)

    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, params={"width": width})

    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_x},
        y=train_y,
        batch_size=100,
        num_epochs=None,
        shuffle=True,
    )
    mnist_classifier.train(input_fn=train_input_fn, steps=10000)

    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": test_x},
        y=test_y,
        num_epochs=1,
        shuffle=False,
    )
    print(mnist_classifier.evaluate(input_fn=eval_input_fn))
def train():
    """Train the model built by the ``mnist`` module for a number of steps.

    Training runs in a monitored session that checkpoints to
    ``FLAGS.train_dir``, stops at ``FLAGS.max_steps``, aborts on a NaN loss,
    and logs throughput every ``FLAGS.log_frequency`` steps.
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Keep the input pipeline on the CPU so its ops do not end up on the
        # GPU and slow training down.
        with tf.device('/cpu:0'):
            images, labels = mnist.distorted_inputs()

        # Inference graph -> loss -> training op.
        logits = mnist.inference(images)
        loss = mnist.loss(logits, labels)
        train_op = mnist.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Ask for the loss value alongside whatever is being run.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                now = time.time()
                duration = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                sec_per_batch = float(duration / FLAGS.log_frequency)

                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print(template % (datetime.now(), self._step, loss_value,
                                  examples_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """Train the ``mnist`` model in a monitored session with periodic logging.

    The global-step tensor is printed once while the graph is built. The
    session checkpoints to ``FLAGS.train_dir``, stops at ``FLAGS.max_steps``
    and aborts on a NaN loss.
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        with tf.device('/cpu:0'):
            images, labels = mnist.distorted_inputs()

        print(global_step)

        # Explicit '/gpu:0' placement of the three ops below is left disabled.
        logits = mnist.inference(images)
        loss = mnist.loss(logits, labels)
        train_op = mnist.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Prints the loss and throughput every FLAGS.log_frequency runs."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                now = time.time()
                duration = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                sec_per_batch = float(duration / FLAGS.log_frequency)

                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print(template % (datetime.now(), self._step, loss_value,
                                  examples_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train_and_validation():
    """Train the model, report precision periodically, then score the test set.

    One re-initialisable iterator is shared by the training, validation and
    test datasets. A checkpoint is saved every 100 steps. Every 1000 steps
    precision is printed for the training and validation data, after which
    the iterator is switched back to the training data. Test precision is
    printed once training has finished.
    """
    training_dataset = tf.data.TFRecordDataset(['./train_img.tfrecords'])
    validation_dataset = tf.data.TFRecordDataset(['./validation_img.tfrecords'])
    test_dataset = tf.data.TFRecordDataset(['./test_img.tfrecords'])

    training_dataset = training_dataset.map(mnist.parse_data)
    training_dataset = training_dataset.shuffle(50000)
    training_dataset = training_dataset.batch(FLAGS.batch_size)
    training_dataset = training_dataset.repeat()
    validation_dataset = validation_dataset.map(mnist.parse_data)
    validation_dataset = validation_dataset.batch(FLAGS.batch_size)
    test_dataset = test_dataset.map(mnist.parse_data)
    test_dataset = test_dataset.batch(FLAGS.batch_size)

    iterator = tf.data.Iterator.from_structure(
        output_types=training_dataset.output_types,
        output_shapes=training_dataset.output_shapes,
    )
    training_init_op = iterator.make_initializer(training_dataset)
    validation_init_op = iterator.make_initializer(validation_dataset)
    test_init_op = iterator.make_initializer(test_dataset)

    images, labels = iterator.get_next()
    training = tf.placeholder(dtype=tf.bool)
    logits, pred = mnist.inference(images, training=training)
    loss = mnist.loss(logits, labels)
    top_k_op = tf.nn.in_top_k(logits, labels, 1)
    global_step = tf.train.get_or_create_global_step()
    train_op = mnist.train(loss, global_step)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(training_init_op)
        print('begin to train!')
        checkpoint_prefix = os.path.join(FLAGS.train_dir, 'model.ckpt')

        fetches = [train_op, loss, global_step, labels]
        train_step = 0
        while train_step < FLAGS.max_step:
            _, train_loss, _, _ = sess.run(fetches, feed_dict={training: True})
            train_step += 1

            if train_step % 100 == 0:
                saver.save(sess, checkpoint_prefix, train_step)

            if train_step % 1000 != 0:
                continue

            # The iterator still points at the training data here.
            precision = evaluate(sess, top_k_op, training, mnist.TRAIN_EXAMPLES_NUM)
            print('step: {}, loss: {}, training precision: {}'.format(
                train_step, train_loss, precision))

            sess.run(validation_init_op)
            precision = evaluate(sess, top_k_op, training,
                                 mnist.VALIDATION_EXAMPLES_NUM)
            print('step: {}, loss: {}, validation precision: {}'.format(
                train_step, train_loss, precision))

            # Switch back to the training data before the next step.
            sess.run(training_init_op)

        sess.run(test_init_op)
        precision = evaluate(sess, top_k_op, training, mnist.TEST_EXAMPLES_NUM)
        print('finally test precision: {}'.format(precision))
def main(num_epochs=NUM_EPOCHS):
    """Train the model, saving the weights every 100 epochs.

    Args:
        num_epochs: training stops once an epoch with this number (or a
            higher one) has been reported.

    Returns:
        The output layer returned by ``build_model``. It is returned even
        when training is interrupted with Ctrl-C.
    """
    print("Loading data...")
    dataset = load_data()

    print("Building model and compiling functions...")
    output_layer = build_model(
        input_height=dataset['input_height'],
        input_width=dataset['input_width'],
        output_dim=dataset['output_dim'],
    )
    iter_funcs = create_iter_functions(
        dataset,
        output_layer,
        X_tensor_type=T.tensor4,
    )

    print("Starting training...")
    now = time.time()
    try:
        for epoch in train(iter_funcs, dataset):
            print("Epoch {} of {} took {:.3f}s".format(
                epoch['number'], num_epochs, time.time() - now))
            now = time.time()
            print(" training loss:\t\t{:.6f}".format(epoch['train_loss']))
            print(" validation loss:\t\t{:.6f}".format(epoch['valid_loss']))
            # str.format has no '%%' escape; a single '%' prints one sign.
            print(" validation accuracy:\t\t{:.2f} %".format(
                epoch['valid_accuracy'] * 100))

            # Compare by value: `is 0` tests object identity, which is an
            # implementation detail for ints and a SyntaxWarning on 3.8+.
            if epoch['number'] % 100 == 0:
                weights_save = lasagne.layers.get_all_param_values(output_layer)
                # The context manager flushes and closes the file.
                with open("weights_epoch%d.pkl" % epoch['number'], "wb") as weights_file:
                    pickle.dump(weights_save, weights_file)
                # To restore: unpickle the file and pass the result to
                # lasagne.layers.set_all_param_values(output_layer, ...).

            if epoch['number'] >= num_epochs:
                break
    except KeyboardInterrupt:
        # Ctrl-C is a deliberate way to end training early.
        pass

    return output_layer
def main():
    """Train the phrase model on a fixed-size dataset and return it.

    Returns:
        The output layer returned by ``build_model``. It is returned even
        when training is interrupted with Ctrl-C.
    """
    BATCH_SIZE = 100
    MAX_PHRASE_LENGTH = 108
    NUM_DATAPOINTS = 10000

    print("Loading data...")
    dataset = load_data(n=NUM_DATAPOINTS, max_phrase_length=MAX_PHRASE_LENGTH)

    print("Building model and compiling functions...")
    output_layer = build_model(
        batch_size=BATCH_SIZE,
        max_phrase_length=MAX_PHRASE_LENGTH,
        output_dim=dataset['output_dim'],
    )
    iter_funcs = create_iter_functions(
        dataset,
        output_layer,
        X_tensor_type=T.tensor3,
        batch_size=BATCH_SIZE,
        learning_rate=0.001,
        momentum=0.9,
    )

    print("Starting training...")
    now = time.time()
    try:
        for epoch in train(iter_funcs, dataset, BATCH_SIZE):
            # NOTE(review): `num_epochs` is not defined in this function; it
            # must exist at module level or this raises NameError -- confirm.
            print("Epoch {} of {} took {:.3f}s".format(
                epoch['number'], num_epochs, time.time() - now))
            now = time.time()
            print(" training loss:\t\t{:.6f}".format(epoch['train_loss']))
            print(" validation loss:\t\t{:.6f}".format(epoch['valid_loss']))
            # str.format has no '%%' escape; a single '%' prints one sign.
            print(" validation accuracy:\t\t{:.2f} %".format(
                epoch['valid_accuracy'] * 100))
            if epoch['number'] >= num_epochs:
                break
    except KeyboardInterrupt:
        # Ctrl-C is a deliberate way to end training early.
        pass

    return output_layer
def main(num_epochs=NUM_EPOCHS):
    """Load the data, build the model and train it, printing per-epoch stats.

    Args:
        num_epochs: training stops once an epoch with this number (or a
            higher one) has been reported.

    Returns:
        The output layer returned by ``build_model``. It is returned even
        when training is interrupted with Ctrl-C.
    """
    print("Loading data...")
    dataset = load_data()

    print("Building model and compiling functions...")
    output_layer = build_model(
        input_height=dataset['input_height'],
        input_width=dataset['input_width'],
        output_dim=dataset['output_dim'],
    )
    iter_funcs = create_iter_functions(
        dataset,
        output_layer,
        X_tensor_type=T.tensor4,
    )

    print("Starting training...")
    now = time.time()
    try:
        for epoch in train(iter_funcs, dataset):
            print("Epoch {} of {} took {:.3f}s".format(
                epoch['number'], num_epochs, time.time() - now))
            now = time.time()
            print(" training loss:\t\t{:.6f}".format(epoch['train_loss']))
            print(" validation loss:\t\t{:.6f}".format(epoch['valid_loss']))
            # With str.format, '%' needs no escaping; '%%' would be printed
            # as two percent signs.
            print(" validation accuracy:\t\t{:.2f} %".format(
                epoch['valid_accuracy'] * 100))
            if epoch['number'] >= num_epochs:
                break
    except KeyboardInterrupt:
        # Ctrl-C is a deliberate way to end training early.
        pass

    return output_layer
# NOTE(review): the next two statements look like the tail of a definition
# that begins above this excerpt; they are reproduced unchanged.
teacher_out = teacher.forward_pass(x)
student.train(x, teacher_out)


if __name__ == "__main__":
    # Train a three-layer teacher network on the MNIST CSV data.
    teacher_layers = [
        Layer(784, 16, LeakyReLU()),
        Layer(16, 16, LeakyReLU()),
        Layer(16, 10, LeakyReLU()),
    ]
    teacher_net = NeuralNetwork(teacher_layers, CrossEntropyLoss(), 0.001)

    train_data = load_data("mnistdata/mnist_train.csv", delimiter=",", dtype=int)
    train(teacher_net, train_data)

    test_data = load_data("mnistdata/mnist_test.csv", delimiter=",", dtype=int)
    accuracy = test(teacher_net, test_data)
    print(f"Accuracy of the teacher net is {100*accuracy:.2f}")

    # Train a single-layer student to reproduce the teacher's outputs.
    student_layers = [
        Layer(784, 10, Sigmoid()),
    ]
    student_net = NeuralNetwork(student_layers, MSELoss(), 0.005)
    train_student(student_net, teacher_net, train_data)

    student_accuracy = test(student_net, test_data)
    # Report the student's own score; this line previously printed the
    # teacher's `accuracy` again.
    print(f"Accuracy of the student net is {100*student_accuracy:.2f}")
test_losses.append(test_loss) print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( test_loss, correct, 10000, 100. * correct / 10000)) return confusion network = network.float() test() optim_params = {} optim_params['lr'] = 0.0005 optim_params['momentum'] = .9 confusion = np.zeros((10, 10), dtype='i4') mnist.train(network, n_epochs=5, log_interval=100, optim_params=optim_params, batch_size=100) optim_params['lr'] = 0.0001 mnist.train(network, n_epochs=5, log_interval=100, optim_params=optim_params, batch_size=100) print(mnist.get_acc(network)) torch.save(network.state_dict(), 'model.pth') print(confusion) newFile = open('Data/CNN Data.txt', 'w') newFile.write(str(confusion))
import os

import horovod.tensorflow as hvd

import mnist

if __name__ == '__main__':
    hvd.init()

    # Only use the GPU that horovod gives us.
    # NOTE(review): `config` is not passed to anything in this block --
    # confirm that a session is created from it elsewhere.
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Synchronize initial values across workers.
    # NOTE(review): verify that this hook class provides `on_train_begin`.
    broadcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    broadcast_hook.on_train_begin()

    ctx = mnist.Context()
    ctx.random_dim = 100
    ctx.filters = 64
    ctx.batch_size = 128
    ctx.epochs = 51

    # The learning rate is multiplied by the number of horovod workers.
    base_optimizer = tf.keras.optimizers.Adam(lr=0.0002 * hvd.size(), beta_1=0.5)
    ctx.opt = hvd.DistributedOptimizer(base_optimizer)

    mnist.load_data(ctx)
    ctx.generator = mnist.greate_dc_generator(ctx)
    ctx.discriminator = mnist.create_discriminator(ctx)
    ctx.gan = mnist.create_GAN(ctx)

    mnist.train(ctx)
def train():
    """Train the MNIST network inside a monitored training session.

    Variable initialisation and checkpoint restore are left entirely to
    ``tf.train.MonitoredTrainingSession``. No explicit initializer is run:
    the one previously run here was created when ``global_step`` was the only
    variable, so running it reset the step counter after a checkpoint
    restore and was also counted as a step by the logging hook.
    """
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Fetch an (image, label) batch pair.
    image_batch, label_batch = mnist.inputs('train')

    # Flatten the labels: per the original note, the sparse softmax
    # cross-entropy loss wants labels of rank one less than the logits.
    # NOTE(review): 50 is presumably the batch size -- confirm it matches
    # FLAGS.batch_size.
    label_batch = tf.reshape(label_batch, [50])

    # Add a trailing depth axis: [batch, row, col] -> [batch, row, col, 1].
    expand_image_batch = tf.expand_dims(image_batch, -1)

    # Labels stay as class indices; per the original note the sparse loss
    # handles the one-hot conversion itself.
    logits = mnist.inference(expand_image_batch, dropout=0.5)
    loss = mnist.loss(logits=logits, labels=label_batch)
    accuracy = mnist.train_accuracy(logits, label_batch)
    train_op = mnist.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
        """Logs loss, accuracy and runtime."""

        def begin(self):
            self._step = -1
            self._start_time = time.time()

        def before_run(self, run_context):
            self._step += 1
            # Request the tensors whose values after_run reads.
            return tf.train.SessionRunArgs([loss, accuracy])

        def after_run(self, run_context, run_values):
            if self._step % FLAGS.log_frequency == 0:
                _current_time = time.time()
                duration = _current_time - self._start_time
                self._start_time = _current_time

                # Values requested in before_run.
                loss_value, accuracy_value = run_values.results

                examples_per_sec = FLAGS.batch_size * FLAGS.log_frequency / duration
                sec_per_batch = float(duration / FLAGS.log_frequency)

                # time: step, loss, accuracy (examples/sec, sec/batch)
                format_str = (
                    '%s: step %d, loss=%.2f, accuracy=%.2f(%.1f examples/sec, %.3f sec/batch)'
                )
                print(format_str % (datetime.now(), self._step, loss_value,
                                    accuracy_value, examples_per_sec,
                                    sec_per_batch))

    # Checkpoints are saved every 60 seconds (save_checkpoint_secs below).
    # NOTE(review): the original comments mention a 20000-step limit and
    # logging every 10 steps; those would be FLAGS defaults -- confirm.
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.train_dir,
            hooks=[
                tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                tf.train.NanTensorHook(loss),
                _LoggerHook()
            ],
            save_checkpoint_secs=60,
            config=tf.ConfigProto(log_device_placement=False,
                                  allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(train_op)
def _is_test_step(step):
    """Return True on steps where test accuracy is evaluated and reported."""
    return ((step <= 300 and step % 10 == 0)
            or (300 < step <= 1000 and step % 100 == 0)
            or step % 200 == 0)


def _is_checkpoint_step(step):
    """Return True on steps where a checkpoint is written."""
    return ((step <= 100 and step % 10 == 0)
            or (100 < step <= 1000 and step % 100 == 0)
            or step % 1000 == 0)


def _write_batch_marks(labels, predictions, count):
    """Write one coloured 'x' per sample: blue when correct, red otherwise."""
    for row in range(count):
        expected = np.argmax(labels[row])
        predicted = np.argmax(predictions[row])
        if expected == predicted:
            sys.stdout.write("\033[94mx\033[0m")
        else:
            sys.stdout.write("\033[91mx\033[0m")
    sys.stdout.write("... ")


def train(FLAGS, mnist_data, handler=None):
    """Train the MNIST model for up to 20000 steps, reporting progress.

    Args:
        FLAGS: object providing ``ckpt_dir`` (wiped and recreated before
            training) and ``verbose``.
        mnist_data: dataset object providing ``train.next_batch`` and
            ``test.images`` / ``test.labels``.
        handler: optional object with ``fire(type, data)`` and
            ``listen(callback)``. When given, 'test' and 'checkpoint' events
            are sent to it, and a 'close' message or a 'cancel' event
            received from it stops training.

    Returns:
        True, whether training completed or was cancelled.
    """
    class _Session:
        cancel = False

    session = _Session()

    if handler:
        def fire(typ, data=None):
            try:
                handler.fire(typ, data)
            except WebSocketClosedError:
                # The client is gone; stop at the next step.
                print('WebSocket closed error.')
                session.cancel = True

        def cancel():
            session.cancel = True
            fire('cancel')

        def listener(message):
            if message == 'close' or message['event'] == 'cancel':
                cancel()

        handler.listen(listener)
    else:
        # Same signature as the real `fire` so call sites need no special case.
        def fire(typ, data=None):
            pass

    batch_size = 50

    with tf.Graph().as_default():
        # Placeholders
        x = tf.placeholder(tf.float32, shape=[None, 784])
        y = tf.placeholder(tf.float32, shape=[None, 10])
        keep_prob = tf.placeholder(tf.float32)

        inference = mnist.inference(x, keep_prob)
        train = mnist.train(inference, y)
        accuracy = mnist.accuracy(inference, y)

        merge = tf.summary.merge_all()
        saver = tf.train.Saver(max_to_keep=200)

        print('Checkpoints directory: %s' % FLAGS.ckpt_dir)
        if tf.gfile.Exists(FLAGS.ckpt_dir):
            print('Cleaning checkpoints...')
            tf.gfile.DeleteRecursively(FLAGS.ckpt_dir)
        tf.gfile.MakeDirs(FLAGS.ckpt_dir)

        # The session and the summary writer are closed even when a step fails.
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter(FLAGS.ckpt_dir, sess.graph)
            try:
                # Training and evaluating
                print('Start training.')
                start = time.time()
                for step in range(1, 20000 + 1):
                    if session.cancel:
                        print('Interrupting training...')
                        break

                    batch = mnist_data.train.next_batch(batch_size)

                    if FLAGS.verbose:
                        result = sess.run(inference, feed_dict={
                            x: batch[0], y: batch[1], keep_prob: 1.0
                        })
                        _write_batch_marks(batch[1], result, batch_size)

                    if _is_test_step(step):
                        test_inference, test_accuracy = sess.run(
                            [inference, accuracy],
                            feed_dict={
                                x: mnist_data.test.images,
                                y: mnist_data.test.labels,
                                keep_prob: 1.0
                            })
                        fire('test', {
                            'step': step,
                            'inference': test_inference.argmax(axis=1).tolist(),
                            'accuracy': str(test_accuracy)
                        })
                        sys.stdout.write("===> Step %5d, Test Accuracy: %1.02f"
                                         % (step, test_accuracy))
                        if _is_checkpoint_step(step):
                            ckpt = saver.save(
                                sess, os.path.join(FLAGS.ckpt_dir, 'ckpt'),
                                global_step=step)
                            sys.stdout.write(" @%s" % ckpt)
                            fire('checkpoint', {'step': step})
                        # print('') ends the line on Python 2 and 3 alike.
                        print('')
                    elif FLAGS.verbose:
                        print('')

                    _, summary = sess.run([train, merge], feed_dict={
                        x: batch[0], y: batch[1], keep_prob: 0.5
                    })
                    writer.add_summary(summary, global_step=step)

                elapsed_time = time.time() - start
                print('Total time: {0} [sec]'.format(elapsed_time))
                print('Done!')
            finally:
                writer.close()

        return True
def run():
    """Create a ``mnist.My_VAE_V2(10)``, train and test it, and return it."""
    model = mnist.My_VAE_V2(10)
    mnist.train(model, 10)
    # Per the original note, the test call outputs dd.png.
    mnist.test(model)
    return model