示例#1
0
def train():
    """Train the MNIST model from a TFRecord file inside a monitored session.

    Builds a shuffled, batched, endlessly repeating input pipeline, wires the
    model/loss/train ops from the ``mnist`` module and runs steps until the
    ``StopAtStepHook`` (``FLAGS.max_step``) asks the session to stop.
    Checkpoints are written to ``FLAGS.train_dir`` every 100 steps.
    """
    filenames = tf.placeholder(tf.string, [None])
    dataset = (
        tf.data.TFRecordDataset(filenames)
        .map(mnist.parse_data)
        .shuffle(buffer_size=50000)
        .batch(FLAGS.batch_size)
        .repeat()
    )

    iterator = dataset.make_initializable_iterator()

    global_step = tf.train.get_or_create_global_step()
    images, labels = iterator.get_next()
    logits, pred = mnist.inference(images, training=True)
    loss = mnist.loss(logits, labels)
    train_op = mnist.train(loss, global_step)

    hooks = [
        tf.train.StopAtStepHook(last_step=FLAGS.max_step),
        tf.train.NanTensorHook(loss),
    ]
    fetches = [train_op, loss, global_step, labels]

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=hooks,
        save_checkpoint_steps=100,
    ) as mon_sess:
        # The file list is fed only once, when the iterator is initialised.
        mon_sess.run(
            iterator.initializer,
            feed_dict={filenames: ['train_img.tfrecords']},
        )
        while not mon_sess.should_stop():
            _, train_loss, train_step, _ = mon_sess.run(fetches)
            if train_step % 100 != 0:
                continue
            print('step: {}, loss: {}'.format(train_step, train_loss))
示例#2
0
def go_svm(proc, pca_enabled, central):
    """Fit an SVM on the MNIST data and print its test accuracy.

    Args:
        proc: when truthy, run ``process`` over both image sets
            (the original note says "only 0 and 1").
        pca_enabled: when truthy, project the flattened images onto 50
            whitened PCA components fitted on the training set.
        central: when truthy, replace the images with the ``mnist.train_32`` /
            ``mnist.test_32`` variants (the original note says "32x32").
    """
    print("SVM")
    print("0-1", proc)
    print("Central", central)
    print("PCA", pca_enabled)

    train_x, train_y = mnist.train()
    test_x, test_y = mnist.test()

    if central:
        # Labels are kept; only the image arrays are swapped.
        train_x, test_x = mnist.train_32(), mnist.test_32()

    if proc:
        with Timer("process"):
            train_x = process(train_x)
            test_x = process(test_x)

    # Flatten every sample to a single feature vector.
    n_train = train_x.shape[0]
    n_test = test_x.shape[0]
    train_x = train_x.reshape((n_train, -1))
    test_x = test_x.reshape((n_test, -1))

    if pca_enabled:
        with Timer("PCA"):
            reducer = PCA(n_components=50, whiten=True)
            train_x = reducer.fit_transform(train_x)
            test_x = reducer.transform(test_x)

    with Timer("train"):
        classifier = svm.SVC(cache_size=7000)
        classifier.fit(train_x, train_y)
        accuracy = classifier.score(test_x, test_y)
        print("Accuracy:", accuracy)
示例#3
0
def main(num_epochs=NUM_EPOCHS):
    """Load the data, build the network and train it for ``num_epochs`` epochs.

    Per-epoch timing, losses and validation accuracy are printed as training
    progresses. A ``KeyboardInterrupt`` stops training early without raising.

    Args:
        num_epochs: number of epochs after which training stops.

    Returns:
        The output layer of the (partially or fully) trained network.
    """
    print("Loading data...")
    dataset = load_data()

    print("Building model and compiling functions...")
    output_layer = build_model(
        input_height=dataset["input_height"],
        input_width=dataset["input_width"],
        output_dim=dataset["output_dim"],
    )

    iter_funcs = create_iter_functions(dataset, output_layer, X_tensor_type=T.tensor4)

    print("Starting training...")
    now = time.time()
    try:
        for epoch in train(iter_funcs, dataset):
            print("Epoch {} of {} took {:.3f}s".format(epoch["number"], num_epochs, time.time() - now))
            now = time.time()
            print("  training loss:\t\t{:.6f}".format(epoch["train_loss"]))
            print("  validation loss:\t\t{:.6f}".format(epoch["valid_loss"]))
            # str.format does not treat '%' specially, so a single '%' is
            # correct here; '%%' would be printed literally as two characters.
            print("  validation accuracy:\t\t{:.2f} %".format(epoch["valid_accuracy"] * 100))

            if epoch["number"] >= num_epochs:
                break

    except KeyboardInterrupt:
        # Deliberate: Ctrl-C ends training but still returns the model.
        pass

    return output_layer
示例#4
0
def train(baseModel, output_model_path, epochs=1):
    """Run one local training round and report completion to the operator.

    Trains via ``mnist.train`` starting from the weights under
    ``/repos/<baseModel.path>/weights.tar``, writing the new weights to
    ``/repos/<output_model_path>/weights.tar``, then sends a
    ``LocalTrainFinish`` message over gRPC to ``OPERATOR_URI``.

    Args:
        baseModel: object whose ``path`` attribute names the base model
            directory below ``/repos``.
        output_model_path: directory (below ``/repos``) for the new weights.
        epochs: number of epochs passed through to ``mnist.train``.

    Returns:
        None. Training and RPC failures are logged rather than raised.
    """
    data = get_training_data()
    output = os.path.join("/repos", output_model_path, 'weights.tar')
    logging.info(f'input path: [{baseModel.path}]')
    logging.info(f'output path: [{output}]')
    logging.info(f'epochs: {epochs}')

    base_weight_path = os.path.join("/repos", baseModel.path, "weights.tar")
    try:
        metrics = mnist.train(data,
                              output,
                              epochs=epochs,
                              resume=base_weight_path)
    except Exception:
        # Without metrics there is nothing valid to report. Previously the
        # code fell through and crashed on the unbound ``metrics`` /
        # ``response`` names, so no finish message was sent in this case
        # either.
        # NOTE(review): consider reporting a non-zero ``error`` to the
        # operator here once the expected protocol is confirmed.
        logging.exception('training failed')
        return

    # Send finish message
    logging.info(f"GRPC_CLIENT_URI: {OPERATOR_URI}")
    try:
        channel = grpc.insecure_channel(OPERATOR_URI)
        stub = service_pb2_grpc.EdgeOperatorStub(channel)
        result = service_pb2.LocalTrainResult(error=0,
                                              datasetSize=2500,
                                              metrics=metrics)

        response = stub.LocalTrainFinish(result)
    except grpc.RpcError as rpc_error:
        logging.error("grpc error: {}".format(rpc_error))
    except Exception as err:
        logging.error('got error: {}'.format(err))
    else:
        # Only claim success (and touch ``response``) when the call returned.
        logging.debug(
            "sending grpc message succeeds, response: {}".format(response))
示例#5
0
def go(pca_enabled=False, centralize=False):
    """Fit an ``MLPClassifier`` on the MNIST data and print its test accuracy.

    Args:
        pca_enabled: when truthy, reduce the flattened images to 50 whitened
            PCA components fitted on the training set.
        centralize: when truthy, use the ``mnist.train_32`` /
            ``mnist.test_32`` image variants instead of the default images.
    """
    print("PCA:", pca_enabled)
    print("Centralize:", centralize)

    train_x, train_y = mnist.train()
    test_x, test_y = mnist.test()

    if centralize:
        # Only the images change; the labels loaded above are reused.
        train_x, test_x = mnist.train_32(), mnist.test_32()

    # One flat feature vector per sample.
    n_train = train_x.shape[0]
    n_test = test_x.shape[0]
    train_x = train_x.reshape((n_train, -1))
    test_x = test_x.reshape((n_test, -1))

    if pca_enabled:
        with Timer("PCA"):
            reducer = PCA(n_components=50, whiten=True)
            train_x = reducer.fit_transform(train_x)
            test_x = reducer.transform(test_x)

    with Timer("train"):
        # The preprocessed variants get a larger iteration budget.
        if centralize or pca_enabled:
            max_iter = 1000
        else:
            max_iter = 200
        classifier = MLPClassifier(max_iter=max_iter, verbose=True)
        classifier.fit(train_x, train_y)
        accuracy = classifier.score(test_x, test_y)
        print("Accuracy:", accuracy)
示例#6
0
def train():
    """Train the MNIST model with a queue-runner input pipeline.

    Runs ``FLAGS.max_step`` training steps in a plain ``tf.Session``. Every
    100 steps the current loss is printed and a checkpoint is saved as
    ``<FLAGS.train_dir>/model.ckpt-<step>``. The queue-runner threads are
    always stopped and joined, even if a training step raises.
    """
    images, labels = mnist.inputs(['train_img.tfrecords'], mnist.TRAIN_EXAMPLES_NUM,
                                  FLAGS.batch_size, shuffle=True)
    global_step = tf.train.get_or_create_global_step()

    logits, _ = mnist.inference(images, training=True)
    loss = mnist.loss(logits, labels)
    train_op = mnist.train(loss, global_step)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        init_op = tf.group(
            tf.local_variables_initializer(),
            tf.global_variables_initializer())
        sess.run(init_op)
        ckpt = os.path.join(FLAGS.train_dir, 'model.ckpt')

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess, coord=coord)

        try:
            for i in range(1, FLAGS.max_step + 1):
                # Only the loss is needed on the Python side.
                _, train_loss = sess.run([train_op, loss])
                if i % 100 == 0:
                    print('step: {}, loss: {}'.format(i, train_loss))
                    saver.save(sess, ckpt, global_step=i)
        finally:
            # Shut the input threads down before the session closes, whether
            # the loop finished normally or raised.
            coord.request_stop()
            coord.join(threads)
def main():
    """Train the MNIST model by feeding mini-batches and print progress.

    Runs ``NUM_STEPS`` training steps, printing the loss after each one and
    the accuracy on one validation batch after every tenth step.
    """
    # ``images`` and ``labels`` are used as feed targets below
    # (presumably placeholders returned by ``inputs`` - verify).
    images, labels = inputs()
    batch_shape = [
        mnist.BATCH_SIZE,
        mnist.IMAGE_HEIGHT,
        mnist.IMAGE_WIDTH,
        mnist.IMAGE_DEPTH,
    ]
    reshaped_images = tf.reshape(images, batch_shape)
    logits = mnist.inference(reshaped_images)
    loss = mnist.loss(logits, labels)
    accuracy = mnist.accuracy(logits, labels)
    train_op = mnist.train(loss)
    init = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(init)
        for step in range(1, NUM_STEPS + 1):
            batch_x, batch_y = mnist_data.train.next_batch(mnist.BATCH_SIZE)
            train_feed = {images: batch_x, labels: batch_y}
            _, loss_value = sess.run([train_op, loss], feed_dict=train_feed)
            print("step:" + str(step) + " loss: " + str(loss_value))
            if step % 10 != 0:
                continue
            validation_x, validation_y = mnist_data.validation.next_batch(
                mnist.BATCH_SIZE)
            validation_feed = {images: validation_x, labels: validation_y}
            accuracy_score = sess.run(accuracy, feed_dict=validation_feed)
            print("accuracy : " + str(accuracy_score))
示例#8
0
def main(num_epochs=NUM_EPOCHS):
    """Build the network and train it, printing statistics for every epoch.

    Args:
        num_epochs: number of epochs after which the training loop is left.
    """
    dataset = load_data()

    model_kwargs = {
        'input_height': dataset['input_height'],
        'input_width': dataset['input_width'],
        'output_dim': dataset['output_dim'],
    }
    output_layer = build_model(**model_kwargs)

    iter_funcs = create_iter_functions(dataset, output_layer, X_tensor_type=T.tensor4)

    print("Starting training...")

    for epoch in train(iter_funcs, dataset):
        epoch_number = epoch['number']
        accuracy_percent = epoch['valid_accuracy'] * 100
        print("Epoch %d of %d" % (epoch_number, num_epochs))
        print("  training loss:\t\t%.6f" % epoch['train_loss'])
        print("  validation loss:\t\t%.6f" % epoch['valid_loss'])
        print("  validation accuracy:\t\t%.2f %%" % (accuracy_percent))

        # ``train`` is only left from here; stop after the requested epoch.
        if epoch_number >= num_epochs:
            break
示例#9
0
def go(proc, central):
    """Fit a 5-nearest-neighbour classifier on MNIST and print its accuracy.

    Args:
        proc: when truthy, run ``process`` over both image sets first.
        central: when truthy, use the ``mnist.train_32`` / ``mnist.test_32``
            image variants instead of the default images.
    """
    print("kNN")
    print("0-1", proc)
    print("central", central)

    train_x, train_y = mnist.train()
    test_x, test_y = mnist.test()

    if central:
        # Labels from the default loaders are kept.
        train_x, test_x = mnist.train_32(), mnist.test_32()

    if proc:
        with Timer("process"):
            train_x = process(train_x)
            test_x = process(test_x)

    # Flatten each image into one feature vector.
    n_train = train_x.shape[0]
    n_test = test_x.shape[0]
    train_x = train_x.reshape((n_train, -1))
    test_x = test_x.reshape((n_test, -1))

    with Timer("kNN fit"):
        classifier = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
        classifier.fit(train_x, train_y)

    with Timer("kNN test"):
        accuracy = classifier.score(test_x, test_y)
        print("Accuracy:", accuracy)
def main():
    """Train the MNIST model from queued inputs and log scalar summaries.

    Training and validation batches come from ``mnist_inputs``; both share
    one set of model variables through the ``"inference"`` variable scope.
    Runs ``NUM_STEPS`` steps, writing the merged summaries and printing the
    loss after each step.
    """
    fraction = 0.4
    min_after_dequeue = int(mnist.NUM_EXAMPLES_PER_EPOCH * fraction)
    images, labels = mnist_inputs(min_after_dequeue)
    validation_images, validation_labels = mnist_inputs(
        2000, train=False, num_epochs=None)

    with tf.variable_scope("inference") as scope:
        logits = mnist.inference(images)
        # Second tower reuses the variables created by the first call.
        scope.reuse_variables()
        validation_logits = mnist.inference(validation_images)

    loss = mnist.loss(logits, labels)
    tf.scalar_summary("cross_entropy", loss)
    accuracy = mnist.accuracy(validation_logits, validation_labels)
    tf.scalar_summary("validation_accuracy", accuracy)
    train_op = mnist.train(loss)

    sess = tf.Session()
    for init_op in (tf.initialize_local_variables(),
                    tf.initialize_all_variables()):
        sess.run(init_op)
    tf.train.start_queue_runners(sess=sess)

    merge = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter(
        "/home/windows98/PycharmProjects/mnist/Summary/")

    fetches = [train_op, loss, merge]
    for step in range(1, NUM_STEPS + 1):
        _, loss_value, summary = sess.run(fetches)
        writer.add_summary(summary, step)
        print("step:" + str(step) + " loss: " + str(loss_value))
示例#11
0
def main(num_epochs=NUM_EPOCHS):
    """Build the network and train it, printing statistics for every epoch.

    Args:
        num_epochs: number of epochs after which training stops.

    Returns:
        The output layer of the trained network.
    """
    dataset = load_data()

    output_layer = build_model(
        input_width=dataset['input_width'],
        # Previously this passed dataset['input_width'], which silently
        # builds the wrong network for non-square inputs.
        # NOTE(review): assumes load_data() provides 'input_height', as the
        # sibling examples in this file do - confirm.
        input_height=dataset['input_height'],
        output_dim=dataset['output_dim'],
        )

    iter_funcs = create_iter_functions(
        dataset,
        output_layer,
        X_tensor_type=T.tensor4,
        )

    print("Starting training...")
    for epoch in train(iter_funcs, dataset):
        print("Epoch %d of %d" % (epoch['number'], num_epochs))
        print("  training loss:\t\t%.6f" % epoch['train_loss'])
        print("  validation loss:\t\t%.6f" % epoch['valid_loss'])
        print("  validation accuracy:\t\t%.2f %%" %
              (epoch['valid_accuracy'] * 100))

        if epoch['number'] >= num_epochs:
            break

    return output_layer
示例#12
0
def main(argv):
    """Train and evaluate the CNN estimator on the MNIST arrays.

    Args:
        argv: full argument vector; everything after the program name is
            handed to ``parser``. With ``args.central`` set, the
            ``mnist.train_32`` / ``mnist.test_32`` images are used and the
            model is told the width is 32 instead of 45.
    """
    args = parser.parse_args(argv[1:])

    train_x, train_y = mnist.train()
    test_x, test_y = mnist.test()

    width = 45
    if args.central:
        print("Use centralize now")
        width = 32
        train_x, test_x = mnist.train_32(), mnist.test_32()

    # Scale pixel values and give the labels the dtype used below.
    train_x, test_x = train_x / 256, test_x / 256
    train_y = train_y.astype(np.int32)
    test_y = test_y.astype(np.int32)

    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        params={"width": width},
    )

    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_x},
        y=train_y,
        batch_size=100,
        num_epochs=None,
        shuffle=True,
    )
    mnist_classifier.train(input_fn=train_input_fn, steps=10000)

    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": test_x},
        y=test_y,
        num_epochs=1,
        shuffle=False,
    )
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
示例#13
0
def train():
    """Train the model from the ``mnist`` module for a number of steps.

    Runs inside a ``MonitoredTrainingSession`` that checkpoints to
    ``FLAGS.train_dir``, stops at ``FLAGS.max_steps``, aborts on a NaN loss
    and logs loss and throughput every ``FLAGS.log_frequency`` steps.
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Pin the input pipeline to the CPU so its ops do not end up on the
        # GPU and slow training down.
        with tf.device('/cpu:0'):
            images, labels = mnist.distorted_inputs()

        # Forward pass, loss, and the op that applies one optimisation step.
        logits = mnist.inference(images)
        loss = mnist.loss(logits, labels)
        train_op = mnist.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Prints the loss and timing statistics at a fixed step interval."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Request the loss so after_run can report it.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return

                now = time.time()
                duration = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                sec_per_batch = float(duration / FLAGS.log_frequency)

                format_str = ('%s: step %d, loss = %.2f '
                              '(%.1f examples/sec; %.3f sec/batch)')
                report = (datetime.now(), self._step, loss_value,
                          examples_per_sec, sec_per_batch)
                print(format_str % report)

        hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
示例#14
0
def train():
    """Run monitored training of the ``mnist`` model until the step limit.

    The session checkpoints to ``FLAGS.train_dir``, stops at
    ``FLAGS.max_steps``, aborts if the loss becomes NaN, and prints loss and
    speed figures every ``FLAGS.log_frequency`` steps.
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Input ops are placed on the CPU explicitly.
        with tf.device('/cpu:0'):
            images, labels = mnist.distorted_inputs()
        print(global_step)

        logits = mnist.inference(images)
        loss = mnist.loss(logits, labels)
        train_op = mnist.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Session hook that periodically reports loss and throughput."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # The fetched loss is delivered to after_run.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                log_every = FLAGS.log_frequency
                if self._step % log_every == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = log_every * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / log_every)

                    print(
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                        % (datetime.now(), self._step, loss_value,
                           examples_per_sec, sec_per_batch))

        stop_hook = tf.train.StopAtStepHook(last_step=FLAGS.max_steps)
        nan_hook = tf.train.NanTensorHook(loss)
        logger_hook = _LoggerHook()
        config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[stop_hook, nan_hook, logger_hook],
                config=config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
示例#15
0
def train_and_validation():
    """Train the MNIST model with periodic validation and a final test pass.

    One reinitialisable iterator is shared by the training, validation and
    test datasets. Every 100 steps a checkpoint is saved and validation
    precision is printed; every 1000 steps training precision is printed as
    well. After ``FLAGS.max_step`` steps the test precision is printed.
    """
    training_dataset = tf.data.TFRecordDataset(['./train_img.tfrecords'])
    validation_dataset = tf.data.TFRecordDataset(['./validation_img.tfrecords'])
    test_dataset = tf.data.TFRecordDataset(['./test_img.tfrecords'])

    batch_size = FLAGS.batch_size
    training_dataset = training_dataset.map(mnist.parse_data)
    training_dataset = training_dataset.shuffle(50000)
    training_dataset = training_dataset.batch(batch_size)
    training_dataset = training_dataset.repeat()
    validation_dataset = validation_dataset.map(mnist.parse_data).batch(batch_size)
    test_dataset = test_dataset.map(mnist.parse_data).batch(batch_size)

    # A structure-only iterator that can be pointed at any of the datasets.
    iterator = tf.data.Iterator.from_structure(
        output_types=training_dataset.output_types,
        output_shapes=training_dataset.output_shapes)

    training_init_op = iterator.make_initializer(training_dataset)
    validation_init_op = iterator.make_initializer(validation_dataset)
    test_init_op = iterator.make_initializer(test_dataset)
    images, labels = iterator.get_next()

    training = tf.placeholder(dtype=tf.bool)
    logits, pred = mnist.inference(images, training=training)
    loss = mnist.loss(logits, labels)
    top_k_op = tf.nn.in_top_k(logits, labels, 1)
    global_step = tf.train.get_or_create_global_step()
    train_op = mnist.train(loss, global_step)
    saver = tf.train.Saver()

    train_fetches = [train_op, loss, global_step, labels]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(training_init_op)
        print('begin to train!')
        ckpt = os.path.join(FLAGS.train_dir, 'model.ckpt')
        for train_step in range(1, FLAGS.max_step + 1):
            _, train_loss, _, _ = sess.run(train_fetches, feed_dict={training: True})
            if train_step % 100 != 0:
                continue

            saver.save(sess, ckpt, train_step)
            if train_step % 1000 == 0:
                # Evaluated while the iterator still points at training data.
                precision = evaluate(sess, top_k_op, training, mnist.TRAIN_EXAMPLES_NUM)
                print('step: {}, loss: {}, training precision: {}'.format(train_step, train_loss, precision))

            sess.run(validation_init_op)
            precision = evaluate(sess, top_k_op, training, mnist.VALIDATION_EXAMPLES_NUM)
            print('step: {}, loss: {}, validation precision: {}'.format(train_step, train_loss, precision))
            # Switch back to the training data before the next step.
            sess.run(training_init_op)

        sess.run(test_init_op)
        precision = evaluate(sess, top_k_op, training, mnist.TEST_EXAMPLES_NUM)
        print('finally test precision: {}'.format(precision))
def main(num_epochs=NUM_EPOCHS):
    """Train the network, snapshotting its weights every 100 epochs.

    Per-epoch timing, losses and validation accuracy are printed. Every
    100th epoch all parameter values are pickled to
    ``weights_epoch<N>.pkl`` in the working directory. A
    ``KeyboardInterrupt`` stops training early without raising.

    Args:
        num_epochs: number of epochs after which training stops.

    Returns:
        The output layer of the (partially or fully) trained network.
    """
    print("Loading data...")
    dataset = load_data()

    print("Building model and compiling functions...")
    output_layer = build_model(
        input_height=dataset['input_height'],
        input_width=dataset['input_width'],
        output_dim=dataset['output_dim'],
    )

    iter_funcs = create_iter_functions(
        dataset,
        output_layer,
        X_tensor_type=T.tensor4,
    )

    print("Starting training...")
    now = time.time()
    try:
        for epoch in train(iter_funcs, dataset):
            print("Epoch {} of {} took {:.3f}s".format(
                epoch['number'], num_epochs, time.time() - now))
            now = time.time()
            print("  training loss:\t\t{:.6f}".format(epoch['train_loss']))
            print("  validation loss:\t\t{:.6f}".format(epoch['valid_loss']))
            # str.format does not treat '%' specially; '%%' would print twice.
            print("  validation accuracy:\t\t{:.2f} %".format(
                epoch['valid_accuracy'] * 100))

            # Value comparison: ``is 0`` tests object identity and only works
            # by accident of CPython's small-integer caching.
            if epoch['number'] % 100 == 0:
                weights_save = lasagne.layers.get_all_param_values(output_layer)
                # ``with`` guarantees the snapshot is flushed and closed.
                with open("weights_epoch%d.pkl" % epoch['number'], "wb") as weights_file:
                    pickle.dump(weights_save, weights_file)
                # To restore a snapshot, unpickle it and pass the result to
                # lasagne.layers.set_all_param_values(output_layer, ...).

            if epoch['number'] >= num_epochs:
                break

    except KeyboardInterrupt:
        # Deliberate: Ctrl-C ends training but still returns the model.
        pass

    return output_layer
示例#17
0
def main():
    """Load phrase data, build the model and train it, printing epoch stats.

    Uses a batch size of 100, a maximum phrase length of 108 and 10000 data
    points. A ``KeyboardInterrupt`` stops training early without raising.

    Returns:
        The output layer of the (partially or fully) trained network.
    """
    BATCH_SIZE = 100
    MAX_PHRASE_LENGTH = 108
    NUM_DATAPOINTS = 10000

    print("Loading data...")
    dataset = load_data(n=NUM_DATAPOINTS, max_phrase_length=MAX_PHRASE_LENGTH)

    print("Building model and compiling functions...")
    output_layer = build_model(
        batch_size=BATCH_SIZE,
        max_phrase_length=MAX_PHRASE_LENGTH,
        output_dim=dataset['output_dim'],
    )

    iter_funcs = create_iter_functions(
        dataset,
        output_layer,
        X_tensor_type=T.tensor3,
        batch_size=BATCH_SIZE,
        learning_rate=0.001,
        momentum=0.9,
        )

    print("Starting training...")
    now = time.time()
    try:
        for epoch in train(iter_funcs, dataset, BATCH_SIZE):
            # NOTE(review): ``num_epochs`` is not bound in this function; it
            # must exist at module level or this raises NameError - confirm.
            print("Epoch {} of {} took {:.3f}s".format(
                epoch['number'], num_epochs, time.time() - now))
            now = time.time()
            print("  training loss:\t\t{:.6f}".format(epoch['train_loss']))
            print("  validation loss:\t\t{:.6f}".format(epoch['valid_loss']))
            # str.format does not treat '%' specially; '%%' would print twice.
            print("  validation accuracy:\t\t{:.2f} %".format(
                epoch['valid_accuracy'] * 100))

            if epoch['number'] >= num_epochs:
                break

    except KeyboardInterrupt:
        # Deliberate: Ctrl-C ends training but still returns the model.
        pass

    return output_layer
示例#18
0
def main(num_epochs=NUM_EPOCHS):
    """Load the data, build the network and train it for ``num_epochs`` epochs.

    Prints timing, training loss, validation loss and validation accuracy
    for every epoch. A ``KeyboardInterrupt`` stops training early without
    raising.

    Args:
        num_epochs: number of epochs after which training stops.

    Returns:
        The output layer of the (partially or fully) trained network.
    """
    print("Loading data...")
    dataset = load_data()

    print("Building model and compiling functions...")
    output_layer = build_model(
        input_height=dataset['input_height'],
        input_width=dataset['input_width'],
        output_dim=dataset['output_dim'],
    )

    iter_funcs = create_iter_functions(
        dataset,
        output_layer,
        X_tensor_type=T.tensor4,
    )

    print("Starting training...")
    now = time.time()
    try:
        for epoch in train(iter_funcs, dataset):
            print("Epoch {} of {} took {:.3f}s".format(epoch['number'],
                                                       num_epochs,
                                                       time.time() - now))
            now = time.time()
            print("  training loss:\t\t{:.6f}".format(epoch['train_loss']))
            print("  validation loss:\t\t{:.6f}".format(epoch['valid_loss']))
            # With str.format a literal percent sign needs no escaping; the
            # former '%%' was printed as two characters.
            print("  validation accuracy:\t\t{:.2f} %".format(
                epoch['valid_accuracy'] * 100))

            if epoch['number'] >= num_epochs:
                break

    except KeyboardInterrupt:
        # Deliberate: Ctrl-C ends training but still returns the model.
        pass

    return output_layer
示例#19
0
        teacher_out = teacher.forward_pass(x)
        student.train(x, teacher_out)


if __name__ == "__main__":
    # Train a small teacher network on the MNIST CSV data, report its test
    # accuracy, then train a single-layer student on the teacher's outputs
    # and report the student's test accuracy.
    teacher_layers = [
        Layer(784, 16, LeakyReLU()),
        Layer(16, 16, LeakyReLU()),
        Layer(16, 10, LeakyReLU()),
    ]
    teacher_net = NeuralNetwork(teacher_layers, CrossEntropyLoss(), 0.001)

    train_data = load_data("mnistdata/mnist_train.csv",
                           delimiter=",",
                           dtype=int)
    train(teacher_net, train_data)

    test_data = load_data("mnistdata/mnist_test.csv", delimiter=",", dtype=int)
    accuracy = test(teacher_net, test_data)
    print(f"Accuracy of the teacher net is {100*accuracy:.2f}")

    student_layers = [
        Layer(784, 10, Sigmoid()),
    ]
    student_net = NeuralNetwork(student_layers, MSELoss(), 0.005)

    train_student(student_net, teacher_net, train_data)

    student_accuracy = test(student_net, test_data)
    # Report the student's own score; this line used to print the teacher's
    # ``accuracy`` again.
    print(f"Accuracy of the student net is {100*student_accuracy:.2f}")
    test_losses.append(test_loss)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, 10000, 100. * correct / 10000))
    return confusion


# Script section: evaluate once, train in two phases with a decreasing
# learning rate, then save the model weights and the confusion matrix.
network = network.float()

test()
optim_params = {}
optim_params['lr'] = 0.0005
optim_params['momentum'] = .9
# NOTE(review): nothing in this section writes into ``confusion`` after it
# is created here; confirm it is filled elsewhere before it is printed.
confusion = np.zeros((10, 10), dtype='i4')
mnist.train(network,
            n_epochs=5,
            log_interval=100,
            optim_params=optim_params,
            batch_size=100)
# Second phase: same settings with a smaller learning rate.
optim_params['lr'] = 0.0001
mnist.train(network,
            n_epochs=5,
            log_interval=100,
            optim_params=optim_params,
            batch_size=100)
print(mnist.get_acc(network))
torch.save(network.state_dict(), 'model.pth')
print(confusion)

# ``with`` closes (and therefore flushes) the file; it used to be left open.
with open('Data/CNN Data.txt', 'w') as newFile:
    newFile.write(str(confusion))
示例#21
0
import os

import horovod.tensorflow as hvd
import mnist

if __name__ == "__main__":
    # Entry point: initialise horovod, fill in the training context and
    # start GAN training through the ``mnist`` module.
    hvd.init()

    # Only use the GPU that horovod gives us.
    # NOTE(review): ``config`` is not passed to anything in this block -
    # confirm it is consumed elsewhere.
    config = tf.ConfigProto()
    local_gpu = str(hvd.local_rank())
    config.gpu_options.visible_device_list = local_gpu

    # Synchronize initial values across workers.
    broadcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    broadcast_hook.on_train_begin()

    ctx = mnist.Context()
    ctx.random_dim = 100
    ctx.filters = 64
    ctx.batch_size = 128
    ctx.epochs = 51

    # The learning rate is scaled by the number of horovod workers.
    base_optimizer = tf.keras.optimizers.Adam(
        lr=0.0002 * hvd.size(), beta_1=0.5)
    ctx.opt = hvd.DistributedOptimizer(base_optimizer)

    mnist.load_data(ctx)
    ctx.generator = mnist.greate_dc_generator(ctx)
    ctx.discriminator = mnist.create_discriminator(ctx)
    ctx.gan = mnist.create_GAN(ctx)

    mnist.train(ctx)
示例#22
0
def train():
    """Train the MNIST network inside a ``MonitoredTrainingSession``.

    The session checkpoints to ``FLAGS.train_dir`` every 60 seconds, stops at
    ``FLAGS.max_steps``, aborts on a NaN loss and logs loss, accuracy and
    throughput every ``FLAGS.log_frequency`` steps.

    Variable initialisation (or restoration from the latest checkpoint) is
    left entirely to the monitored session. An explicit initializer built
    before the model exists would cover only ``global_step`` and, when run
    after a restore, reset the step counter to zero.
    """
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Fetch one (image, label) batch pair.
    image_batch, label_batch = mnist.inputs('train')

    # sparse_softmax_cross_entropy_with_logits requires
    # rank_of_labels = rank_of_images - 1, so flatten the label batch.
    # NOTE(review): the batch size 50 is hard-coded here - confirm it matches
    # what mnist.inputs produces.
    label_batch = tf.reshape(label_batch, [50])

    # Add a depth axis: [batch, row, col] -> [batch, row, col, depth=1].
    expand_image_batch = tf.expand_dims(image_batch, -1)

    # Labels stay as class indices; the sparse loss needs no one-hot encoding.

    # Build the MNIST model and compute the logits for each batch.
    logits = mnist.inference(expand_image_batch, dropout=0.5)

    loss = mnist.loss(logits=logits, labels=label_batch)

    accuracy = mnist.train_accuracy(logits, label_batch)

    train_op = mnist.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
        """Logs loss, accuracy and runtime information."""
        def begin(self):
            self._step = -1
            self._start_time = time.time()

        def before_run(self, run_context):
            self._step += 1
            # Request the target tensors; their values arrive in after_run.
            return tf.train.SessionRunArgs([loss, accuracy])

        def after_run(self, run_context, run_values):
            if self._step % FLAGS.log_frequency == 0:
                _current_time = time.time()
                duration = _current_time - self._start_time

                self._start_time = _current_time

                # Loss and accuracy values requested in before_run.
                loss_value, accuracy_value = run_values.results
                # Examples per second, and seconds per batch.
                examples_per_sec = FLAGS.batch_size * FLAGS.log_frequency / duration
                sec_per_batch = float(duration / FLAGS.log_frequency)

                # Print the training status to the console:
                # time: step, loss, accuracy (examples/sec, sec/batch).
                format_str = (
                    '%s: step %d, loss=%.2f, accuracy=%.2f(%.1f examples/sec, %.3f sec/batch)'
                )
                print(format_str %
                      (datetime.now(), self._step, loss_value, accuracy_value,
                       examples_per_sec, sec_per_batch))

    # The step limit and logging interval come from FLAGS.max_steps and
    # FLAGS.log_frequency; checkpoints are saved every 60 seconds.
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.train_dir,
            hooks=[
                tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                tf.train.NanTensorHook(loss),
                _LoggerHook()
            ],
            save_checkpoint_secs=60,
            config=tf.ConfigProto(log_device_placement=False,
                                  allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(train_op)
示例#23
0
def train(FLAGS, mnist_data, handler=None):
    """Train the MNIST model, reporting progress to an optional handler.

    Runs up to 20000 steps on batches of 50 examples. At step-dependent
    intervals the model is evaluated on the test set, a ``'test'`` event is
    fired and, less often, a checkpoint is saved and a ``'checkpoint'`` event
    is fired. Any existing checkpoint directory is deleted first.

    Args:
        FLAGS: object providing ``ckpt_dir`` (checkpoint/summary directory)
            and ``verbose`` (print a per-example hit/miss marker each step).
        mnist_data: dataset object providing ``train.next_batch`` and
            ``test.images`` / ``test.labels``.
        handler: optional object with ``fire(typ, data)`` and
            ``listen(callback)``. A ``'close'`` message or a message whose
            ``'event'`` is ``'cancel'`` cancels training; so does a closed
            websocket while firing an event.

    Returns:
        True, whether training ran to completion or was cancelled.
    """
    class _Session:
        # Mutable flag shared with the nested callbacks below.
        cancel = False

    session = _Session()
    if handler:

        def fire(typ, data=None):
            try:
                handler.fire(typ, data)
            except WebSocketClosedError:
                print('WebSocket closed error.')
                session.cancel = True

        def cancel():
            session.cancel = True
            fire('cancel')

        def listener(message):
            if message == 'close' or message['event'] == 'cancel':
                cancel()

        handler.listen(listener)
    else:

        # Same signature as the real ``fire`` so both are interchangeable.
        def fire(typ, data=None):
            pass

    with tf.Graph().as_default():
        # Placeholders
        x = tf.placeholder(tf.float32, shape=[None, 784])
        y = tf.placeholder(tf.float32, shape=[None, 10])
        keep_prob = tf.placeholder(tf.float32)

        inference = mnist.inference(x, keep_prob)
        # Named ``train_op`` so it does not shadow this function's own name.
        train_op = mnist.train(inference, y)
        accuracy = mnist.accuracy(inference, y)
        merge = tf.summary.merge_all()

        saver = tf.train.Saver(max_to_keep=200)

        print('Checkpoints directory: %s' % FLAGS.ckpt_dir)
        if tf.gfile.Exists(FLAGS.ckpt_dir):
            print('Cleaning checkpoints...')
            tf.gfile.DeleteRecursively(FLAGS.ckpt_dir)
        tf.gfile.MakeDirs(FLAGS.ckpt_dir)

        # Create Session
        sess = tf.Session()
        writer = None
        try:
            sess.run(tf.global_variables_initializer())

            writer = tf.summary.FileWriter(FLAGS.ckpt_dir, sess.graph)

            # Training and evaluating
            print('Start training.')
            start = time.time()
            for step in range(1, 20000 + 1):
                if session.cancel:
                    print('Interrupting training...')
                    break
                batch = mnist_data.train.next_batch(50)
                if FLAGS.verbose:
                    result = sess.run(inference,
                                      feed_dict={
                                          x: batch[0],
                                          y: batch[1],
                                          keep_prob: 1.0
                                      })
                    # One coloured marker per example: blue when the
                    # prediction matches the label, red otherwise.
                    for label_row, predicted_row in zip(batch[1], result):
                        if np.argmax(label_row) == np.argmax(predicted_row):
                            sys.stdout.write("\033[94mx\033[0m")
                        else:
                            sys.stdout.write("\033[91mx\033[0m")
                    sys.stdout.write("... ")
                if ((step <= 300 and step % 10 == 0)
                        or (300 < step and step <= 1000 and step % 100 == 0)
                        or step % 200 == 0):
                    test_inference, test_accuracy = sess.run(
                        [inference, accuracy],
                        feed_dict={
                            x: mnist_data.test.images,
                            y: mnist_data.test.labels,
                            keep_prob: 1.0
                        })
                    fire(
                        'test', {
                            'step': step,
                            'inference': test_inference.argmax(axis=1).tolist(),
                            'accuracy': str(test_accuracy)
                        })
                    sys.stdout.write("===> Step %5d, Test Accuracy: %1.02f" %
                                     (step, test_accuracy))
                    if ((step <= 100 and step % 10 == 0)
                            or (100 < step and step <= 1000 and step % 100 == 0)
                            or step % 1000 == 0):
                        ckpt = saver.save(sess,
                                          os.path.join(FLAGS.ckpt_dir, 'ckpt'),
                                          global_step=step)
                        sys.stdout.write(" @%s" % ckpt)
                        fire('checkpoint', {'step': step})
                    # Function-call form ends the progress line on both
                    # Python 2 and Python 3.
                    print('')
                elif FLAGS.verbose:
                    print('')
                _, summary = sess.run([train_op, merge],
                                      feed_dict={
                                          x: batch[0],
                                          y: batch[1],
                                          keep_prob: 0.5
                                      })
                writer.add_summary(summary, global_step=step)
            elapsed_time = time.time() - start
        finally:
            # Flush pending summaries and release the session's resources
            # even if a step raised.
            if writer is not None:
                writer.close()
            sess.close()
        print('Total time: {0} [sec]'.format(elapsed_time))
        print('Done!')
        return True
示例#24
0
def run():
    """Create a ``My_VAE_V2`` model, train it, test it and return it.

    Both the constructor and ``mnist.train`` receive the value 10. Per the
    original note, the test call writes ``dd.png``.
    """
    model = mnist.My_VAE_V2(10)
    mnist.train(model, 10)
    mnist.test(model)
    return model