Example #1
def CIFAR10_must_converge(name,
                          model_class,
                          optimizer_class,
                          epochs=32,
                          batch_size=500,
                          initial_learning_rate=0.01,
                          summaries=False,
                          use_debug_session=False):
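    # Build the training graph for 32x32 RGB inputs (CIFAR-10) with 10 classes.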
    train_strategy = generate_train_graph(model_class, optimizer_class, 32, 32,
                                          3, 10)

    if summaries:
        filepath = "/tmp/%s/cifar10/train" % name
        train_writer = tf.summary.FileWriter(filepath)
        print("summaries: ", filepath)

    def train(epoch, dataset, batch_size, total, sess, summaries=False):
        average_loss = []
        average_error = []
        eye = np.eye(10)
        total_examples = 0

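        # Step decay: halve the learning rate every `epochs_drop` epochs,
        # e.g. 0.01 for epochs 0-8, 0.005 for epochs 9-18, 0.0025 afterwards
        # (the 1 + epoch shift makes each drop land one epoch early).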
        def step_decay(epoch):
            initial_lrate = initial_learning_rate
            drop = 0.5
            epochs_drop = 10.0
            lrate = initial_lrate * math.pow(
                drop, math.floor((1 + epoch) / epochs_drop))
            return lrate

        learning_rate = step_decay(epoch)

        while total_examples <= total:
            x_batch, label_batch = dataset.next_batch(batch_size)
            total_examples += len(x_batch)

            # one hot encode
            y_batch = eye[label_batch]
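            # Feeding the `global_is_training` placeholder by tensor name
            # switches ops such as dropout/batch norm into training mode.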
            feed_dict = {
                train_strategy.inputs: x_batch,
                train_strategy.labels: y_batch,
                train_strategy.learning_rate: learning_rate,
                train_strategy.batch_size: batch_size,
                "global_is_training:0": True,
            }

            feed_dict.update(train_strategy.train_parameters)

            fetches = [
                train_strategy.optimize,
                train_strategy.loss,
                train_strategy.global_step,
                train_strategy.predictions,
                train_strategy.categorical_error,
            ]

            _, loss, global_step, predictions, error = sess.run(
                fetches, feed_dict=feed_dict)

            average_loss.append(loss)
            average_error.append(error)

        if summaries:
            fetches = train_strategy.summary_op
            summary = sess.run(fetches, feed_dict=feed_dict)
            train_writer.add_summary(summary)

        return global_step, np.mean(
            average_loss), np.mean(average_error) * 100.

    def test(epoch, dataset, batch_size, total, sess, summaries=False):
        total_examples = 0
        average_error = []
        eye = np.eye(10)
        while total_examples <= total:
            x_batch, label_batch = dataset.next_batch(batch_size)
            total_examples += len(x_batch)
            feed_dict = {
                train_strategy.inputs: x_batch,
                train_strategy.labels: eye[label_batch],
                train_strategy.batch_size: batch_size,
                "global_is_training:0": False,
            }

            fetches = [
                train_strategy.predictions,
                train_strategy.categorical_error,
            ]

            predictions, error = sess.run(fetches, feed_dict=feed_dict)

            average_error.append(error)

        # Add summaries
        if summaries:
            fetches = train_strategy.summary_op
            summary = sess.run(fetches, feed_dict=feed_dict)
            train_writer.add_summary(summary)

        return np.mean(average_error) * 100.0

    with tf.Session(graph=train_strategy.graph) as sess:
        tf.set_random_seed(12345678)
        sess.run(tf.get_collection('init')[0])

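        # Optionally wrap the session in the TFDBG interactive CLI debugger
        # and flag any tensor containing inf or NaN values.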
        if use_debug_session:
            from tensorflow.python import debug as tf_debug
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
            sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

        dataset = cifar.read_data_sets('/tmp/deepwater/cifar10/',
                                       validation_size=0)

        print("computing initial test error ...")
        # test_error = test(0, dataset.test, batch_size,
        #                   dataset.test.num_examples, sess, summaries=summaries)
        #
        # print('initial test error:', test_error)

        for epoch in range(epochs):
            global_step, train_loss, train_error = train(
                epoch, dataset.train, batch_size, dataset.train.num_examples,
                sess, summaries=summaries)
            test_error = test(epoch,
                              dataset.test,
                              batch_size,
                              dataset.test.num_examples,
                              sess,
                              summaries=summaries)

            print('epoch:', "%d/%d" % (epoch, epochs), 'step', global_step,
                  'train loss:', train_loss, '% train error:', train_error,
                  '% test error:', test_error)
        if summaries:
            train_writer.close()
Example #2
def MNIST_must_converge(name,
                        model_class,
                        optimizer_class,
                        epochs=50,
                        batch_size=32,
                        initial_learning_rate=0.001,
                        summaries=False,
                        use_debug_session=False):
    def train(epoch, dataset, batch_size, total, sess):
        average_loss = []
        average_error = []
        eye = np.eye(10)
        total_examples = 0
        # Initialize global_step up front: the early return below can fire
        # before the first sess.run assigns it.
        global_step = 0

        def step_decay(epoch):
            initial_lrate = initial_learning_rate
            drop = 0.5
            epochs_drop = 10.0
            lrate = initial_lrate * math.pow(
                drop, math.floor((1 + epoch) / epochs_drop))
            return lrate

        learning_rate = initial_learning_rate  # step_decay(epoch) is disabled here

        while total_examples <= total:
            x_batch, label_batch = dataset.next_batch(batch_size)
            total_examples += len(x_batch)
            # one hot encode
            y_batch = eye[label_batch]
            feed_dict = {
                train_strategy.inputs: x_batch,
                train_strategy.labels: y_batch,
                train_strategy.learning_rate: learning_rate,
                train_strategy.batch_size: batch_size,
                "global_is_training:0": True
            }

            feed_dict.update(train_strategy.train_parameters)

            fetches = [
                train_strategy.optimize,
                train_strategy.loss,
                train_strategy.global_step,
                train_strategy.predictions,
                train_strategy.categorical_error,
            ]

            if sess.should_stop():
                return global_step, np.mean(
                    average_loss), np.mean(average_error) * 100.

            _, loss, global_step, predictions, error = sess.run(
                fetches, feed_dict=feed_dict)

            average_loss.append(loss)
            average_error.append(error)

            if summaries and total_examples % 10 == 0:
                fetches = train_strategy.summary_op
                summary = sess.run(fetches, feed_dict=feed_dict)
                train_writer.add_summary(summary)
                train_writer.flush()
                print("writing summaries")

        return global_step, np.mean(average_loss), np.mean(average_error) * 100.

    def test(dataset, batch_size, total, sess):
        total_examples = 0
        average_error = []
        eye = np.eye(10)

        while total_examples <= total:
            x_batch, label_batch = dataset.next_batch(batch_size)
            total_examples += len(x_batch)

            feed_dict = {
                train_strategy.inputs: x_batch,
                train_strategy.labels: eye[label_batch],
                train_strategy.batch_size: batch_size,
                "global_is_training:0": False,
            }

            fetches = [
                train_strategy.predictions,
                train_strategy.categorical_error,
            ]

            predictions, error = sess.run(fetches, feed_dict=feed_dict)

            average_error.append(error)

        err = np.mean(average_error) * 100.0
        print("test err: %f" % err)

        return err

    # Hook that runs a final test-set evaluation in end(), just before the
    # monitored session closes. Note that last_step is measured in global
    # steps (one per batch), so the example count passed below never triggers
    # an early stop; the epoch loop is the effective bound.
    class TestAtEnd(tf.train.StopAtStepHook):
        def __init__(self, last_step):
            tf.train.StopAtStepHook.__init__(self, last_step=last_step)

        def end(self, session):
            print("computing final test error")
            test(dataset.test, batch_size, dataset.test.num_examples, session)

    print("Testing %s" % name)

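    # Build the training graph for 28x28 grayscale inputs (MNIST) with 10
    # classes.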
    train_strategy = generate_train_graph(model_class,
                                          optimizer_class,
                                          28,
                                          28,
                                          1,
                                          10,
                                          add_summaries=summaries)

    timestamp = datetime.now().strftime("%y%m%d%H%M%S")
    train_writer = tf.summary.FileWriter("/tmp/%s/train/%s" %
                                         (name, timestamp))

    class _LoggerHook(tf.train.SessionRunHook):
        """Logs loss and runtime."""
        def begin(self):
            self._step = -1

        def before_run(self, run_context):
            self._step += 1
            self._start_time = time.time()
            return tf.train.SessionRunArgs(
                train_strategy.loss)  # Asks for loss value.

        def after_run(self, run_context, run_values):
            duration = time.time() - self._start_time
            loss_value = run_values.results
            if self._step % 10 == 0:
                num_examples_per_step = batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), self._step, loss_value,
                                    examples_per_sec, sec_per_batch))

    with train_strategy.graph.as_default():
        dataset = read_data_sets('/tmp/deepwater/datasets/', validation_size=0)
        checkpoint_directory = "/tmp/checkpoint"
        checkpoint_file = checkpoint_directory + "/checkpoint"
        if os.path.isfile(checkpoint_file):
            os.remove(checkpoint_file)
        start_time = time.time()
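        # allow_soft_placement falls back to CPU for ops without a GPU
        # kernel; allow_growth grows GPU memory on demand instead of
        # reserving it all upfront.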
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        config.gpu_options.allow_growth = True
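        # MonitoredTrainingSession initializes variables, checkpoints to
        # checkpoint_dir, and invokes the hooks around every run() call.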
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=checkpoint_directory,
                hooks=[
                    TestAtEnd(epochs * dataset.train.num_examples),
                    _LoggerHook()
                ],
                config=config) as sess:

            epoch = 0

            tf.set_random_seed(12345678)

            if use_debug_session:
                from tensorflow.python import debug as tf_debug
                sess = tf_debug.LocalCLIDebugWrapperSession(sess)
                sess.add_tensor_filter("has_inf_or_nan",
                                       tf_debug.has_inf_or_nan)

            if not use_debug_session:
                print('computing initial test error')
                test_error = test(dataset.test, batch_size,
                                  dataset.test.num_examples, sess)
                print('initial test error: %f' % (test_error))

            while not sess.should_stop() and epoch < epochs:
                print('epoch: %d' % (epoch))
                epoch += 1
                global_step, train_loss, train_error = train(
                    epoch, dataset.train, batch_size,
                    dataset.train.num_examples, sess)
                print("train: avg loss %f  error %f" %
                      (train_loss, train_error))

            train_writer.close()

        elapsed_time = time.time() - start_time
        print("time %.2f s\n" % elapsed_time)
Example #3
def cat_dog_mouse_must_converge(name,
                                model_class,
                                optimizer_class,
                                epochs=20,
                                batch_size=500,
                                initial_learning_rate=0.01,
                                summaries=False,
                                dim=299):
    def create_batches(batch_size, images, labels):
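        # Load and resize every image up front (imread/imresize here are
        # presumably the deprecated scipy.misc helpers), pad both lists by
        # recycling the first items until their length is a multiple of
        # batch_size, then yield random (images, labels) batches forever.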
        images_batch = []
        labels_batch = []

        for img in images:
            imreadi = imresize(imread(img),
                               [dim, dim]).reshape(1, dim * dim * 3)
            images_batch.append(imreadi)

        modulus = len(images_batch) % batch_size

        print("images loaded:", len(images_batch))

        if modulus != 0:
            i = 0
            while len(images_batch) % batch_size != 0:
                imreadi = imresize(imread(images[i]),
                                   [dim, dim]).reshape(1, dim * dim * 3)
                images_batch.append(imreadi)
                i += 1

        print("images after padding:", len(images_batch))

        for label in labels:
            labels_batch.append(label)

        if modulus != 0:
            i = 0
            while len(labels_batch) % batch_size != 0:
                labels_batch.append(labels[i])
                i += 1

        labels_batch = np.asarray(labels_batch)

        while True:
            for _ in range(0, len(images_batch), batch_size):
                b = random.sample(range(0, len(images_batch)), batch_size)
                yield ([images_batch[i]
                        for i in b], [labels_batch[i] for i in b])

    def train(batch_generator, sess, momentum):

        # trained_global is assumed to be a module-level counter defined
        # outside this snippet.
        global trained_global
        trained = 0

        learning_rate = initial_learning_rate

        while trained + batch_size <= 288:  # 288: apparently the hard-coded dataset size
            batched_images, batched_labels = next(batch_generator)
            images = np.asarray(batched_images).reshape(
                batch_size, dim * dim * 3)
            labels = eye[batched_labels]

            trained += batch_size

            feed_dict = {
                train_strategy.inputs: images,
                train_strategy.labels: labels,
                train_strategy.learning_rate: learning_rate,
                "momentum:0": momentum,
                "global_is_training:0": True,
            }

            feed_dict.update(train_strategy.train_parameters)

            # Only the train op is fetched; loss and error are measured in
            # test() below.
            fetches = [train_strategy.optimize]

            sess.run(fetches, feed_dict=feed_dict)

        trained_global += trained
        # print('trained %d' % (trained_global))

    def test(batch_generator, sess, momentum):
        # Note: evaluation reuses the same training batch generator; this
        # test has no held-out split.
        trained = 0

        average_loss = []
        average_error = []

        learning_rate = initial_learning_rate

        while trained + batch_size <= 288:  # 288: apparently the dataset size
            batched_images, batched_labels = next(batch_generator)
            images = np.asarray(batched_images).reshape(
                batch_size, dim * dim * 3)
            labels = eye[batched_labels]

            trained += batch_size

            feed_dict = {
                train_strategy.inputs: images,
                train_strategy.labels: labels,
                train_strategy.learning_rate: learning_rate,
                "momentum:0": momentum,
                "global_is_training:0": False,
            }

            feed_dict.update(train_strategy.train_parameters)

            fetches = [
                train_strategy.loss,
                train_strategy.global_step,
                train_strategy.predictions,
                train_strategy.categorical_error,
            ]

            loss, global_step, predictions, error = sess.run(
                fetches, feed_dict=feed_dict)

            average_loss.append(loss)
            average_error.append(error)

        return 0, np.mean(average_loss), np.mean(average_error) * 100.

    def read_labeled_image_list(image_list_file):
        filenames = []
        labels = []
        label_domain = ['cat', 'dog', 'mouse']
        with open(image_list_file, 'r') as f:
            for line in f:
                filename, label = line.rstrip('\n').split(' ')
                filenames.append(filename)
                labels.append(label_domain.index(label))
        return filenames, labels

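    # Build the training graph for dim x dim RGB inputs with 3 output classes
    # (cat, dog, mouse).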
    train_strategy = generate_train_graph(model_class,
                                          optimizer_class,
                                          dim,
                                          dim,
                                          3,
                                          3,
                                          add_summaries=summaries)

    class _LoggerHook(tf.train.SessionRunHook):
        """Logs loss and runtime."""
        def begin(self):
            self._step = -1

        def before_run(self, run_context):
            self._step += 1
            self._start_time = time.time()
            return tf.train.SessionRunArgs(
                train_strategy.loss)  # Asks for loss value.

        def after_run(self, run_context, run_values):
            duration = time.time() - self._start_time
            loss_value = run_values.results
            if self._step % 10 == 0:
                num_examples_per_step = batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), self._step, loss_value,
                                    examples_per_sec, sec_per_batch))

    with train_strategy.graph.as_default():
        epoch = 0

        # tf.set_random_seed(12345678)

        # Load the data
        images, labels = read_labeled_image_list(
            "bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv")

        batch_generator = create_batches(batch_size, images, labels)
        start_time = time.time()
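        # allow_soft_placement falls back to CPU for ops without a GPU
        # kernel; allow_growth grows GPU memory on demand rather than
        # reserving it all at once.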
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        config.gpu_options.allow_growth = True
        with tf.train.MonitoredTrainingSession(hooks=[_LoggerHook()],
                                               config=config) as sess:
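            # Sweep momentum linearly from 0.90 toward 0.91, one value per
            # epoch.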
            momentum_s = 0.9
            momentum_e = 0.91
            for momentum in np.arange(momentum_s, momentum_e,
                                      (momentum_e - momentum_s) / epochs):
                epoch += 1
                eye = np.eye(3)
                train(batch_generator, sess, momentum)

                global_step, train_loss, train_error = test(
                    batch_generator, sess, momentum)

                if epoch % 5 == 0:
                    print('epoch:', "%d/%d" % (epoch, epochs), 'step',
                          global_step, 'test loss:', train_loss,
                          '% test error:', train_error)

            elapsed_time = time.time() - start_time
            print("time %.2f s\n" % elapsed_time)

            global_step, train_loss, train_error = test(
                batch_generator, sess, momentum)
            print('final train error: %f' % (train_error))

            return train_error
Example #4
def MNIST_must_converge(modelClass, optimizerClass, epochs=20, batch_size=500):
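    # Build the training graph for 28x28 grayscale inputs (MNIST) with 10
    # classes.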
    trainStrategy = generate_train_graph(modelClass, optimizerClass, 28, 28, 1,
                                         10)

    train_writer = tf.summary.FileWriter("/tmp/%s/train" % "test")
    test_writer = tf.summary.FileWriter("/tmp/%s/test" % "test")

    print("logging summaries under /tmp/test/")

    def train(epoch, dataset, batch_size, total, sess, summaries=True):
        average_loss = []
        average_error = []
        eye = np.eye(10)
        total_examples = 0
        error = 0

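        # Step decay: halve the learning rate every 10 epochs, starting
        # from 0.1.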
        def step_decay(epoch):
            initial_lrate = 0.1
            drop = 0.5
            epochs_drop = 10.0
            lrate = initial_lrate * math.pow(
                drop, math.floor((1 + epoch) / epochs_drop))
            return lrate

        learning_rate = step_decay(epoch)
        print("learning rate:", learning_rate)

        while total_examples < total:
            x_batch, label_batch = dataset.next_batch(batch_size)
            total_examples += len(x_batch)
            # one hot encode
            y_batch = eye[label_batch]
            feed_dict = {
                trainStrategy.inputs: x_batch,
                trainStrategy.labels: y_batch,
                trainStrategy.learning_rate: learning_rate,
            }

            feed_dict.update(trainStrategy.train_parameters)

            fetches = [
                trainStrategy.optimize,
                trainStrategy.loss,
                trainStrategy.global_step,
                trainStrategy.predictions,
                trainStrategy.categorical_error,
            ]

            _, loss, global_step, predictions, error = sess.run(
                fetches, feed_dict=feed_dict)

            average_loss.append(loss)
            average_error.append(error)

        if summaries:
            fetches = trainStrategy.summary_op
            summary = sess.run(fetches, feed_dict=feed_dict)
            train_writer.add_summary(summary)

        return global_step, np.mean(
            average_loss), np.mean(average_error) * 100.

    def test(epoch, dataset, batch_size, total, sess, summaries=True):
        total_examples = 0
        error = 0
        average_error = []
        eye = np.eye(10)
        while total_examples < total:
            x_batch, label_batch = dataset.next_batch(batch_size)
            total_examples += len(x_batch)

            feed_dict = {
                trainStrategy.inputs: x_batch,
                trainStrategy.labels: eye[label_batch],
            }

            fetches = [
                trainStrategy.predictions,
                trainStrategy.categorical_error,
            ]

            predictions, error = sess.run(fetches, feed_dict=feed_dict)

            average_error.append(error)

        # Add summaries
        if summaries:
            fetches = trainStrategy.summary_op
            summary = sess.run(fetches, feed_dict=feed_dict)
            test_writer.add_summary(summary)

        return np.mean(average_error) * 100.0

    with tf.Session(graph=trainStrategy.graph) as sess:
        tf.set_random_seed(12345678)
        sess.run(tf.get_collection('init')[0])

        # from tensorflow.python import debug as tf_debug
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        # sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        summaries = False

        dataset = read_data_sets('/tmp/deepwater/datasets/', validation_size=0)

        test_error = test(0,
                          dataset.test,
                          batch_size,
                          dataset.test.num_examples,
                          sess,
                          summaries=summaries)
        print('initial test error:', test_error)

        for epoch in range(epochs):
            global_step, train_loss, train_error = train(
                epoch, dataset.train, batch_size, dataset.train.num_examples,
                sess, summaries=summaries)
            test_error = test(epoch,
                              dataset.test,
                              batch_size,
                              dataset.test.num_examples,
                              sess,
                              summaries=summaries)

            print('epoch:', "%d/%d" % (epoch, epochs), 'step', global_step,
                  'train loss:', train_loss, '% train error:', train_error,
                  '% test error:', test_error)

        test_writer.close()
        train_writer.close()