def _map_fun(args, ctx):
            import time
            import tensorflow as tf
            from tensorflowonspark import TFNode

            cluster, server = TFNode.start_cluster_server(ctx)
            if ctx.job_name == "ps":
                server.join()
            elif ctx.job_name == "worker":
                with tf.device(
                        tf.train.replica_device_setter(
                            worker_device="/job:worker/task:%d" %
                            ctx.task_index,
                            cluster=cluster)):
                    x = tf.placeholder(tf.int32, [None, 1])
                    sq = tf.square(x)
                    init_op = tf.global_variables_initializer()
                with tf.train.MonitoredTrainingSession(
                        master=server.target,
                        is_chief=(ctx.task_index == 0)) as sess:
                    tf_feed = TFNode.DataFeed(ctx.mgr, False)
                    while not sess.should_stop() and not tf_feed.should_stop():
                        batch = tf_feed.next_batch(10)
                        if len(batch) > 0:
                            outputs = sess.run([sq], feed_dict={x: batch})
                            tf_feed.batch_results(outputs[0])

                # simulate post-feed actions that raise an exception
                time.sleep(2)
                raise Exception("FAKE exception after feeding")
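A map function like this is handed to TensorFlowOnSpark rather than called directly; this particular one deliberately raises after feeding (it is a test of error propagation), but the launch pattern is the same. A minimal launch sketch, assuming a live SparkContext `sc`, an RDD `dataRDD` of single-element rows, and executor counts `num_executors`/`num_ps` (these names are assumptions, not part of the snippet above):

    from tensorflowonspark import TFCluster

    cluster = TFCluster.run(sc, _map_fun, args, num_executors, num_ps,
                            input_mode=TFCluster.InputMode.SPARK)
    predictions = cluster.inference(dataRDD)  # feeds partitions to DataFeed, collects batch_results
    print(predictions.take(5))
    cluster.shutdown()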
Example #2
    def test_datafeed(self):
        mgr = TFManager.start('abc', ['input', 'output'], 'local')

        # insert 10 numbers followed by an end-of-feed marker
        q = mgr.get_queue('input')
        for i in range(10):
            q.put(i)
        q.put(None)

        feed = TFNode.DataFeed(mgr)

        # [0,1]
        self.assertFalse(feed.done_feeding)
        batch = feed.next_batch(2)
        self.assertEqual(2, len(batch))
        self.assertEqual(1, sum(batch))

        # [2,3,4,5]
        batch = feed.next_batch(4)
        self.assertEqual(4, len(batch))
        self.assertEqual(14, sum(batch))

        # [6,7,8,9]
        batch = feed.next_batch(10)
        self.assertEqual(4, len(batch))
        self.assertEqual(30, sum(batch))

        # should be done
        self.assertTrue(feed.should_stop())

    def test_datafeed(self):
        """TFNode.DataFeed basic operations"""
        mgr = TFManager.start('abc', ['input', 'output'], 'local')

        # insert 10 numbers followed by an end-of-feed marker
        q = mgr.get_queue('input')
        for i in range(10):
            q.put(i)
        q.put(None)  # end-of-feed marker

        feed = TFNode.DataFeed(mgr)

        # [0,1]
        self.assertFalse(feed.done_feeding)
        batch = feed.next_batch(2)
        self.assertEqual(len(batch), 2)
        self.assertEqual(sum(batch), 1)

        # [2,3,4,5]
        self.assertFalse(feed.done_feeding)
        batch = feed.next_batch(4)
        self.assertEqual(len(batch), 4)
        self.assertEqual(sum(batch), 14)

        # [6,7,8,9]
        self.assertFalse(feed.done_feeding)
        batch = feed.next_batch(10)  # ask for more than available
        self.assertEqual(len(batch), 4)
        self.assertEqual(sum(batch), 30)

        # should be done
        self.assertTrue(feed.should_stop())
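The 'output' side of the manager mirrors the 'input' side: results passed to batch_results() land on the 'output' queue. A small round-trip sketch in the same style as the test above (the per-item queueing of results is an assumption about DataFeed's semantics):

    mgr = TFManager.start('abc', ['input', 'output'], 'local')
    q_in = mgr.get_queue('input')
    for i in range(3):
        q_in.put(i)
    q_in.put(None)  # end-of-feed marker

    feed = TFNode.DataFeed(mgr, False)
    batch = feed.next_batch(3)                  # [0, 1, 2]
    feed.batch_results([v * v for v in batch])  # push squares to 'output'

    q_out = mgr.get_queue('output')
    print([q_out.get() for _ in range(3)])      # [0, 1, 4] if items are queued individually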
Example #4
File: worker.py  Project: linxigal/tfos
def __call__(self, args, ctx):
    self.task_index = ctx.task_index
    self.job_name = ctx.job_name
    self.cluster, self.server = TFNode.start_cluster_server(ctx)
    self.tf_feed = TFNode.DataFeed(ctx.mgr)
    if ctx.job_name == "ps":
        self.server.join()
    elif ctx.job_name == "worker":
        self.build_model()
        self.execute()
def _map_fun(args, ctx):
    import tensorflow as tf
    from tensorflowonspark import TFNode

    tf_feed = TFNode.DataFeed(ctx.mgr, False)
    while not tf_feed.should_stop():
        batch = tf_feed.next_batch(batch_size=10)
        print("batch: {}".format(batch))
        squares = tf.math.square(batch)
        print("squares: {}".format(squares))
        tf_feed.batch_results(squares.numpy())
def _map_fun(args, ctx):
    import tensorflow as tf
    from tensorflowonspark import TFNode

    tf_feed = TFNode.DataFeed(ctx.mgr, False)
    while not tf_feed.should_stop():
        batch = tf_feed.next_batch(10)
        if len(batch) > 0:
            squares = tf.math.square(batch)
            tf_feed.batch_results(squares.numpy())
            raise Exception("FAKE exception during feeding")
Example #7
    def _spark_train(args, ctx):
      """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
      import tensorflow as tf
      from tensorflowonspark import TFNode

      tf.reset_default_graph()                          # reset graph in case we're re-using a Spark python worker

      cluster, server = TFNode.start_cluster_server(ctx)
      if ctx.job_name == "ps":
        server.join()
      elif ctx.job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
          worker_device="/job:worker/task:%d" % ctx.task_index,
          cluster=cluster)):
          x = tf.placeholder(tf.float32, [None, 2], name='x')
          y_ = tf.placeholder(tf.float32, [None, 1], name='y_')
          w = tf.Variable(tf.truncated_normal([2,1]), name='w')
          y = tf.matmul(x, w, name='y')
          y2 = tf.square(y, name="y2")                      # extra/optional output for testing multiple output tensors
          cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
          optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost)
          init_op = tf.global_variables_initializer()
          saver = tf.train.Saver()

        sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                init_op=init_op)
        with sv.managed_session(server.target) as sess:
          tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
          while not sv.should_stop() and not tf_feed.should_stop():
            batch = tf_feed.next_batch(10)
            if args.input_mapping:
              if len(batch['x']) > 0:
                feed = {x: batch['x'], y_: batch['y_']}
                sess.run(optimizer, feed_dict=feed)  # only step when the batch is non-empty

          if sv.is_chief:
            if args.model_dir:
              # manually save checkpoint
              ckpt_name = args.model_dir + "/model.ckpt"
              print("Saving checkpoint to: {}".format(ckpt_name))
              saver.save(sess, ckpt_name)
            elif args.export_dir:
              # export a saved_model
              signatures = {
                'test_key': {
                  'inputs': { 'features': x },
                  'outputs': { 'prediction': y },
                  'method_name': 'test'
                }
              }
              TFNode.export_saved_model(sess, export_dir=args.export_dir, tag_set='test_tag', signatures=signatures)
            else:
              print("WARNING: model state not saved.")

        sv.stop()
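Example #7 uses DataFeed with an input_mapping, which is how the Spark ML pipeline path feeds named DataFrame columns into graph tensors. A hedged sketch of the driver side, assuming a DataFrame `df` with columns 'x' and 'y_' and the setter names of tensorflowonspark.pipeline.TFEstimator:

    from tensorflowonspark.pipeline import TFEstimator

    estimator = TFEstimator(_spark_train, args) \
        .setInputMapping({'x': 'x', 'y_': 'y_'}) \
        .setClusterSize(num_executors) \
        .setModelDir(args.model_dir)
    model = estimator.fit(df)  # runs _spark_train on the cluster, feeding df via DataFeed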
Example #8
File: mtcnn.py  Project: linxigal/tfos
def __call__(self, args, ctx):
    self.task_index = ctx.task_index
    self.job_name = ctx.job_name
    self.cluster, self.server = TFNode.start_cluster_server(ctx)
    self.tf_feed = TFNode.DataFeed(ctx.mgr)
    if ctx.job_name == "ps":
        self.server.join()
    elif ctx.job_name == "worker":
        self.create_tmp_dir()
        self.process()
        self.delete_tmp_dir()
def _map_fun(args, ctx):
    import time
    import tensorflow as tf
    from tensorflowonspark import TFNode

    tf_feed = TFNode.DataFeed(ctx.mgr, False)
    while not tf_feed.should_stop():
        batch = tf_feed.next_batch(10)
        if len(batch) > 0:
            squares = tf.math.square(batch)
            tf_feed.batch_results(squares.numpy())

    # simulate post-feed actions that raise an exception
    time.sleep(2)
    raise Exception("FAKE exception after feeding")
def _map_fun(args, ctx):
    import tensorflow as tf
    from tensorflowonspark import TFNode

    cluster, server = TFNode.start_cluster_server(ctx)
    if ctx.job_name == "ps":
        server.join()
    elif ctx.job_name == "worker":
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % ctx.task_index,
                    cluster=cluster)):
            x = tf.placeholder(tf.int32, [None, 1])
            sq = tf.square(x)
            init_op = tf.global_variables_initializer()
        sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                 init_op=init_op)
        with sv.managed_session(server.target) as sess:
            tf_feed = TFNode.DataFeed(ctx.mgr, False)
            while not sv.should_stop() and not tf_feed.should_stop():
                outputs = sess.run(
                    [sq], feed_dict={x: tf_feed.next_batch(10)})
                tf_feed.batch_results(outputs[0])
        sv.stop()
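This variant drives the session with tf.train.Supervisor, which TF 1.x deprecated in favor of MonitoredTrainingSession; the first example above already uses that style. For comparison, a sketch of the equivalent session setup, reusing the names defined in this snippet:

    with tf.train.MonitoredTrainingSession(master=server.target,
                                           is_chief=(ctx.task_index == 0)) as sess:
        while not sess.should_stop() and not tf_feed.should_stop():
            ...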
Example #11
def main_fun(args, ctx):
    import numpy as np
    import tensorflow as tf
    from tensorflowonspark import compat, TFNode

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    def build_and_compile_cnn_model():
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32,
                                   3,
                                   activation='relu',
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
        model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                      optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
                      metrics=['accuracy'])
        return model

    # single node
    # single_worker_model = build_and_compile_cnn_model()
    # single_worker_model.fit(x=train_datasets, epochs=3)

    tf_feed = TFNode.DataFeed(ctx.mgr, False)

    def rdd_generator():
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) > 0:
                example = batch[0]
                image = np.array(example[0]).astype(np.float32) / 255.0
                image = np.reshape(image, (28, 28, 1))
                label = np.array(example[1]).astype(np.float32)
                label = np.reshape(label, (1, ))
                yield (image, label)
            else:
                return

    ds = tf.data.Dataset.from_generator(
        rdd_generator, (tf.float32, tf.float32),
        (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])))
    ds = ds.batch(args.batch_size)

    # this fails
    # callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=args.model_dir)]
    tf.io.gfile.makedirs(args.model_dir)
    filepath = args.model_dir + "/weights-{epoch:04d}"
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(filepath=filepath,
                                           verbose=1,
                                           save_weights_only=True)
    ]

    with strategy.scope():
        multi_worker_model = build_and_compile_cnn_model()

    # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is synchronous,
    # so we need to ensure that all workers complete training before any of them run out of data from the RDD.
    # And given that Spark RDD partitions (and partition sizes) can be non-evenly divisible by num_workers,
    # we'll just stop training at 90% of the total expected number of steps.
    steps_per_epoch = 60000 / args.batch_size
    steps_per_epoch_per_worker = steps_per_epoch / ctx.num_workers
    max_steps_per_worker = int(steps_per_epoch_per_worker * 0.9)  # fit() expects an integer step count

    multi_worker_model.fit(x=ds,
                           epochs=args.epochs,
                           steps_per_epoch=max_steps_per_worker,
                           callbacks=callbacks)

    from tensorflow_estimator.python.estimator.export import export_lib
    export_dir = export_lib.get_timestamped_export_dir(args.export_dir)
    compat.export_saved_model(multi_worker_model, export_dir,
                              ctx.job_name == 'chief')

    # terminating feed tells spark to skip processing further partitions
    tf_feed.terminate()
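Because this map function uses MultiWorkerMirroredStrategy, it runs without parameter servers and TensorFlowOnSpark designates one executor as the 'chief'. A launch sketch, assuming `sc`, an RDD of (image, label) rows in `images_labels`, and `args.cluster_size` executors:

    from tensorflowonspark import TFCluster

    cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0,
                            input_mode=TFCluster.InputMode.SPARK,
                            master_node='chief')
    cluster.train(images_labels, args.epochs)
    cluster.shutdown()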
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  IMAGE_PIXELS = 28

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.protocol == 'rdma')

  def feed_dict(batch):
    # Convert from dict of named arrays to two numpy arrays of the proper type
    images = batch['image']
    labels = batch['label']
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    xs = xs / 255.0
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
      worker_device="/job:worker/task:%d" % task_index,
      cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                          stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                         stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
      y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model_dir)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    sv = tf.train.Supervisor(is_chief=(task_index == 0),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             saver=saver,
                             global_step=global_step,
                             stop_grace_secs=300,
                             save_model_secs=10)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or args.steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
          # print accuracy and save model checkpoint to HDFS every 100 steps
          if (step % 100 == 0):
            print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy, {x: batch_xs, y_: batch_ys})))

          if sv.is_chief:
            summary_writer.add_summary(summary, step)

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

      if sv.is_chief and args.export_dir:
        print("{0} exporting saved_model to: {1}".format(datetime.now().isoformat(), args.export_dir))
        # exported signatures defined in code
        signatures = {
          tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: {
            'inputs': {'image': x},
            'outputs': {'prediction': prediction},
            'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME
          },
          'featurize': {
            'inputs': {'image': x},
            'outputs': {'features': hid},
            'method_name': 'featurize'
          }
        }
        TFNode.export_saved_model(sess,
                                  args.export_dir,
                                  tf.saved_model.tag_constants.SERVING,
                                  signatures)
      else:
        # non-chief workers should wait for chief
        while not sv.should_stop():
          print("Waiting for chief")
          time.sleep(5)

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
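Here args.input_mapping names the DataFrame columns that DataFeed should expose as batch['image'] and batch['label']. A hedged wiring sketch on the driver (the column names and the DataFrame `df` are assumptions):

    from tensorflowonspark import TFCluster

    args.input_mapping = {'image': 'image', 'label': 'label'}
    cluster = TFCluster.run(sc, map_fun, args, args.cluster_size, args.num_ps,
                            input_mode=TFCluster.InputMode.SPARK)
    cluster.train(df.rdd, args.epochs)
    cluster.shutdown()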
Example #13
def main_fun(args, ctx):
    # module-level imports in the original script, added here so the snippet is self-contained
    import numpy
    import tensorflow as tf
    import time
    from datetime import datetime
    from tensorflow import keras
    from tensorflow.python.keras.models import Sequential
    from tensorflow.python.keras.layers import Dense, Dropout
    from tensorflowonspark import TFNode
    # StopFeedHook is defined elsewhere in the original script; see Example #15 below

    IMAGE_PIXELS = 28
    num_classes = 10

    # use Keras API to load data
    from tensorflow.python.keras.datasets import mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.reshape(60000, 784)
    x_test = x_test.reshape(10000, 784)
    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255

    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # setup a Keras model
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(784, )))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001),
                  metrics=['accuracy'])
    model.summary()

    print("model.inputs: {}".format(model.inputs))
    print("model.outputs: {}".format(model.outputs))

    # convert Keras model to tf.estimator
    estimator = tf.keras.estimator.model_to_estimator(model,
                                                      model_dir=args.model_dir)

    # setup train_input_fn for InputMode.TENSORFLOW or InputMode.SPARK
    if args.input_mode == 'tf':
        # For InputMode.TENSORFLOW, just use data in memory
        train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"dense_input": x_train},
            y=y_train,
            batch_size=128,
            num_epochs=args.epochs,
            shuffle=True)

        hooks = []
    else:  # 'spark'
        # For InputMode.SPARK, read data from RDD
        tf_feed = TFNode.DataFeed(ctx.mgr)

        def rdd_generator():
            while not tf_feed.should_stop():
                batch = tf_feed.next_batch(1)
                if len(batch) > 0:
                    record = batch[0]
                    image = numpy.array(record[0]).astype(
                        numpy.float32) / 255.0
                    label = numpy.array(record[1]).astype(numpy.float32)
                    yield (image, label)
                else:
                    return

        def train_input_fn():
            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32), (tf.TensorShape(
                    [IMAGE_PIXELS * IMAGE_PIXELS]), tf.TensorShape([10])))
            ds = ds.batch(args.batch_size)
            return ds

        # add a hook to terminate the RDD data feed when the session ends
        hooks = [StopFeedHook(tf_feed)]

    # eval_input_fn ALWAYS uses data loaded in memory, since InputMode.SPARK can only feed one RDD at a time
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"dense_input": x_test}, y=y_test, num_epochs=1, shuffle=False)

    # setup tf.estimator.train_and_evaluate() w/ FinalExporter
    feature_spec = {
        'dense_input': tf.placeholder(tf.float32, shape=[None, 784])
    }
    exporter = tf.estimator.FinalExporter(
        "serving",
        serving_input_receiver_fn=tf.estimator.export.
        build_raw_serving_input_receiver_fn(feature_spec))
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=args.steps,
                                        hooks=hooks)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      exporters=exporter)

    # train and export model
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # WORKAROUND FOR https://github.com/tensorflow/tensorflow/issues/21745
    # wait for all other nodes to complete (via done files)
    done_dir = "{}/done".format(ctx.absolute_path(args.model_dir))
    print("Writing done file to: {}".format(done_dir))
    tf.gfile.MakeDirs(done_dir)
    with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index),
                        'w') as done_file:
        done_file.write("done")

    for i in range(60):
        if len(tf.gfile.ListDirectory(done_dir)) < len(
                ctx.cluster_spec['worker']):
            print("{} Waiting for other nodes {}".format(
                datetime.now().isoformat(), i))
            time.sleep(1)
        else:
            print("{} All nodes done".format(datetime.now().isoformat()))
            break
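The hooks list above relies on StopFeedHook, which this snippet does not define; Example #15 below carries a definition, reproduced here for reference:

    class StopFeedHook(tf.estimator.SessionRunHook):
        """Terminates the InputMode.SPARK RDD feed when the training session ends."""
        def __init__(self, feed):
            self.feed = feed

        def end(self, session):
            self.feed.terminate()
            self.feed.next_batch(1)  # drain one batch so the feeding side unblocks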
Example #14
def main_fun(args, ctx):
    # module-level imports in the original script, added here so the snippet is self-contained
    import numpy
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.python.keras.models import Sequential
    from tensorflow.python.keras.layers import Dense, Dropout
    from tensorflow.python.keras.optimizers import RMSprop
    from tensorflowonspark import TFNode

    IMAGE_PIXELS = 28
    num_classes = 10

    # use Keras API to load data
    from tensorflow.python.keras.datasets import mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.reshape(60000, 784)
    x_test = x_test.reshape(10000, 784)
    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255

    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # setup a Keras model
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(784, )))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(),
                  metrics=['accuracy'])
    model.summary()

    # convert Keras model to tf.estimator
    estimator = tf.keras.estimator.model_to_estimator(model,
                                                      model_dir=args.model_dir)

    # setup train_input_fn for InputMode.TENSORFLOW or InputMode.SPARK
    if args.input_mode == 'tf':
        train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"dense_1_input": x_train},
            y=y_train,
            batch_size=128,
            num_epochs=None,
            shuffle=True)
    else:  # 'spark'
        tf_feed = TFNode.DataFeed(ctx.mgr)

        def rdd_generator():
            while not tf_feed.should_stop():
                batch = tf_feed.next_batch(1)
                if len(batch) > 0:
                    record = batch[0]
                    image = numpy.array(record[0]).astype(
                        numpy.float32) / 255.0
                    label = numpy.array(record[1]).astype(numpy.float32)
                    yield (image, label)
                else:
                    return

        def train_input_fn():
            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32), (tf.TensorShape(
                    [IMAGE_PIXELS * IMAGE_PIXELS]), tf.TensorShape([10])))
            ds = ds.batch(args.batch_size)
            return ds

    # eval_input_fn ALWAYS uses data loaded in memory, since InputMode.SPARK can only feed one RDD at a time
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"dense_1_input": x_test},
        y=y_test,
        num_epochs=args.epochs,
        shuffle=False)

    # setup tf.estimator.train_and_evaluate()
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=args.steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # export a saved_model, if export_dir provided
    if args.export_dir:

        def serving_input_receiver_fn():
            """An input receiver that expects a serialized tf.Example."""
            serialized_tf_example = tf.placeholder(dtype=tf.string,
                                                   shape=[args.batch_size],
                                                   name='input_example_tensor')
            receiver_tensors = {'dense_1_input': serialized_tf_example}
            feature_spec = {
                'dense_1_input': tf.FixedLenFeature(784, tf.float32)  # 784 float pixels, not strings
            }
            features = tf.parse_example(serialized_tf_example, feature_spec)
            return tf.estimator.export.ServingInputReceiver(
                features, receiver_tensors)

        estimator.export_savedmodel(args.export_dir, serving_input_receiver_fn)
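A quick sanity check of the export, assuming TF 1.x and `export_path` pointing at the timestamped directory that export_savedmodel() returns:

    import tensorflow as tf

    with tf.Session(graph=tf.Graph()) as sess:
        tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], export_path)
        print([op.name for op in sess.graph.get_operations()][:10])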
Example #15
def main_fun(args, ctx):
    import numpy as np
    import tensorflow as tf
    import tensorflow_datasets as tfds
    from tensorflowonspark import TFNode

    tfds.disable_progress_bar()

    class StopFeedHook(tf.estimator.SessionRunHook):
        """SessionRunHook to terminate InputMode.SPARK RDD feeding if the training loop exits before the entire RDD is consumed."""
        def __init__(self, feed):
            self.feed = feed

        def end(self, session):
            self.feed.terminate()
            self.feed.next_batch(1)

    BATCH_SIZE = args.batch_size
    LEARNING_RATE = args.learning_rate

    tf_feed = TFNode.DataFeed(ctx.mgr)

    def rdd_generator():
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) > 0:
                example = batch[0]
                image = np.array(example[0]).astype(np.float32) / 255.0
                image = np.reshape(image, (28, 28, 1))
                label = np.array(example[1]).astype(np.float32)
                label = np.reshape(label, (1, ))
                yield (image, label)
            else:
                return

    def input_fn(mode, input_context=None):
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Note: Spark is responsible for sharding/repeating/shuffling the data via RDD
            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32),
                (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])))
            return ds.batch(BATCH_SIZE)
        else:
            # read evaluation data from tensorflow_datasets directly
            def scale(image, label):
                image = tf.cast(image, tf.float32) / 255.0
                return image, label

            mnist = tfds.load(name='mnist', with_info=True, as_supervised=True)
            ds = mnist['test']
            if input_context:
                ds = ds.shard(input_context.num_input_pipelines,
                              input_context.input_pipeline_id)
            return ds.map(scale).batch(BATCH_SIZE)

    def serving_input_receiver_fn():
        features = tf.compat.v1.placeholder(dtype=tf.float32,
                                            shape=[None, 28, 28, 1],
                                            name='conv2d_input')
        receiver_tensors = {'conv2d_input': features}
        return tf.estimator.export.ServingInputReceiver(
            receiver_tensors, receiver_tensors)

    def model_fn(features, labels, mode):
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32,
                                   3,
                                   activation='relu',
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
        logits = model(features, training=False)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {'logits': logits}
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=LEARNING_RATE)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(labels,
                                                                        logits)
        loss = tf.reduce_sum(input_tensor=loss) * (1. / BATCH_SIZE)
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss)

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=optimizer.minimize(
                loss, tf.compat.v1.train.get_or_create_global_step()))

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    config = tf.estimator.RunConfig(train_distribute=strategy,
                                    save_checkpoints_steps=100)

    classifier = tf.estimator.Estimator(model_fn=model_fn,
                                        model_dir=args.model_dir,
                                        config=config)

    # exporter = tf.estimator.FinalExporter("serving", serving_input_receiver_fn=serving_input_receiver_fn)

    # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is synchronous,
    # so we need to ensure that all workers complete training before any of them run out of data from the RDD.
    # And given that Spark RDD partitions (and partition sizes) can be non-evenly divisible by num_workers,
    # we'll just stop training at 90% of the total expected number of steps.
    steps = 60000 * args.epochs / args.batch_size
    steps_per_worker = steps / ctx.num_workers
    max_steps_per_worker = int(steps_per_worker * 0.9)  # round down to an integer step count

    tf.estimator.train_and_evaluate(
        classifier,
        train_spec=tf.estimator.TrainSpec(input_fn=input_fn,
                                          max_steps=max_steps_per_worker,
                                          hooks=[StopFeedHook(tf_feed)]),
        eval_spec=tf.estimator.EvalSpec(input_fn=input_fn)
        # eval_spec=tf.estimator.EvalSpec(input_fn=input_fn, exporters=exporter)
    )

    if ctx.job_name == 'chief':
        print("Exporting saved_model to {}".format(args.export_dir))
        classifier.export_saved_model(args.export_dir,
                                      serving_input_receiver_fn)
def map_fun(args, ctx):
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    # Create generator for Spark data feed
    tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")

    def rdd_generator():
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) == 0:
                return  # end of feed; avoid indexing into an empty batch
            row = batch[0]
            image = numpy.array(row[0])
            image = image.astype(numpy.float32) / 255.0
            label = numpy.array(row[1])
            label = label.astype(numpy.float32)  # the Dataset below declares tf.float32 one-hot labels
            yield (image, label)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Dataset for input data
            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32),
                (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]),
                 tf.TensorShape([10]))).batch(args.batch_size)
            iterator = ds.make_one_shot_iterator()
            x, y_ = iterator.get_next()

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
                                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                                   stddev=1.0 /
                                                   math.sqrt(hidden_units)),
                               name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            # # Placeholders or QueueRunner/Readers for input data
            # x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
            # y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)

            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)

            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num,
                                               graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor shuts down or args.steps have completed.
            step = 0
            while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                if args.mode == "train":
                    _, summary, step = sess.run(
                        [train_op, summary_op, global_step])
                    # print accuracy and save model checkpoint to HDFS every 100 steps
                    if (step % 100 == 0):
                        print("{0} step: {1} accuracy: {2}".format(
                            datetime.now().isoformat(), step,
                            sess.run(accuracy)))

                    if sv.is_chief:
                        summary_writer.add_summary(summary, step)
                else:  # args.mode == "inference"
                    labels, preds, acc = sess.run(
                        [label, prediction, accuracy])

                    results = [
                        "{0} Label: {1}, Prediction: {2}".format(
                            datetime.now().isoformat(), l, p)
                        for l, p in zip(labels, preds)
                    ]
                    tf_feed.batch_results(results)
                    print("acc: {0}".format(acc))

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
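In inference mode this map function pushes "Label/Prediction" strings back through batch_results(), so the driver receives them as an RDD. A collection sketch, assuming `sc`, an images RDD `dataRDD`, and an `args.output` path (the last is an assumption):

    from tensorflowonspark import TFCluster

    cluster = TFCluster.run(sc, map_fun, args, args.cluster_size, args.num_ps,
                            input_mode=TFCluster.InputMode.SPARK)
    labelRDD = cluster.inference(dataRDD)
    labelRDD.saveAsTextFile(args.output)
    cluster.shutdown()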
Example #17
def map_fun(args, ctx):
  # from com.yahoo.ml.tf import TFNode
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num  # number of workers
  job_name = ctx.job_name  # job name
  task_index = ctx.task_index  # task index
  cluster_spec = ctx.cluster_spec  # cluster spec

  IMAGE_PIXELS = 10  # image edge size; mnist is 28x28x1 (adjust to your own image size)
  channels = 3
  num_class = 2
  dropout = 0.5

  learning_rate = 1e-6
  # Parameters
  hidden_units = 128  # NN hidden layer size
  training_epochs = args.epochs
  img_nums = 630000
  # batch_size = args.batch_size  # samples per training batch
  batch_size = 200
  """
  # ---------设置动态学习效率
  # Constants describing the training process.
  # MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
  NUM_EPOCHS_PER_DECAY = batch_size  # Epochs after which learning rate decays.
  LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
  INITIAL_LEARNING_RATE = 0.1  # Initial learning rate.

  global_step1 = training_epochs * (img_nums // batch_size)  # Integer Variable counting the number of training steps
  # Variables that affect learning rate.
  num_batches_per_epoch = img_nums / batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

  # Decay the learning rate exponentially based on the number of steps.
  learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                            global_step1,
                                            decay_steps,
                                            LEARNING_RATE_DECAY_FACTOR,
                                            staircase=True)
  # 设置动态学习效率----------
  """
  
  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":  # ps node (parameter server)
    time.sleep((worker_num + 1) * 5)

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    numpy.random.shuffle(batch)  # shuffle randomly
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    # xs = xs / 255.0  # simple rescaling
    # Z-score standardization:
    # mean = numpy.reshape(numpy.average(xs, 1), [numpy.shape(xs)[0], 1])
    # std = numpy.reshape(numpy.std(xs, 1), [numpy.shape(xs)[0], 1])
    # xs = (xs - mean) / std

    # min-max normalization (Min-Max Normalization)
    max_ = numpy.reshape(numpy.max(xs, 1), [numpy.shape(xs)[0], 1])
    min_ = numpy.reshape(numpy.min(xs, 1), [numpy.shape(xs)[0], 1])

    xs = (xs - min_) / (max_ - min_)
    
    
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)
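  # Note: feed_dict's min-max step divides by (max_ - min_), which is zero for a
  # constant-valued row; a guarded variant (the epsilon is an added assumption):
  #   xs = (xs - min_) / numpy.maximum(max_ - min_, 1e-8)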

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Create some wrappers for simplicity
      def conv2d(x, W, b, strides=1):
        # Conv2D wrapper, with bias and relu activation
        x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
        x = tf.nn.bias_add(x, b)  # middle strides of 1: no subsampling in x/y
        return tf.nn.relu(x)

      def maxpool2d(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='SAME')  # middle strides of 2: sample every other pixel in x/y

      def maxpool2d2(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='VALID')  # middle strides of 2: sample every other pixel in x/y

      # Store layers weight & bias
      weights = {
          # 3x3 conv, `channels` inputs, 64 outputs; color images have 3 input channels, grayscale 1
          'wc1': tf.get_variable('wc1', [3, 3, channels, 64], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),  # 3x3 convolution kernel

          # 3x3 conv, 64 inputs, 128 outputs
          'wc2': tf.get_variable('wc2', [3, 3, 64, 128], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
          # 'wc3': tf.Variable(tf.random_normal([3, 3, 256, 128])),
          'wc4': tf.get_variable('wc4', [3, 3, 128, num_class], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
          # fully connected, 7*7*64 inputs, 1024 outputs
          # 'wd1': tf.Variable(tf.random_normal([(1+IMAGE_PIXELS // 4) * (1+IMAGE_PIXELS // 4) * 64, 1024])),
          # 1024 inputs, 10 outputs (class prediction)
          # 'out': tf.Variable(tf.random_normal([1024, num_class]))
      }

      biases = {
          'bc1': tf.get_variable('bc1',[64],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          'bc2': tf.get_variable('bc2',[128],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          # 'bc3': tf.Variable(tf.random_normal([128])),
          'bc4': tf.get_variable('bc4',[num_class],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          # 'bd1': tf.Variable(tf.random_normal([1024])),
          # 'out': tf.Variable(tf.random_normal([num_class]))
      }

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels], name="x")  # mnist 28*28*1
      y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
      # keep=tf.placeholder(tf.float32)

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])  # mnist data is 28x28x1 (grayscale, 1 band)
      # tf.summary.image("x_img", x_img)

      # convolutional model
      conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
      conv1 = maxpool2d(conv1, k=2)
      # conv1 = tf.nn.dropout(conv1, keep)
      conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
      conv2 = maxpool2d(conv2, k=2)
      conv2 = tf.nn.dropout(conv2, dropout)
      # conv3 = conv2d(conv2, weights['wc3'], biases['bc3'])
      # conv3 = tf.nn.dropout(conv3, keep)
      conv4 = conv2d(conv2, weights['wc4'], biases['bc4'])
      conv4 = maxpool2d2(conv4, k=2)
      y = tf.reshape(conv4, [-1, num_class])


      # fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
      # fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
      # fc1 = tf.nn.relu(fc1)
      # if args.mode == "train" or args.mode == "retrain":
      #   fc1 = tf.nn.dropout(fc1, dropout)
      # y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])

 
      # global_step = tf.Variable(0)

      global_step = tf.Variable(0, name="global_step", trainable=False)

      # loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

      loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

      # tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(learning_rate).minimize(
          loss, global_step=global_step)


      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      # tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      # summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()


    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir)) #
    # log.info("tensorflow model path: {0}".format(logdir))
    # summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=1)
    elif args.mode == "retrain":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:  # open the session

      print("{0} session ready".format(datetime.now().isoformat()))
      # log.info("{0} session ready".format(datetime.now().isoformat()))
      # Loop until the supervisor shuts down or args.steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train" or args.mode == "retrain")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train" or args.mode == "retrain":
            # _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            _, step = sess.run([train_op,  global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
              # log.info("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
            if sv.is_chief:
              pass
              # summary_writer.add_summary(summary, step)
          else: # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))
            # log.info("acc: {0}".format(acc))
      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import numpy
  import tensorflow as tf
  import time
  import math

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  batch_size   = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [images_labels] to two numpy arrays of the proper type
    images = []
    labels = []
    for item in batch:
      images.append(item[0:4])
      labels.append(item[4])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    ys = dense_to_one_hot(numpy.array(labels, dtype=numpy.uint), 3)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  def dense_to_one_hot(labels_dense, num_classes):
    """Convert class labels from scalars to one-hot vectors."""
    num_labels = labels_dense.shape[0]
    index_offset = numpy.arange(num_labels) * num_classes
    labels_one_hot = numpy.zeros((num_labels, num_classes))
    tt = index_offset + labels_dense.ravel()
    tt = tt.astype(numpy.int32)
    labels_one_hot.flat[tt] = 1
    return labels_one_hot
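  # e.g. dense_to_one_hot(numpy.array([0, 2, 1]), 3) returns
  #   [[1, 0, 0],
  #    [0, 0, 1],
  #    [0, 1, 0]]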

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

        # network
        x = tf.placeholder(tf.float32, [None, 4])

        # paras
        W = tf.Variable(tf.zeros([4, 3]))
        b = tf.Variable(tf.zeros([3]))

        y = tf.nn.softmax(tf.matmul(x, W) + b)
        y_ = tf.placeholder(tf.float32, [None, 3])

        # loss func
        cross_entropy = -tf.reduce_sum(y_ * tf.log(y))

        global_step = tf.Variable(0)

        train_op = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy, global_step=global_step)

        # Test trained model
        label = tf.argmax(y_, 1, name="label") #??? does the function argmax use in the right way ?
        prediction = tf.argmax(y, 1, name="prediction")
        correct_prediction = tf.equal(prediction, label)

        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
        tf.summary.scalar("acc", accuracy)

        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        init_op = tf.global_variables_initializer()
    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=1)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or args.steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step,
                                                         sess.run(accuracy,{x: batch_xs, y_: batch_ys})))

            if sv.is_chief:
              summary_writer.add_summary(summary, step)


      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
Example #19
def main_fun(args, ctx):
    # module-level imports in the original script, added here so the snippet is self-contained
    import numpy as np
    import tensorflow as tf
    from sklearn import datasets, preprocessing
    from sklearn.model_selection import train_test_split
    from tensorflow.python.keras.models import Sequential
    from tensorflow.python.keras.layers import Dense
    from tensorflowonspark import TFNode

    iris = datasets.load_iris()
    X = iris.data
    Y = iris.target

    X = preprocessing.scale(X)
    # Y = to_categorical(Y, num_classes=3)

    train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2)
    print(train_X.shape, test_X.shape, train_Y.shape, test_Y.shape)

    model = Sequential()
    model.add(Dense(12, input_shape=(4,), activation='relu'))
    model.add(Dense(3, input_shape=(12,), activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='SGD', metrics=['accuracy'])
    model.summary()

    estimator = tf.keras.estimator.model_to_estimator(model, model_dir=args.model_dir)
#     model.fit(train_X, train_Y, nb_epoch=50, batch_size=1, verbose=1)

#     loss, accuracy = model.evaluate(test_X, test_Y, verbose=0)
#     print("Accuracy = {:.2f}".format(accuracy))
    tf_feed = TFNode.DataFeed(ctx.mgr)
    def rdd_generator():
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) > 0:
                record = batch[0]
                features = np.array(record[0]).astype(np.float32)
                label = np.array(record[1]).astype(np.float32)
                yield (features, label)
            else:
                return

    def train_input_fn():
        ds = tf.data.Dataset.from_generator(rdd_generator,
                                           (tf.float32, tf.float32),
                                           (tf.TensorShape([4]), tf.TensorShape([1])))  # sparse labels: one class id per example
        ds = ds.batch(args.batch_size)
        return ds

#     train_input_fn = tf.estimator.inputs.numpy_input_fn(
#         x={"dense_input": train_X},
#         y=train_Y,
#         batch_size=1,
#         num_epochs=None,
#         shuffle=True
#     )
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"dense_input": test_X},
        y=test_Y,
        num_epochs=args.epochs,
        shuffle=False
    )

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=args.steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    test_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"dense_input": test_X[:1]},
        y=test_Y[:1],
        batch_size=1,
        shuffle=False
    )
Example #20
def map_fun(args, ctx):
	from tensorflowonspark import TFNode
	from datetime import datetime
	import math
	import numpy
	import tensorflow as tf
	import time

	worker_num = ctx.worker_num
	job_name = ctx.job_name
	task_index = ctx.task_index
	cluster_spec = ctx.cluster_spec

	IMAGE_PIXELS=28

	# Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
	if job_name == "ps":
		time.sleep((worker_num + 1) * 5)

	# Parameters
	hidden_units = 128
	batch_size   = args.batch_size

	# Get TF cluster and server instances
	cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)
	
	def writeFileToHDFS():
		# copy the locally saved model into HDFS; assumes the pyhdfs package is available on the executors
		import os
		from pyhdfs import HdfsClient
		rootdir = '/tmp/mnist_model'
		client = HdfsClient(hosts='localhost:50070')
		client.mkdirs('/user/root/mnist_model')
		for parent, dirnames, filenames in os.walk(rootdir):
			print("parent is:{0}".format(parent))
			for filename in filenames:
				client.copy_from_local(os.path.join(parent, filename), os.path.join('/user/root/mnist_model', filename), overwrite=True)


	def feed_dict(batch):
		# Convert from [(images, labels)] to two numpy arrays of the proper type
		images = []
		labels = []
		for item in batch:
			images.append(item[0])
			labels.append(item[1])
		xs = numpy.array(images)
		xs = xs.astype(numpy.float32)
		xs = xs/255.0
		ys = numpy.array(labels)
		ys = ys.astype(numpy.uint8)
		return (xs, ys)
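	# e.g. feed_dict([(img0, 3), (img1, 7)]) -> (float32 images scaled to [0, 1], uint8 labels [3, 7])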

	if job_name == "ps":
		server.join()
	elif job_name == "worker":

		# Assigns ops to the local worker by default.
		with tf.device(tf.train.replica_device_setter(
			worker_device="/job:worker/task:%d" % task_index,
			cluster=cluster)):

			# Variables of the hidden layer
			hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
							stddev=1.0 / IMAGE_PIXELS), name="hid_w")
			hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
			tf.summary.histogram("hidden_weights", hid_w)

			# Variables of the softmax layer
			sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
							stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
			sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
			tf.summary.histogram("softmax_weights", sm_w)

			# Placeholders or QueueRunner/Readers for input data
			x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
			y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

			x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
			tf.summary.image("x_img", x_img)

			hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
			hid = tf.nn.relu(hid_lin)

			y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

			global_step = tf.Variable(0)

			loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
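			# the clip keeps log() away from zero; loss is cross-entropy summed over the batch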
			tf.summary.scalar("loss", loss)

			train_op = tf.train.AdagradOptimizer(0.01).minimize(
							loss, global_step=global_step)

			# Test trained model
			label = tf.argmax(y_, 1, name="label")
			prediction = tf.argmax(y, 1,name="prediction")
			correct_prediction = tf.equal(prediction, label)

			accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
			tf.summary.scalar("acc", accuracy)

			saver = tf.train.Saver()
			summary_op = tf.summary.merge_all()
			init_op = tf.global_variables_initializer()

		# Create a "supervisor", which oversees the training process and stores model state into HDFS
#		logdir = TFNode.hdfs_path(ctx, args.model)
		logdir = "hdfs:///tmp/" + args.model
		print("tensorflow model path: {0}".format(logdir))
		summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

		if args.mode == "train":
			sv = tf.train.Supervisor(is_chief=(task_index == 0),
								logdir=logdir,
								init_op=init_op,
								summary_op=None,
								saver=saver,
								global_step=global_step,
								summary_writer=summary_writer,
								stop_grace_secs=300,
								save_model_secs=10)
		else:
			sv = tf.train.Supervisor(is_chief=(task_index == 0),
								logdir=logdir,
								summary_op=None,
								saver=saver,
								global_step=global_step,
								stop_grace_secs=300,
								save_model_secs=0)

		# The supervisor takes care of session initialization, restoring from
		# a checkpoint, and closing when done or an error occurs.
		with sv.managed_session(server.target) as sess:
			print("{0} session ready".format(datetime.now().isoformat()))

			# Loop until the supervisor shuts down or 1000000 steps have completed.
			step = 0
			tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
			while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
				# Run a training step asynchronously.
				# See `tf.train.SyncReplicasOptimizer` for additional details on how to
				# perform *synchronous* training.

				# using feed_dict
				batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
				feed = {x: batch_xs, y_: batch_ys}

				if len(batch_xs) > 0:
					if args.mode == "train":
						_, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
						# print accuracy and save model checkpoint to HDFS every 100 steps
						if (step % 100 == 0):
							print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))

						if sv.is_chief:
							summary_writer.add_summary(summary, step)
							
					else: # args.mode == "inference"
						labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

						results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)]
						tf_feed.batch_results(results)
						print("acc: {0}".format(acc))

			if sv.should_stop() or step >= args.steps:
				tf_feed.terminate()
				writeFileToHDFS()

		# Ask for all the services to stop.
		print("{0} stopping supervisor".format(datetime.now().isoformat()))
		sv.stop()
Example #21
        def _spark_train(args, ctx):
            """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
            import tensorflow as tf
            from datetime import datetime
            from tensorflowonspark import TFNode

            class ExportHook(tf.train.SessionRunHook):
                def __init__(self, export_dir, input_tensor, output_tensor):
                    self.export_dir = export_dir
                    self.input_tensor = input_tensor
                    self.output_tensor = output_tensor

                def end(self, session):
                    print("{} ======= Exporting to: {}".format(
                        datetime.now().isoformat(), self.export_dir))
                    signatures = {
                        "test_key": {
                            'inputs': {
                                'features': self.input_tensor
                            },
                            'outputs': {
                                'prediction': self.output_tensor
                            },
                            'method_name':
                            tf.saved_model.signature_constants.
                            PREDICT_METHOD_NAME
                        }
                    }
                    TFNode.export_saved_model(session, self.export_dir,
                                              "test_tag", signatures)
                    print("{} ======= Done exporting".format(
                        datetime.now().isoformat()))

            tf.reset_default_graph()  # reset graph in case we're re-using a Spark python worker

            cluster, server = TFNode.start_cluster_server(ctx)
            if ctx.job_name == "ps":
                server.join()
            elif ctx.job_name == "worker":
                with tf.device(
                        tf.train.replica_device_setter(
                            worker_device="/job:worker/task:%d" %
                            ctx.task_index,
                            cluster=cluster)):
                    x = tf.placeholder(tf.float32, [None, 2], name='x')
                    y_ = tf.placeholder(tf.float32, [None, 1], name='y_')
                    w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
                    y = tf.matmul(x, w, name='y')
                    y2 = tf.square(y, name="y2")  # extra/optional output for testing multiple output tensors
                    global_step = tf.train.get_or_create_global_step()
                    cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
                    optimizer = tf.train.GradientDescentOptimizer(
                        0.5).minimize(cost, global_step)

                chief_hooks = [
                    ExportHook(ctx.absolute_path(args.export_dir), x, y)
                ] if args.export_dir else []
                with tf.train.MonitoredTrainingSession(
                        master=server.target,
                        is_chief=(ctx.task_index == 0),
                        checkpoint_dir=args.model_dir,
                        chief_only_hooks=chief_hooks) as sess:
                    tf_feed = TFNode.DataFeed(ctx.mgr,
                                              input_mapping=args.input_mapping)
                    while not sess.should_stop() and not tf_feed.should_stop():
                        batch = tf_feed.next_batch(10)
                        if args.input_mapping:
                            if len(batch['x']) > 0:
                                feed = {x: batch['x'], y_: batch['y_']}
                                sess.run(optimizer, feed_dict=feed)
Example #22
def map_fun(args, ctx):
  # from com.yahoo.ml.tf import TFNode
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time
  import logging

  logging.basicConfig(level=logging.INFO)
  log = logging.getLogger(__name__)  # `log` is used by the log.info calls below

  worker_num = ctx.worker_num  # number of workers
  job_name = ctx.job_name  # job name
  task_index = ctx.task_index  # task index
  cluster_spec = ctx.cluster_spec  # cluster spec

  IMAGE_PIXELS = 10  # image size; MNIST is 28x28x1 (adjust to match your own images)
  channels=4
  num_class=2

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps": # ps节点(主节点)
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128  # hidden layer units
  batch_size   = args.batch_size  # samples per training batch

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    xs = xs/255.0  # normalize pixel values to [0, 1]
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # ----------- plain NN model (replace with your own model) -----------
      '''
      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      # tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                              stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      # tf.summary.histogram("softmax_weights", sm_w)
      '''

      # Create some wrappers for simplicity
      def conv2d(x, W, b, strides=1):
        # Conv2D wrapper, with bias and relu activation
        x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
        x = tf.nn.bias_add(x, b)  # middle stride values of 1 mean no subsampling in x or y
        return tf.nn.relu(x)

      def maxpool2d(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='SAME')  # middle stride values of 2 downsample by 2 in x and y

      # Store layers weight & bias
      weights = {
        # 5x5 conv; color images have 3 input channels, grayscale images have 1
        'wc1': tf.Variable(tf.random_normal([5, 5, channels, 32])),  # 5x5 convolution kernel
        # 5x5 conv, 32 inputs, 64 outputs
        'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
        # fully connected, 7*7*64 inputs, 1024 outputs
        'wd1': tf.Variable(tf.random_normal([(1+IMAGE_PIXELS // 4) * (1+IMAGE_PIXELS // 4) * 64, 1024])),
        # 1024 inputs, 10 outputs (class prediction)
        'out': tf.Variable(tf.random_normal([1024, num_class]))
      }

      biases = {
        'bc1': tf.Variable(tf.random_normal([32])),
        'bc2': tf.Variable(tf.random_normal([64])),
        'bd1': tf.Variable(tf.random_normal([1024])),
        'out': tf.Variable(tf.random_normal([num_class]))
      }


      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS*channels], name="x") # mnist 28*28*1
      y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels]) # MNIST data is 28x28x1 (grayscale, one channel)
      # tf.summary.image("x_img", x_img)


      # convolutional model (replaces the plain NN above)
      conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
      conv1 = maxpool2d(conv1, k=2)
      conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
      conv2 = maxpool2d(conv2, k=2)
      fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
      fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
      fc1 = tf.nn.relu(fc1)
      if args.mode == "train":
        fc1 = tf.nn.dropout(fc1, 0.7)
      y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])

      '''
      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) # tf.nn.add(tf.nn.matmul(x,hid_w),hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
      '''
      # global_step = tf.Variable(0)

      global_step = tf.Variable(0, name="global_step", trainable=False)

      # loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

      loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

      # tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1,name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      # tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      # summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

      # ----------- the model above can be replaced with your own -----------

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir)) #
    log.info("tensorflow model path: {0}".format(logdir))
    # summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:  # open the session
      logging.basicConfig(level=logging.INFO)

      print("{0} session ready".format(datetime.now().isoformat()))
      log.info("{0} session ready".format(datetime.now().isoformat()))
      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            # _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            _, step = sess.run([train_op,  global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
              log.info("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
            if sv.is_chief:
              pass
              # summary_writer.add_summary(summary, step)
          else: # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))
            log.info("acc: {0}".format(acc))
      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    log.info("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
Example #23
def main_fun(args, ctx):
    import numpy as np
    import tensorflow as tf
    import tensorflow_datasets as tfds
    from tensorflowonspark import TFNode

    tfds.disable_progress_bar()

    BUFFER_SIZE = args.buffer_size
    BATCH_SIZE = args.batch_size
    LEARNING_RATE = args.learning_rate

    tf_feed = TFNode.DataFeed(ctx.mgr)

    def rdd_generator():
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) > 0:
                example = batch[0]
                image = np.array(example[0]).astype(np.float32) / 255.0
                image = np.reshape(image, (28, 28, 1))
                label = np.array(example[1]).astype(np.float32)
                label = np.reshape(label, (1, ))
                yield (image, label)
            else:
                return

    def input_fn(mode, input_context=None):
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Note: Spark is responsible for feeding data via streaming RDD
            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32),
                (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])))
            return ds.batch(BATCH_SIZE)
        else:
            raise Exception("I'm evaluating: mode={}, input_context={}".format(
                mode, input_context))

            # note: the code below is unreachable (leftover from a non-streaming variant
            # that evaluated against the tfds MNIST test split)
            def scale(image, label):
                image = tf.cast(image, tf.float32) / 255.0
                return image, label

            mnist = tfds.load(name='mnist', with_info=True, as_supervised=True)
            ds = mnist['test']
            if input_context:
                ds = ds.shard(input_context.num_input_pipelines,
                              input_context.input_pipeline_id)
            return ds.map(scale).batch(BATCH_SIZE)

    def serving_input_receiver_fn():
        features = tf.compat.v1.placeholder(dtype=tf.float32,
                                            shape=[None, 28, 28, 1],
                                            name='features')
        receiver_tensors = {'features': features}
        return tf.estimator.export.ServingInputReceiver(
            receiver_tensors, receiver_tensors)

    def model_fn(features, labels, mode):
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32,
                                   3,
                                   activation='relu',
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
        logits = model(features, training=False)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {'logits': logits}
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=LEARNING_RATE)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(labels,
                                                                        logits)
        loss = tf.reduce_sum(input_tensor=loss) * (1. / BATCH_SIZE)
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss)

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=optimizer.minimize(
                loss, tf.compat.v1.train.get_or_create_global_step()))

    # Note: the original example used MultiWorkerMirroredStrategy which is a synchronous training strategy.
    # Since streaming data arrives irregularly, we must use the asynchronous ParameterServerStrategy
    # to allow data to be processed as it arrives and to avoid deadlocks.
    # strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    strategy = tf.distribute.experimental.ParameterServerStrategy()
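    # ParameterServerStrategy reads the cluster definition from the TF_CONFIG environment
    # variable, which TensorFlowOnSpark populates for each task (chief/worker/ps)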
    config = tf.estimator.RunConfig(train_distribute=strategy,
                                    save_checkpoints_steps=100)

    classifier = tf.estimator.Estimator(model_fn=model_fn,
                                        model_dir=args.model_dir,
                                        config=config)

    # exporter = tf.estimator.FinalExporter("serving", serving_input_receiver_fn=serving_input_receiver_fn)

    tf.estimator.train_and_evaluate(
        classifier,
        train_spec=tf.estimator.TrainSpec(input_fn=input_fn),
        eval_spec=tf.estimator.EvalSpec(input_fn=input_fn)
        # eval_spec=tf.estimator.EvalSpec(input_fn=input_fn, exporters=exporter)
    )

    if ctx.job_name == 'chief':
        print("Exporting saved_model to {}".format(args.export_dir))
        classifier.export_saved_model(args.export_dir,
                                      serving_input_receiver_fn)
Example #24
def mainFun(args, ctx):
	import numpy as np
	import tensorflow as tf
	from tensorflow.keras.models import Sequential
	from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Input, BatchNormalization
	import tensorflow.keras as keras
	from tensorflowonspark import compat, TFNode

	# Setting distributed model strategy
	strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

	def buildAndCompileModel():

		# Initiating model
		model = Sequential()

		# Building model structure
		model.add(Input(shape=(1025, 50, 1)))

		# First convolution and pooling step
		model.add(Conv2D(16, kernel_size=[3,3], activation='relu', data_format='channels_last'))
		model.add(MaxPool2D(pool_size=[3,3], data_format='channels_last'))
		model.add(Dropout(0.2))

		# Second convolution and pooling step
		model.add(Conv2D(32, kernel_size=[3,3], activation='relu', data_format='channels_last'))
		model.add(MaxPool2D(pool_size=[3,3], data_format='channels_last'))
		model.add(Dropout(0.2))

		# Flattening output of convolution to pass on to Dense layers
		model.add(Flatten())
		model.add(BatchNormalization())
		model.add(Dense(128, activation='relu'))
		model.add(BatchNormalization())
		model.add(Dense(128, activation='relu'))

		# Output layer
		model.add(Dense(30, activation='softmax'))

		# Compiling model
		model.compile(
			loss='sparse_categorical_crossentropy',
			optimizer='adam',
			metrics=['accuracy']
		)

		return model

	# Opening up datafeed to iterate over entries
	tfFeed = TFNode.DataFeed(ctx.mgr, False)

	# Function to split data into features and labels
	def rddGenerator():
		while not tfFeed.should_stop():
			batch = tfFeed.next_batch(1)
			if len(batch) > 0:
				example = batch[0]

				# Splitting into X and y
				X = np.array(example[1]).astype(np.float32)
				y = np.array(example[0])

				# Encoding labels
				# note: np.unique over a single example always maps its label to 0;
				# a global label-to-index mapping should be built ahead of time instead
				_, y = np.unique(y, return_inverse=True)
				y = y.astype(np.float32)

				# Adjusting data shape
				X = X.reshape(-1, 50, 1)

				# Returning features and labels as separate arrays
				yield (X, y)
			else:
				return

	# Creating Tensorflow Dataset
	ds = tf.data.Dataset.from_generator(rddGenerator, (tf.float32, tf.float32), (tf.TensorShape([1025, 50, 1]), tf.TensorShape([1])))
	ds = ds.batch(1)
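	# each dataset element is a single (1025, 50, 1) feature array with its label,
	# so the batch size of 1 simply matches the one-record-at-a-time generator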

	# Instantiating Model
	with strategy.scope():
		multiWorkerModel = buildAndCompileModel()

	# Defining Training Parameters
	stepsPerEpoch = 600 // 1  # dataset size // batch size
	stepsPerWorker = stepsPerEpoch // 1  # steps per epoch // number of workers
	maxStepsPerWorker = stepsPerWorker * 0.9  # 90% of an epoch, to tolerate uneven partitions (unused below)

	# Fitting Model
	multiWorkerModel.fit(x = ds, epochs = 2, steps_per_epoch = stepsPerWorker)

	# Exporting log files for Tensorboard
	from tensorflow_estimator.python.estimator.export import export_lib
	exportDir = export_lib.get_timestamped_export_dir(args.export_dir)
	compat.export_saved_model(multiWorkerModel, exportDir, ctx.job_name == 'chief')

	# terminating the feed tells Spark to skip processing further partitions
	tfFeed.terminate()
Example #25
        def _spark_train(args, ctx):
            """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
            import tensorflow as tf
            from tensorflow.keras.models import Sequential
            from tensorflow.keras.layers import Dense
            from tensorflowonspark import TFNode

            tf.compat.v1.reset_default_graph()
            strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

            with strategy.scope():
                model = Sequential()
                model.add(Dense(1, activation='linear', input_shape=[2]))
                model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.2),
                              loss='mse',
                              metrics=['mse'])
                model.summary()

            tf_feed = TFNode.DataFeed(ctx.mgr,
                                      input_mapping=args.input_mapping)

            def rdd_generator():
                while not tf_feed.should_stop():
                    batch = tf_feed.next_batch(1)
                    if len(batch['x']) > 0:
                        features = batch['x'][0]
                        label = batch['y_'][0]
                        yield (features, label)
                    else:
                        return

            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32),
                (tf.TensorShape([2]), tf.TensorShape([1])))
            ds = ds.batch(args.batch_size)

            # disable auto-sharding dataset
            options = tf.data.Options()
            options.experimental_distribute.auto_shard = False
            ds = ds.with_options(options)

            # only train 90% of each epoch to account for uneven RDD partition sizes
            steps_per_epoch = 1000 * 0.9 // (args.batch_size * ctx.num_workers)
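            # e.g. with batch_size=10 and 2 workers: 1000 * 0.9 // 20 = 45 steps per worker per epoch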

            tf.io.gfile.makedirs(args.model_dir)
            filepath = args.model_dir + "/weights-{epoch:04d}"
            callbacks = [
                tf.keras.callbacks.ModelCheckpoint(
                    filepath=filepath,
                    verbose=1,
                    load_weights_on_restart=True,
                    save_weights_only=True)
            ]

            model.fit(ds,
                      epochs=args.epochs,
                      steps_per_epoch=steps_per_epoch,
                      callbacks=callbacks)

            # This fails with: "NotImplementedError: `fit_generator` is not supported for models compiled with tf.distribute.Strategy"
            # model.fit_generator(ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks)

            if ctx.job_name == 'chief' and args.export_dir:
                print("exporting model to: {}".format(args.export_dir))
                tf.keras.experimental.export_saved_model(
                    model, args.export_dir)

            tf_feed.terminate()
Example #26
def map_fun(args, ctx):
    # from com.yahoo.ml.tf import TFNode
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf
    from tensorflow.contrib.layers.python.layers import batch_norm
    import time
    import os

    worker_num = ctx.worker_num  # number of workers
    job_name = ctx.job_name  # job name
    task_index = ctx.task_index  # task index
    cluster_spec = ctx.cluster_spec  # cluster spec

    IMAGE_PIXELS = 2  # image size; MNIST is 28x28x1 (adjust to match your own images)
    channels = 3
    num_class = 2
    # global dropout
    dropout = args.dropout
    # Parameters
    # hidden_units = 128  # hidden layer units
    # training_epochs=args.epochs
    batch_size = args.batch_size  # samples per training batch
    # img_nums=630000
    # global learning_rate
    # learning_rate=args.learning_rate
    INITIAL_LEARNING_RATE = args.learning_rate
    # flag=True

    # batch_size=200

    num_examples_per_epoch_for_train = (4015 - 1)**2  # samples per training epoch
    num_batches_per_epoch = int(num_examples_per_epoch_for_train / batch_size)
    num_epochs_per_decay = 1.2
    learning_rate_decay_rate = 0.8
    learning_rate_decay_steps = int(num_batches_per_epoch *
                                    num_epochs_per_decay)
    """
  # --------- dynamic learning-rate schedule ---------
  # Constants describing the training process.
  # MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
  NUM_EPOCHS_PER_DECAY = batch_size  # Epochs after which learning rate decays.
  LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
  INITIAL_LEARNING_RATE = 0.1  # Initial learning rate.

  global_step1 = training_epochs * (img_nums // batch_size)  # Integer Variable counting the number of training steps
  # Variables that affect learning rate.
  num_batches_per_epoch = img_nums / batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

  # Decay the learning rate exponentially based on the number of steps.
  learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                             global_step1,
                                             decay_steps,
                                             LEARNING_RATE_DECAY_FACTOR,
                                             staircase=True)
# --------- end dynamic learning-rate schedule ---------
"""

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":  # ps节点(主节点)
        time.sleep((worker_num + 1) * 5)

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def feed_dict(batch):
        # Convert from [(images, labels)] to two numpy arrays of the proper type
        images = []
        labels = []
        if args.mode != 'inference':
            numpy.random.shuffle(batch)  # shuffle randomly
        for item in batch:
            images.append(item[0])
            labels.append(item[1])
        xs = numpy.array(images)
        xs = xs.astype(numpy.float32)
        # xs = xs/255.0  # normalize to [0, 1]
        # Z-score standardization:
        # mean = numpy.reshape(numpy.average(xs, 1), [numpy.shape(xs)[0], 1])
        # std = numpy.reshape(numpy.std(xs, 1), [numpy.shape(xs)[0], 1])
        # xs = (xs - mean) / std

        # min-max normalization
        max_ = numpy.reshape(numpy.max(xs, 1), [numpy.shape(xs)[0], 1])
        min_ = numpy.reshape(numpy.min(xs, 1), [numpy.shape(xs)[0], 1])

        xs = (xs - min_) / (max_ - min_)
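        # note: a constant-valued sample makes max_ == min_ and divides by zero; guard against that if it can occur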
        ys = numpy.array(labels)
        if args.mode != 'inference':
            ys = ys.astype(numpy.uint8)
        else:
            ys = ys.astype(numpy.uint16)
        return (xs, ys)

    def batch_norm_layer(inputT, is_training=True, scope=None):
        # Note: is_training is tf.placeholder(tf.bool) type
        return tf.cond(is_training,
                       lambda: batch_norm(inputT,
                                          is_training=True,
                                          center=True,
                                          scale=True,
                                          activation_fn=tf.nn.relu,
                                          decay=0.9,
                                          scope=scope),
                       lambda: batch_norm(inputT,
                                          is_training=False,
                                          center=True,
                                          scale=True,
                                          activation_fn=tf.nn.relu,
                                          decay=0.9,
                                          scope=scope))  # , reuse = True))

    if job_name == "ps":
        server.join()
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Create some wrappers for simplicity
            def conv2d(x, W, b, strides=1):
                # Conv2D wrapper, with bias and relu activation
                x = tf.nn.conv2d(x,
                                 W,
                                 strides=[1, strides, strides, 1],
                                 padding='SAME')
                x = tf.nn.bias_add(x, b)  # middle stride values of 1 mean no subsampling in x or y
                return tf.nn.relu(x)

            def maxpool2d(x, k=2):
                # MaxPool2D wrapper
                return tf.nn.max_pool(
                    x,
                    ksize=[1, k, k, 1],
                    strides=[1, k, k, 1],
                    padding='SAME')  # middle stride values of 2 downsample by 2 in x and y

            # Store layers weight & bias
            weights = {
                # 3x3 conv, `channels` inputs, 128 outputs; color images have 3 input channels, grayscale 1
                'wc1':
                tf.get_variable('wc1', [3, 3, channels, 128],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),  # 3x3 convolution kernel

                # 3x3 conv, 32 inputs, 64 outputs (unused: the second conv layer is commented out below)
                'wc2':
                tf.get_variable('wc2', [3, 3, 32, 64],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),

                # fully connected, (IMAGE_PIXELS // 2)^2 * 128 inputs, 1024 outputs
                'wd1':
                tf.Variable(
                    tf.random_normal([
                        (IMAGE_PIXELS // 2) * (IMAGE_PIXELS // 2) * 128, 1024
                    ])),
                # 1024 inputs, 10 outputs (class prediction)
                'out':
                tf.Variable(tf.random_normal([1024, num_class]))
            }

            biases = {
                'bc1':
                tf.get_variable('bc1', [128],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),
                'bc2':
                tf.get_variable('bc2', [64],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),
                'bd1':
                tf.Variable(tf.random_normal([1024])),
                'out':
                tf.Variable(tf.random_normal([num_class]))
            }

            # Placeholders or QueueRunner/Readers for input data
            x = tf.placeholder(tf.float32,
                               [None, IMAGE_PIXELS * IMAGE_PIXELS * channels],
                               name="x")  # mnist 28*28*1
            if args.mode != 'inference':
                y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
            else:
                y_ = tf.placeholder(tf.float32, [None, 4], name="y_")
                label = y_
            keep = tf.placeholder(tf.float32)
            is_training = tf.placeholder(tf.bool, name='MODE')

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])  # e.g. MNIST would be 28x28x1 (grayscale)

            # x_img=batch_norm_layer(x_img,is_training)
            x_img = tf.nn.lrn(x_img,
                              depth_radius=5,
                              bias=2.0,
                              alpha=1e-3,
                              beta=0.75)  # LRN (local response normalization) layer

            # 改成卷积模型
            conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
            conv1 = maxpool2d(conv1, k=2)  # shape [N, 1, 1, 128] for IMAGE_PIXELS=2
            conv1 = tf.nn.lrn(conv1,
                              depth_radius=5,
                              bias=2.0,
                              alpha=1e-3,
                              beta=0.75)  # LRN (local response normalization) layer
            # conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
            # conv2 = maxpool2d(conv2, k=2)  # shape [N,1,1,32]
            # conv1 = tf.nn.dropout(conv1, keep+0.1)
            fc1 = tf.reshape(conv1,
                             [-1, weights['wd1'].get_shape().as_list()[0]])
            fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
            # fc1=batch_norm_layer(fc1, is_training)
            fc1 = tf.nn.relu(fc1)
            fc1 = tf.nn.dropout(fc1, keep)
            y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
            prediction = tf.argmax(y, 1, name="prediction")
            # y = tf.sigmoid(y)  # for binary classification; use tf.nn.softmax() for multi-class

            global_step = tf.Variable(0, name="global_step", trainable=False)

            # loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            if args.mode != 'inference':
                loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(labels=y_,
                                                            logits=y))

                # learning_rate=tf.train.exponential_decay(INITIAL_LEARNING_RATE,global_step,
                #                                          learning_rate_decay_steps,learning_rate_decay_rate,
                #                                          staircase=False)

                # learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                #                                            global_step,
                #                                            10000,
                #                                            0.96,
                #                                            staircase=False)
                learning_rate = tf.train.polynomial_decay(
                    INITIAL_LEARNING_RATE, global_step, 3000000, 1e-5, 0.8,
                    True)
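                # decays from INITIAL_LEARNING_RATE to 1e-5 over 3,000,000 steps with power 0.8, restarting when cycle=True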
                # run steps : decay_steps ratio > 1000:1
                # train_op = tf.train.AdagradOptimizer(learning_rate).minimize(
                #     loss, global_step=global_step)

                train_op = tf.train.GradientDescentOptimizer(
                    learning_rate).minimize(loss, global_step=global_step)

                # Test trained model
                label = tf.argmax(y_, 1, name="label")
                # prediction = tf.argmax(y, 1,name="prediction")
                correct_prediction = tf.equal(prediction, label)

                accuracy = tf.reduce_mean(tf.cast(correct_prediction,
                                                  tf.float32),
                                          name="accuracy")
                # tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()

            # summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))  #
        # log.info("tensorflow model path: {0}".format(logdir))
        # summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(
                is_chief=(task_index == 0),
                logdir=logdir,
                init_op=init_op,
                # summary_op=None,
                saver=saver,
                # saver=None,  # None disables automatic checkpoint saving
                # recovery_wait_secs=1,
                global_step=global_step,
                stop_grace_secs=300,
                save_model_secs=10)
        elif args.mode == "retrain":
            sv = tf.train.Supervisor(
                is_chief=(task_index == 0),
                logdir=logdir,
                # init_op=init_op,
                # summary_op=None,
                # saver=None,  # None disables automatic checkpoint saving
                saver=saver,
                # recovery_wait_secs=1,
                global_step=global_step,
                stop_grace_secs=300,
                save_model_secs=10)
        else:
            sv = tf.train.Supervisor(
                is_chief=(task_index == 0),
                logdir=logdir,
                # summary_op=None,
                saver=saver,
                # recovery_wait_secs=1,
                global_step=global_step,
                stop_grace_secs=300,
                save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:  # open the session
            """
      # check whether a checkpoint was saved previously
      ckpt = tf.train.get_checkpoint_state(logdir)
      if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess,ckpt.model_checkpoint_path)
      """
            # global_step=int(ckpt.model_checkpoint_path.rsplit('-',1)[1])
            # else:
            #   sess.run(init_op)

            print("{0} session ready".format(datetime.now().isoformat()))
            # log.info("{0} session ready".format(datetime.now().isoformat()))
            # Loop until the supervisor shuts down or 1000000 steps have completed.
            step = 0
            # acc1=args.acc
            # n = 0
            tf_feed = TFNode.DataFeed(
                ctx.mgr, args.mode == "train" or args.mode == "retrain")
            while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                # using feed_dict
                batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
                feed = {
                    x: batch_xs,
                    y_: batch_ys,
                    keep: dropout,
                    is_training: True
                }
                if len(batch_xs) > 0:
                    if args.mode == "train" or args.mode == "retrain":
                        # _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
                        _, step = sess.run([train_op, global_step],
                                           feed_dict=feed)
                        '''
            if dropout > 0.2:
                if step%10000==0:dropout=dropout*0.85
            else:
                dropout=0.7
            '''
                        """
            acc=sess.run(accuracy,{x: batch_xs, y_: batch_ys,keep:1.})
            if acc>acc1:
              if flag and acc>0.9:
                os.popen('hdfs dfs -rm -r '+logdir+'/*') # clear all files under the HDFS model directory
                flag=False
              # acc1=acc # update once training has progressed far enough
              saver.save(sess,logdir+'/'+args.model_name,global_step=step)
              n=0
              # learning_rate=1e-3
              # dropout=.7
            else:
              n += 1
              if n > 100:
                ckpt1 = tf.train.get_checkpoint_state(logdir)
                if ckpt1 and ckpt1.model_checkpoint_path:
                  saver.restore(sess, ckpt1.model_checkpoint_path)
                if learning_rate > 1e-7:
                  # learning_rate = learning_rate * .96**(step/10)
                  learning_rate = learning_rate * .8
                else:
                  learning_rate = 1e-3
                if dropout > 0.2:
                  dropout = dropout * .85
                else:
                  dropout = .7
            """

                        # print accuracy and save model checkpoint to HDFS every 100 steps
                        if (step % 100 == 0):
                            print("{0} step: {1} accuracy: {2}".format(
                                datetime.now().isoformat(), step,
                                sess.run(
                                    accuracy, {
                                        x: batch_xs,
                                        y_: batch_ys,
                                        keep: 1.,
                                        is_training: False
                                    })))
                            # log.info("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
                        if sv.is_chief:
                            pass
                            # summary_writer.add_summary(summary, step)
                    elif args.mode == 'test':
                        feed2 = {
                            x: batch_xs,
                            y_: batch_ys,
                            keep: 1.,
                            is_training: False
                        }
                        labels, preds, acc = sess.run(
                            [label, prediction, accuracy], feed_dict=feed2)
                        results = [
                            "{0} Label: {1}, Prediction: {2}".format(
                                datetime.now().isoformat(), l, p)
                            for l, p in zip(labels, preds)
                        ]
                        tf_feed.batch_results(results)
                        print("acc: {0}".format(acc))
                    else:  # args.mode == "inference"
                        feed2 = {
                            x: batch_xs,
                            y_: batch_ys,
                            keep: 1.,
                            is_training: False
                        }
                        # labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed2)
                        labels, preds = sess.run([label, prediction],
                                                 feed_dict=feed2)
                        # results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)]
                        results = [
                            "Label: {0}, Prediction: {1}".format(l, p)
                            for l, p in zip(labels, preds)
                        ]
                        tf_feed.batch_results(results)
                        # print("acc: {0}".format(acc))
                        # log.info("acc: {0}".format(acc))
            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        # log.info("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
Example #27
def map_fun(args, ctx):
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf
    import time
    import logging
    import cnn_lstm_ctc_ocr
    #import redis_logger_handler
    #redis_logger_handler.logging_setup(args.redis)

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    worker_name = '(worker:%s tf:%s idx:%s)' % (worker_num, job_name,
                                                task_index)

    logging.info(
        '{0} batch_size:{1} initial_learning_rate:{2} decay_steps:{3} decay_rate:{4} momentum:{5}'
        .format(worker_name, args.batch_size, args.initial_learning_rate,
                args.decay_steps, args.decay_rate, args.momentum))
    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    CHANNELS = 1
    IMAGE_WIDTH = 120
    IMAGE_HEIGHT = 45

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def sparse_tuple_from_label(sequences, dtype=numpy.int32):
        indices = []
        values = []
        for n, seq in enumerate(sequences):
            indices.extend(zip([n] * len(seq), range(len(seq))))
            values.extend(seq)
        indices = numpy.asarray(indices, dtype=numpy.int64)
        values = numpy.asarray(values, dtype=dtype)
        shape = numpy.asarray(
            [len(sequences),
             numpy.asarray(indices).max(0)[1] + 1],
            dtype=numpy.int64)
        return indices, values, shape
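    # e.g. sparse_tuple_from_label([[1, 2], [3]]) -> indices [[0,0],[0,1],[1,0]], values [1,2,3], shape [2,2]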

    def get_input_lens(sequences):
        lengths = numpy.asarray([58 for s in sequences], dtype=numpy.int64)
        return sequences, lengths

    def placeholder_inputs(image_width, image_height, channels):
        images_placeholder = tf.placeholder(
            tf.float32, [None, image_height, image_width, channels])
        labels_placeholder = tf.sparse_placeholder(tf.int32)
        seqlen_placeholder = tf.placeholder(tf.int32, [None])
        keep_prob = tf.placeholder(tf.float32)
        return images_placeholder, labels_placeholder, seqlen_placeholder, keep_prob

    def format_batch(data_set, batch_size, image_height, image_width,
                     channels):
        batch = data_set.next_batch(batch_size)
        images = []
        labels = []
        for item in batch:
            images.append(item[0])
            labels.append(item[1])
        xs = numpy.array(images)
        # [batch_size, height * width] => [batch_size, height, width, channels]
        xs = xs.reshape(batch_size, image_height, image_width, channels)
        xs = xs.astype(numpy.float32)
        xs = xs / 255.
        ys = labels
        return xs, ys

    def fill_feed_dict(xs,
                       ys,
                       images_pl,
                       labels_pl,
                       seqlen_pl,
                       keep_prob,
                       train=True):
        images_feed, seqlen_feed = get_input_lens(xs)
        labels_feed = sparse_tuple_from_label(ys)
        if train:
            feed_dict = {
                images_pl: images_feed,
                labels_pl: labels_feed,
                seqlen_pl: seqlen_feed,
                keep_prob: 0.5,
            }
        else:
            feed_dict = {
                images_pl: images_feed,
                labels_pl: labels_feed,
                seqlen_pl: seqlen_feed,
                keep_prob: 1,
            }
        return feed_dict

    def do_eval(sess, dense_decoded, lastbatch_err, learning_rate,
                images_placeholder, labels_placeholder, seqlen_placeholder,
                keep_prob, train, xs, ys):
        true_count = 0  # Counts the number of correct predictions.
        feed_dict = fill_feed_dict(xs, ys, images_placeholder,
                                   labels_placeholder, seqlen_placeholder,
                                   keep_prob, train)
        dd, lerr, lr = sess.run([dense_decoded, lastbatch_err, learning_rate],
                                feed_dict=feed_dict)
        #accuracy calculation
        for i, origin_label in enumerate(ys):
            decoded_label = [j for j in dd[i] if j != -1]
            if i < 10:
                logging.info('{0} seq {1} => origin:{2} decoded:{3}'.format(
                    worker_name, i, origin_label, decoded_label))
            if origin_label == decoded_label:
                true_count += 1
        #accuracy
        acc = true_count * 1.0 / len(ys)
        #print subsummary
        logging.info(
            "%s accuracy = %.3f, lastbatch_err = %.3f, learning_rate = %.8f" %
            (worker_name, acc, lerr, lr))

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):
            # Generate placeholders for the images, labels and seqlens.
            images_placeholder, labels_placeholder, seqlen_placeholder, keep_prob = placeholder_inputs(
                IMAGE_WIDTH, IMAGE_HEIGHT, CHANNELS)
            # Build a Graph that computes predictions from the inference model.
            #images_lp, seqlen_lp, num_features, num_layers, hidden_units
            logits = cnn_lstm_ctc_ocr.inference(images_placeholder,
                                                seqlen_placeholder, keep_prob,
                                                args.hidden_units, args.mode,
                                                args.batch_size)
            # Add to the Graph the Ops for loss calculation.
            #logits, labels_lp, seqlen_lp
            loss = cnn_lstm_ctc_ocr.loss(logits, labels_placeholder,
                                         seqlen_placeholder)
            tf.summary.scalar('loss', loss)
            # global counter
            global_step = tf.Variable(0, name='global_step', trainable=False)
            # Add to the Graph the Ops that calculate and apply gradients.
            #loss, initial_learning_rate, decay_steps, decay_rate, momentum
            train_op, learning_rate = cnn_lstm_ctc_ocr.training(
                loss, global_step, args.initial_learning_rate,
                args.decay_steps, args.decay_rate, args.momentum)
            # Add the Op to compare the logits to the labels during evaluation.
            dense_decoded, lerr = cnn_lstm_ctc_ocr.evaluation(
                logits, labels_placeholder, seqlen_placeholder)
            tf.summary.scalar('lerr', lerr)

            summary_op = tf.summary.merge_all()
            # Add the variable initializer Op.
            init_op = tf.global_variables_initializer()
            # Create a saver for writing training checkpoints.
            saver = tf.train.Saver()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        logging.info("{0} tensorflow model path: {1}".format(
            worker_name, logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                               graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=60)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)
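        # (save_model_secs=0 disables the Supervisor's periodic checkpointing
        # in non-train modes; existing checkpoints are only restored)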

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        validation_xs = None
        validation_ys = None
        validation_batchs = 10
        with sv.managed_session(server.target) as sess:
            logging.info("{0} session ready".format(worker_name))
            # Loop until the supervisor shuts down, the feed ends, or
            # (args.steps * args.epochs) training steps have completed.
            g_step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
            # reserve a fixed sample of batches up front for do_eval
            if validation_xs is None or validation_ys is None:
                validation_xs, validation_ys = format_batch(
                    tf_feed, args.batch_size * validation_batchs, IMAGE_HEIGHT,
                    IMAGE_WIDTH, CHANNELS)
            while (not sv.should_stop() and not tf_feed.should_stop()
                   and g_step < (args.steps * args.epochs - validation_batchs)):
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                start_time = time.time()
                # using feed_dict
                xs, ys = format_batch(tf_feed, args.batch_size, IMAGE_HEIGHT,
                                      IMAGE_WIDTH, CHANNELS)
                feed_dict = fill_feed_dict(xs, ys, images_placeholder,
                                           labels_placeholder,
                                           seqlen_placeholder, keep_prob,
                                           args.mode == "train")
                # Run one step of the model.  The return values are the activations
                # from the `train_op` (which is discarded) and the `loss` Op.  To
                # inspect the values of your Ops or variables, you may include them
                # in the list passed to sess.run() and the value tensors will be
                # returned in the tuple from the call.
                _, loss_value, g_step = sess.run([train_op, loss, global_step],
                                                 feed_dict=feed_dict)
                duration = time.time() - start_time
                if g_step % 20 == 0:
                    # Print status to stdout.
                    logging.info(
                        '%s [g_step:%d epoch:%d/%d step:%d/%d] loss = %.2f (%.3f sec)'
                        % (worker_name, g_step, g_step // args.steps,
                           args.epochs, g_step % args.steps, args.steps,
                           loss_value, duration))
                # Write the summaries and print an overview fairly often.
                if g_step % 100 == 0:
                    # Update the events file.
                    if sv.is_chief:
                        summary = sess.run(summary_op, feed_dict=feed_dict)
                        summary_writer.add_summary(summary, g_step)
                        summary_writer.flush()

                # Save a checkpoint and evaluate the model periodically.
                if (g_step + 1) % 500 == 0 or (g_step + 1) == args.steps:
                    # Evaluate against the validation set.
                    logging.info('{0} ---- Validation Data Eval: ----'.format(
                        worker_name))
                    do_eval(sess, dense_decoded, lerr, learning_rate,
                            images_placeholder, labels_placeholder,
                            seqlen_placeholder, keep_prob,
                            args.mode == "train", validation_xs, validation_ys)

            if sv.should_stop() or g_step >= (args.steps * args.epochs -
                                              validation_batchs):
                logging.info("{0} terminating tf_feed".format(worker_name))
                tf_feed.terminate()

        # Ask for all the services to stop.
        logging.info("{0} stopping supervisor".format(worker_name))
        sv.stop()
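
        # The loop comments above point at tf.train.SyncReplicasOptimizer for
        # synchronous training. A minimal sketch of that swap (hypothetical
        # `num_workers`; this example actually trains asynchronously):
        #
        #   opt = tf.train.MomentumOptimizer(args.initial_learning_rate, args.momentum)
        #   opt = tf.train.SyncReplicasOptimizer(opt,
        #                                        replicas_to_aggregate=num_workers,
        #                                        total_num_replicas=num_workers)
        #   train_op = opt.minimize(loss, global_step=global_step)
        #   sync_hook = opt.make_session_run_hook(is_chief=(task_index == 0))
        #
        # and then run the loop inside tf.train.MonitoredTrainingSession(
        #     master=server.target, is_chief=(task_index == 0),
        #     hooks=[sync_hook]) instead of the Supervisor above.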
Example #28
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec

  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  batch_size = args.batch_size

  cluster, server = TFNode.start_cluster_server(ctx, 1)

  def feed_dict(batch):
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    x_initial = numpy.array(images)
    x_objdump = x_initial[:,519:719]
    x_cnn = numpy.empty((0, 200), dtype=numpy.float64)
    for i in range(len(images)):
      # one-hot encode each of the 200 opcode columns over 120 possible values
      x_cnn_batch = numpy.zeros((200, 120), dtype=numpy.float64)
      for j in range(0, 200):
        x_cnn_batch[j, int(x_objdump[i, j])] = 1.0
      x_cnn_batch = numpy.transpose(x_cnn_batch)
      x_cnn = numpy.append(x_cnn, x_cnn_batch, axis=0)
    x_peinfo = x_initial[:,0:519]
    ys = numpy.array(labels)
    return (x_peinfo.reshape(-1,519,1,1),x_cnn.reshape(-1, 200, 120, 1), ys)
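
  # A hedged aside (not called above): the per-sample one-hot loops in
  # feed_dict can be vectorized with numpy indexing. Note that feed_dict
  # transposes each (200, 120) block before appending and then reshapes,
  # which permutes the layout, so this natural-layout sketch is not a
  # bit-for-bit replacement.
  def feed_dict_vectorized(batch):
    images = numpy.array([item[0] for item in batch])
    labels = numpy.array([item[1] for item in batch])
    x_objdump = images[:, 519:719].astype(numpy.int64)      # (n, 200) opcode ids
    x_cnn = numpy.eye(120, dtype=numpy.float64)[x_objdump]  # (n, 200, 120) one-hot
    x_peinfo = images[:, 0:519]
    return (x_peinfo.reshape(-1, 519, 1, 1),
            x_cnn.reshape(-1, 200, 120, 1), labels)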

  def conv2d(x, W):
      return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

  # note: despite their names, both pooling helpers perform *average* pooling
  def max_pool_1(x):
      return tf.nn.avg_pool(x, ksize=[1, 2, 1, 1], strides=[1, 2, 1, 1], padding='SAME')

  def max_pool_2(x):
      return tf.nn.avg_pool(x, ksize=[1, 100, 1, 1], strides=[1, 100, 1, 1], padding='SAME')

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):
      # Build NN-Network
      W_mlp_1 = tf.Variable(tf.truncated_normal([519, 519], stddev=0.1), name="W_mlp_1")
      b_mlp_1 = tf.Variable(tf.constant(0.1, shape=[519]), name="b_mlp_1")
      tf.summary.histogram("W_mlp_1", W_mlp_1)
      W_mlp_2 = tf.Variable(tf.truncated_normal([519, 519], stddev=0.1), name="W_mlp_2")
      b_mlp_2 = tf.Variable(tf.constant(0.1, shape=[519]), name="b_mlp_2")
      tf.summary.histogram("W_mlp_2", W_mlp_2)

      W_conv1 = tf.Variable(tf.truncated_normal([3, 120, 1, 3], stddev=0.1), name="W_conv1")
      b_conv1 = tf.Variable(tf.constant(0.1, shape=[3]), name="b_conv1")
      tf.summary.histogram("W_conv1", W_conv1)
      W_conv2 = tf.Variable(tf.truncated_normal([3, 120, 3, 6], stddev=0.1), name="W_conv2")
      b_conv2 = tf.Variable(tf.constant(0.1, shape=[6]), name="b_conv2")
      tf.summary.histogram("W_conv2", W_conv2)

      sm_w = tf.Variable(tf.truncated_normal([1239, 10], stddev=0.1), name="sm_w")
      sm_b = tf.Variable(tf.constant(0.1, shape=[10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      x_cnn = tf.placeholder(tf.float32, [None, 200,120,1], name="x_cnn")
      x_mlp = tf.placeholder(tf.float32, [None, 519,1,1], name="x_mlp")
      y_ = tf.placeholder(tf.float32, [None, 10], name="y_")
      tf.summary.image("x_cnn", x_cnn)
      tf.summary.image("x_mlp", x_mlp)

      x_mlp_new = tf.reshape(x_mlp, [-1, 519])
      h_mlp_1 = tf.nn.xw_plus_b(x_mlp_new, W_mlp_1, b_mlp_1)
      h_mlp_2 = tf.nn.xw_plus_b(h_mlp_1, W_mlp_2, b_mlp_2)
      h_conv1 = tf.nn.relu(conv2d(x_cnn, W_conv1) + b_conv1)
      h_pool1 = max_pool_1(h_conv1)
      h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
      h_pool2 = max_pool_2(h_conv2)
      h_conv2_flat = tf.reshape(h_pool2, [-1, 120*6])
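      # shape trace: x_cnn (n,200,120,1) -> conv1 (n,200,120,3) -> pool1
      # (n,100,120,3) -> conv2 (n,100,120,6) -> pool2 (n,1,120,6) -> flat (n,720)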

      h_inter = tf.concat([h_mlp_2, h_conv2_flat], 1)
      # compute unscaled logits separately: softmax_cross_entropy_with_logits
      # expects logits, not softmax probabilities
      logits = tf.nn.xw_plus_b(h_inter, sm_w, sm_b)
      y = tf.nn.softmax(logits)

      global_step = tf.Variable(0, name="global_step", trainable=False)
      loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_))
      tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(0.001).minimize(
          loss, global_step=global_step)

      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1,name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # guard against an empty batch at end-of-feed before slicing arrays
        batch = tf_feed.next_batch(batch_size)
        if len(batch) > 0:
          batch_mlp, batch_xs, batch_ys = feed_dict(batch)
          feed = {x_mlp: batch_mlp, x_cnn: batch_xs, y_: batch_ys}

          if args.mode == "train":
            _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            if step % 10 == 0:
              print("{0} step: {1} accuracy: {2}".format(
                  datetime.now().isoformat(), step, sess.run(accuracy, feed_dict=feed)))
            if sv.is_chief:
              summary_writer.add_summary(summary, step)

          elif args.mode == "inference":
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)
            results = ["Label: {0}, Prediction: {1}".format(l, p) for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))

          else:
            preds = sess.run(prediction, feed_dict={x_mlp: batch_mlp, x_cnn: batch_xs})
            results = ["Sha256: {0}, Prediction: {1}".format(l, p) for l, p in zip(batch_ys, preds)]
            tf_feed.batch_results(results)
            print(results)
            
      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
Example #29
def main_fun(args, ctx):
    import tensorflow as tf
    import argparse
    import time
    import os
    from six.moves import cPickle
    from model import Model
    from tensorflowonspark import TFNode
    from datetime import datetime
    import numpy as np

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    num_workers = len(cluster_spec['worker'])

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    if job_name == "ps":
        server.join()
    else:
        with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:%d" % task_index,
                                                    cluster=cluster)):
            model = Model(args)
            # instrument for tensorboard
            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        logdir = TFNode.hdfs_path(ctx, args.save_dir)

        print("tensorflow model path: {0}".format(logdir))

        summary_writer = TFNode.get_summary_writer(ctx)

        sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                logdir=logdir,
                                init_op=init_op,
                                summary_op=None,
                                saver=saver,
                                global_step=model.global_step,
                                stop_grace_secs=300, save_model_secs=10)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(
                datetime.now().isoformat()))

            state = sess.run(model.initial_state)

            # Loop until the supervisor shuts down, the feed ends, or
            # args.steps training steps have completed.
            step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, True)
            while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                # using feed_dict
                batch = tf_feed.next_batch(args.batch_size)
                batch_xs = np.asarray([data[0] for data in batch])
                batch_ys = np.asarray([data[1] for data in batch])

                feed = {model.input_data: batch_xs, model.targets: batch_ys}

                # carry the RNN state across batches: feed the previous batch's
                # final (c, h) state in as this batch's initial state
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h

                if len(batch_xs) > 0:
                    # instrument for tensorboard
                    summ, train_loss, state, _, step = sess.run(
                        [summary_op, model.cost, model.final_state, model.train_op, model.global_step], feed_dict=feed)

                    # print loss
                    print("Step: {}, train_loss: {}".format(step, train_loss))

                    # write summaries on the chief only; kept inside this block
                    # so `summ` is always defined when used
                    if sv.is_chief:
                        summary_writer.add_summary(summ, step)

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
Example #30
def main_fun(args, ctx):
    import numpy
    import os
    import tensorflow as tf
    import tensorflow.contrib.keras as keras
    from tensorflow.contrib.keras.api.keras import backend as K
    from tensorflow.contrib.keras.api.keras.models import Sequential, load_model, save_model
    from tensorflow.contrib.keras.api.keras.layers import Dense, Dropout
    from tensorflow.contrib.keras.api.keras.optimizers import RMSprop
    from tensorflow.contrib.keras.python.keras.callbacks import LambdaCallback, TensorBoard

    from tensorflow.python.saved_model import builder as saved_model_builder
    from tensorflow.python.saved_model import tag_constants
    from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def

    from tensorflowonspark import TFNode

    cluster, server = TFNode.start_cluster_server(ctx)

    if ctx.job_name == "ps":
        server.join()
    elif ctx.job_name == "worker":

        def generate_rdd_data(tf_feed, batch_size):
            print("generate_rdd_data invoked")
            while True:
                batch = tf_feed.next_batch(batch_size)
                imgs = []
                lbls = []
                for item in batch:
                    imgs.append(item[0])
                    lbls.append(item[1])
                images = numpy.array(imgs).astype('float32') / 255
                labels = numpy.array(lbls).astype('float32')
                yield (images, labels)
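
        # Note: generate_rdd_data is an infinite generator; model.fit_generator
        # below bounds consumption via steps_per_epoch and epochs.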

        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % ctx.task_index,
                    cluster=cluster)):

            IMAGE_PIXELS = 28
            batch_size = 100
            num_classes = 10

            # the data, shuffled and split between train and test sets
            if args.input_mode == 'tf':
                from tensorflow.contrib.keras.api.keras.datasets import mnist
                (x_train, y_train), (x_test, y_test) = mnist.load_data()
                x_train = x_train.reshape(60000, 784)
                x_test = x_test.reshape(10000, 784)
                x_train = x_train.astype('float32') / 255
                x_test = x_test.astype('float32') / 255

                # convert class vectors to binary class matrices
                y_train = keras.utils.to_categorical(y_train, num_classes)
                y_test = keras.utils.to_categorical(y_test, num_classes)
            else:  # args.input_mode == 'spark'
                # placeholders for Spark-fed data; note these are not consumed
                # by the fit_generator() path below, which feeds numpy arrays
                x_train = tf.placeholder(tf.float32,
                                         [None, IMAGE_PIXELS * IMAGE_PIXELS],
                                         name="x_train")
                y_train = tf.placeholder(tf.float32, [None, 10],
                                         name="y_train")

            model = Sequential()
            model.add(Dense(512, activation='relu', input_shape=(784, )))
            model.add(Dropout(0.2))
            model.add(Dense(512, activation='relu'))
            model.add(Dropout(0.2))
            model.add(Dense(10, activation='softmax'))

            model.summary()

            model.compile(loss='categorical_crossentropy',
                          optimizer=RMSprop(),
                          metrics=['accuracy'])

        saver = tf.train.Saver()

        with tf.Session(server.target) as sess:
            K.set_session(sess)

            def save_checkpoint(epoch, logs=None):
                if epoch == 1:
                    tf.train.write_graph(sess.graph.as_graph_def(),
                                         args.model_dir, 'graph.pbtxt')
                saver.save(sess,
                           os.path.join(args.model_dir, 'model.ckpt'),
                           global_step=epoch * args.steps_per_epoch)

            ckpt_callback = LambdaCallback(on_epoch_end=save_checkpoint)
            tb_callback = TensorBoard(log_dir=args.model_dir,
                                      histogram_freq=1,
                                      write_graph=True,
                                      write_images=True)

            # add callbacks to save model checkpoint and tensorboard events (on worker:0 only)
            callbacks = [ckpt_callback, tb_callback] if ctx.task_index == 0 else None

            if args.input_mode == 'tf':
                # train & validate on in-memory data
                history = model.fit(x_train,
                                    y_train,
                                    batch_size=batch_size,
                                    epochs=args.epochs,
                                    verbose=1,
                                    validation_data=(x_test, y_test),
                                    callbacks=callbacks)
            else:  # args.input_mode == 'spark'
                # train on data read from a generator which is producing data from a Spark RDD
                tf_feed = TFNode.DataFeed(ctx.mgr)
                history = model.fit_generator(
                    generator=generate_rdd_data(tf_feed, batch_size),
                    steps_per_epoch=args.steps_per_epoch,
                    epochs=args.epochs,
                    verbose=1,
                    callbacks=callbacks)

            if args.export_dir and ctx.job_name == 'worker' and ctx.task_index == 0:
                # save a local Keras model, so we can reload it with an inferencing learning_phase
                save_model(model, "tmp_model")

                # reload the model
                K.set_learning_phase(False)
                new_model = load_model("tmp_model")

                # export a saved_model for inferencing
                builder = saved_model_builder.SavedModelBuilder(
                    args.export_dir)
                signature = predict_signature_def(
                    inputs={'images': new_model.input},
                    outputs={'scores': new_model.output})
                builder.add_meta_graph_and_variables(
                    sess=sess,
                    tags=[tag_constants.SERVING],
                    signature_def_map={'predict': signature},
                    clear_devices=True)
                builder.save()

            if args.input_mode == 'spark':
                tf_feed.terminate()
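
# For completeness, a minimal sketch (assuming TF 1.x and access to the same
# args.export_dir) of reloading the SavedModel exported above; the 'predict'
# signature name and the 'images'/'scores' keys match the export call.
def load_and_predict(export_dir, images):
    import tensorflow as tf
    with tf.Session(graph=tf.Graph()) as sess:
        meta_graph = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], export_dir)
        sig = meta_graph.signature_def['predict']
        x = sess.graph.get_tensor_by_name(sig.inputs['images'].name)
        scores = sess.graph.get_tensor_by_name(sig.outputs['scores'].name)
        return sess.run(scores, feed_dict={x: images})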