def test_inputmode_spark(self):
        """Distributed TF cluster w/ InputMode.SPARK"""
        def _map_fun(args, ctx):
            import tensorflow as tf

            tf_feed = TFNode.DataFeed(ctx.mgr, False)
            while not tf_feed.should_stop():
                batch = tf_feed.next_batch(batch_size=10)
                print("batch: {}".format(batch))
                squares = tf.math.square(batch)
                print("squares: {}".format(squares))
                tf_feed.batch_results(squares.numpy())

        input = [[x] for x in range(1000)]  # set up input as tensors of shape [1] to match placeholder
        rdd = self.sc.parallelize(input, 10)
        cluster = TFCluster.run(self.sc,
                                _map_fun,
                                tf_args={},
                                num_executors=self.num_workers,
                                num_ps=0,
                                input_mode=TFCluster.InputMode.SPARK)
        rdd_out = cluster.inference(rdd)
        rdd_sum = rdd_out.sum()
        self.assertEqual(rdd_sum, sum([x * x for x in range(1000)]))
        cluster.shutdown()
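A condensed, illustrative sketch of the executor-side feed loop these InputMode.SPARK examples share (it assumes only the TFNode API used above; the empty-batch guard mirrors the examples further down, since next_batch() may return fewer than batch_size items, or none, near the end of the data):

from tensorflowonspark import TFNode

def _sketch_map_fun(args, ctx):
    # Sketch only: ctx is the node context TensorFlowOnSpark passes to map_fun.
    tf_feed = TFNode.DataFeed(ctx.mgr, False)  # second argument False, as in the tests above
    while not tf_feed.should_stop():
        batch = tf_feed.next_batch(10)  # may be shorter than 10, or empty
        if len(batch) > 0:
            tf_feed.batch_results([row for row in batch])  # one result per input row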
Example #2
 def train(self,
           data_rdd,
           model_rdd,
           batch_size,
           epochs,
           model_dir,
           go_on=False):
     n_samples = data_rdd.count()
     # steps_per_epoch = n_samples // batch_size // self.num_workers
     steps_per_epoch = math.ceil(n_samples / batch_size / self.num_workers)
     assert steps_per_epoch > 0
     md = ModelDir(model_dir, 'train*')
     if go_on:
         md.create_model_dir()
     else:
         md = md.rebuild_model_dir()
     worker = TFTrainWorker(model_rdd,
                            go_on=go_on,
                            batch_size=batch_size,
                            epochs=epochs,
                            steps_per_epoch=steps_per_epoch,
                            **md.to_dict())
     cluster = TFCluster.run(self.sc,
                             worker,
                             self.tf_args,
                             self.cluster_size,
                             self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd.rdd, num_epochs=epochs, feed_timeout=60000)
     cluster.shutdown()
     results = md.read_result()
     return self.sqlc.createDataFrame(results)
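For illustration of the steps_per_epoch formula above (values assumed, not from the source): floor division, as in the commented-out line, can drop or zero out the final partial step, while math.ceil keeps it.

import math

n_samples, batch_size, num_workers = 1000, 32, 3         # assumed example values
print(n_samples // batch_size // num_workers)             # 10 -- floor division drops the final partial step
print(math.ceil(n_samples / batch_size / num_workers))    # 11 -- ceil keeps it
# With a small dataset (e.g. n_samples=50), floor division would yield 0 and trip
# the `assert steps_per_epoch > 0` above, while ceil still yields 1.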
  def test_inputmode_spark(self):
    """Distributed TF cluster w/ InputMode.SPARK"""
    def _map_fun(args, ctx):
      import tensorflow as tf
      cluster, server = TFNode.start_cluster_server(ctx)
      if ctx.job_name == "ps":
        server.join()
      elif ctx.job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
          worker_device="/job:worker/task:%d" % ctx.task_index,
          cluster=cluster)):
          x = tf.placeholder(tf.int32, [None, 1])
          sq = tf.square(x)
          init_op = tf.global_variables_initializer()
        sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                init_op=init_op)
        with sv.managed_session(server.target) as sess:
          tf_feed = TFNode.DataFeed(ctx.mgr, False)
          while not sv.should_stop() and not tf_feed.should_stop():
            outputs = sess.run([sq], feed_dict={ x: tf_feed.next_batch(10) })
            tf_feed.batch_results(outputs[0])
        sv.stop()

    input = [ [x] for x in range(1000) ]    # set up input as tensors of shape [1] to match placeholder
    rdd = self.sc.parallelize(input, 10)
    cluster = TFCluster.run(self.sc, _map_fun, tf_args={}, num_executors=self.num_workers, num_ps=0, input_mode=TFCluster.InputMode.SPARK)
    rdd_out = cluster.inference(rdd)
    rdd_sum = rdd_out.sum()
    self.assertEqual(sum( [x * x for x in range(1000)] ), rdd_sum)
    cluster.shutdown()
    def test_port_unreleased(self):
        """Test that temporary socket/port is unreleased prior to invoking user map_fun."""
        def _map_fun(args, ctx):
            import socket
            assert ctx.tmp_socket is not None
            reserved_port = ctx.tmp_socket.getsockname()[1]

            # socket bind to tmp port should fail
            try:
                my_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                my_sock.bind(('0.0.0.0', reserved_port))
                assert False, "should never hit this assert statement"
            except socket.error as e:
                print(e)
                assert True, "should raise an exception"

            ctx.release_port()
            assert ctx.tmp_socket is None

        cluster = TFCluster.run(self.sc,
                                _map_fun,
                                tf_args={},
                                num_executors=self.num_workers,
                                num_ps=0,
                                input_mode=TFCluster.InputMode.TENSORFLOW,
                                master_node='chief',
                                release_port=False)
        cluster.shutdown()
Example #5
    def run(self, model_fn, args):
        from tensorflowonspark import TFCluster

        self.set_graph_modules(model_fn)

        config = cp.ConfigParser()
        config.readfp(open('{PROJECT_ROOT}/defaults.cfg'.format(**os.environ)))

        project = config.get('gcp', 'project')
        keyfile = "/etl/credentials/bi-service-155107.json"
        app_name = args.app_name

        submit_host = config.get('environment', 'submit_host')

        python_lib = config.get('environment', 'python_lib')
        python_files = utils.get_list(config.get('environment',
                                                 'python_files'))
        sc = utils.get_context(app_name, project, keyfile, submit_host,
                               python_lib, python_files)

        # tf.app.run()
        cluster = TFCluster.run(sc,
                                self.execute,
                                args,
                                args.cluster_size,
                                args.num_ps,
                                tensorboard=args.tensorboard,
                                input_mode=TFCluster.InputMode.TENSORFLOW,
                                log_dir=args.job_dir,
                                master_node='master',
                                reservation_timeout=1800)

        cluster.shutdown()
    def test_inputmode_spark_exception(self):
        """Distributed TF cluster w/ InputMode.SPARK and exception during feeding"""
        def _map_fun(args, ctx):
            import tensorflow as tf

            tf_feed = TFNode.DataFeed(ctx.mgr, False)
            while not tf_feed.should_stop():
                batch = tf_feed.next_batch(10)
                if len(batch) > 0:
                    squares = tf.math.square(batch)
                    tf_feed.batch_results(squares.numpy())
                    raise Exception("FAKE exception during feeding")

        input = [[x] for x in range(1000)]  # set up input as tensors of shape [1] to match placeholder
        rdd = self.sc.parallelize(input, 10)
        with self.assertRaises(Exception):
            cluster = TFCluster.run(self.sc,
                                    _map_fun,
                                    tf_args={},
                                    num_executors=self.num_workers,
                                    num_ps=0,
                                    input_mode=TFCluster.InputMode.SPARK)
            cluster.inference(rdd, feed_timeout=1).count()
            cluster.shutdown()
Example #7
  def test_inputmode_spark(self):
    """Distributed TF cluster w/ InputMode.SPARK"""
    def _map_fun(args, ctx):
      import tensorflow as tf
      cluster, server = TFNode.start_cluster_server(ctx)
      if ctx.job_name == "ps":
        server.join()
      elif ctx.job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
          worker_device="/job:worker/task:%d" % ctx.task_index,
          cluster=cluster)):
          x = tf.placeholder(tf.int32, [None, 1])
          sq = tf.square(x)
          init_op = tf.global_variables_initializer()
        with tf.train.MonitoredTrainingSession(is_chief=(ctx.task_index == 0)) as sess:
          tf_feed = TFNode.DataFeed(ctx.mgr, False)
          while not sess.should_stop() and not tf_feed.should_stop():
            outputs = sess.run([sq], feed_dict={x: tf_feed.next_batch(10)})
            tf_feed.batch_results(outputs[0])

    input = [[x] for x in range(1000)]    # set up input as tensors of shape [1] to match placeholder
    rdd = self.sc.parallelize(input, 10)
    cluster = TFCluster.run(self.sc, _map_fun, tf_args={}, num_executors=self.num_workers, num_ps=0, input_mode=TFCluster.InputMode.SPARK)
    rdd_out = cluster.inference(rdd)
    rdd_sum = rdd_out.sum()
    self.assertEqual(rdd_sum, sum([x * x for x in range(1000)]))
    cluster.shutdown()
    def test_inputmode_spark_late_exception(self):
        """Distributed TF cluster w/ InputMode.SPARK and exception after feeding"""
        def _map_fun(args, ctx):
            import tensorflow as tf

            tf_feed = TFNode.DataFeed(ctx.mgr, False)
            while not tf_feed.should_stop():
                batch = tf_feed.next_batch(10)
                if len(batch) > 0:
                    squares = tf.math.square(batch)
                    tf_feed.batch_results(squares.numpy())

            # simulate post-feed actions that raise an exception
            time.sleep(2)
            raise Exception("FAKE exception after feeding")

        input = [[x] for x in range(1000)]  # set up input as tensors of shape [1] to match placeholder
        rdd = self.sc.parallelize(input, 10)
        with self.assertRaises(Exception):
            cluster = TFCluster.run(self.sc,
                                    _map_fun,
                                    tf_args={},
                                    num_executors=self.num_workers,
                                    num_ps=0,
                                    input_mode=TFCluster.InputMode.SPARK)
            cluster.inference(rdd).count()
            cluster.shutdown(grace_secs=5)  # note: grace_secs must be larger than the time needed for post-feed actions
Example #9
File: cgan_mlp.py  Project: linxigal/tfos
 def train(self, data, output_path, steps, batch_size):
     checkpoint_path = os.path.join(output_path, 'checkpoint')
     if not tf.gfile.Exists(checkpoint_path):
         tf.gfile.MkDir(checkpoint_path)
     result_path = os.path.join(output_path, 'results')
     if not tf.gfile.Exists(result_path):
         tf.gfile.MkDir(result_path)
     worker = CGAN_MLP(data, result_path, checkpoint_path, steps, batch_size)
     cluster = TFCluster.run(self.sc, worker, None, self.cluster_size, self.num_ps, input_mode=self.input_mode)
     cluster.shutdown()
Example #10
File: tfos.py  Project: linxigal/tfos
 def evaluate(self, data_rdd, steps, model_dir):
     md = ModelDir(model_dir, 'evaluate*')
     steps_per_epoch = data_rdd.count() if steps <= 0 else steps
     steps_per_epoch = math.ceil(steps_per_epoch / self.num_workers)
     worker = EvaluateWorker(steps_per_epoch=steps_per_epoch, **md.to_dict())
     md.delete_result_file()
     cluster = TFCluster.run(self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd.rdd, num_epochs=1)
     cluster.shutdown()
     results = md.read_result()
     return self.sqlc.createDataFrame(results)
Example #11
File: tfos.py  Project: linxigal/tfos
 def recurrent_predict(self, data_rdd, units, steps, feature_type, model_dir):
     md = ModelDir(model_dir, 'recurrent_predict*')
     worker = RecurrentPredictWorker(units=units,
                                     steps=steps,
                                     feature_type=feature_type,
                                     **md.to_dict())
     md.delete_result_file()
     cluster = TFCluster.run(self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd.rdd, num_epochs=1, feed_timeout=6000)
     cluster.shutdown()
     results = md.read_result(True)
     return self.sqlc.createDataFrame([{"result": result} for result in results])
    def test_port_released(self):
        """Test that temporary socket/port is released prior to invoking user map_fun."""
        def _map_fun(args, ctx):
            assert ctx.tmp_socket is None

        cluster = TFCluster.run(self.sc,
                                _map_fun,
                                tf_args={},
                                num_executors=self.num_workers,
                                num_ps=0,
                                input_mode=TFCluster.InputMode.TENSORFLOW,
                                master_node='chief')
        cluster.shutdown()
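Taken together, the two port tests above and below pin down the release_port contract; a condensed restatement using only the calls they already exercise: by default the reserved temporary port is released before map_fun runs, while release_port=False keeps the socket bound until map_fun calls ctx.release_port() itself.

def _released(args, ctx):      # default behaviour, as in test_port_released
    assert ctx.tmp_socket is None

def _unreleased(args, ctx):    # TFCluster.run(..., release_port=False), as in test_port_unreleased
    assert ctx.tmp_socket is not None
    reserved_port = ctx.tmp_socket.getsockname()[1]   # still bound; rebinding it would fail
    ctx.release_port()                                # hand the port back explicitly
    assert ctx.tmp_socket is None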
  def test_basic_tf(self):
    """Single-node TF graph (w/ args) running independently on multiple executors."""
    def _map_fun(args, ctx):
      import tensorflow as tf
      x = tf.constant(args['x'])
      y = tf.constant(args['y'])
      sum = tf.add(x,y)
      with tf.Session() as sess:
        result = sess.run([sum])
        assert result[0] == 3

    args = { 'x':1, 'y':2 }
    cluster = TFCluster.run(self.sc, _map_fun, tf_args=args, num_executors=self.num_workers, num_ps=0)
    cluster.shutdown()
Example #14
  def test_basic_tf(self):
    """Single-node TF graph (w/ args) running independently on multiple executors."""
    def _map_fun(args, ctx):
      import tensorflow as tf
      x = tf.constant(args['x'])
      y = tf.constant(args['y'])
      sum = tf.add(x, y)
      with tf.Session() as sess:
        result = sess.run([sum])
        assert result[0] == 3

    args = {'x': 1, 'y': 2}
    cluster = TFCluster.run(self.sc, _map_fun, tf_args=args, num_executors=self.num_workers, num_ps=0)
    cluster.shutdown()
Example #15
File: tfos.py  Project: linxigal/tfos
 def predict(self, data_rdd, steps, model_dir, output_prob=False):
     md = ModelDir(model_dir, 'predict*')
     steps_per_epoch = data_rdd.count() if steps <= 0 else steps
     steps_per_epoch = math.ceil(steps_per_epoch / self.num_workers)
     worker = PredictWorker(steps_per_epoch=steps_per_epoch,
                            output_prob=output_prob,
                            **md.to_dict())
     md.delete_result_file()
     cluster = TFCluster.run(self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd.rdd, num_epochs=1, feed_timeout=6000)
     cluster.shutdown()
     results = md.read_result()
     return self.sqlc.createDataFrame(results)
Example #16
File: tfos.py  Project: linxigal/tfos
 def yolov3_tiny_train(self,
                       model_rdd,
                       batch_size,
                       epochs,
                       classes_path,
                       anchors_path,
                       train_path,
                       val_path,
                       image_size,
                       model_dir,
                       weights_path=None,
                       freeze_body=2,
                       go_on=False):
     columns = model_rdd.columns
     assert "model_config" in columns, "not exists model layer config!"
     assert tf.io.gfile.exists(train_path), "train dataset path not exists!"
     data_rdd = self.sc.textFile(train_path)
     n_samples = data_rdd.count()
     steps_per_epoch = math.ceil(n_samples / batch_size / self.num_workers)
     md = ModelDir(model_dir, 'train*')
     if go_on:
         md.create_model_dir()
     else:
         md = md.rebuild_model_dir()
     worker = YOLOV3TinyModelTrainWorker(model_rdd,
                                         go_on=go_on,
                                         batch_size=batch_size,
                                         epochs=epochs,
                                         classes_path=classes_path,
                                         anchors_path=anchors_path,
                                         weights_path=weights_path,
                                         val_path=val_path,
                                         image_size=image_size,
                                         steps_per_epoch=steps_per_epoch,
                                         freeze_body=freeze_body,
                                         **md.to_dict())
     cluster = TFCluster.run(self.sc,
                             worker,
                             self.tf_args,
                             self.cluster_size,
                             self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd, num_epochs=epochs, feed_timeout=60000)
     cluster.shutdown()
     results = md.read_result()
     return self.sqlc.createDataFrame(results)
    def test_inputmode_spark_late_exception(self):
        """Distributed TF cluster w/ InputMode.SPARK and exception after feeding"""
        def _map_fun(args, ctx):
            import tensorflow as tf
            cluster, server = TFNode.start_cluster_server(ctx)
            if ctx.job_name == "ps":
                server.join()
            elif ctx.job_name == "worker":
                with tf.device(
                        tf.train.replica_device_setter(
                            worker_device="/job:worker/task:%d" %
                            ctx.task_index,
                            cluster=cluster)):
                    x = tf.placeholder(tf.int32, [None, 1])
                    sq = tf.square(x)
                    init_op = tf.global_variables_initializer()
                with tf.train.MonitoredTrainingSession(
                        is_chief=(ctx.task_index == 0)) as sess:
                    tf_feed = TFNode.DataFeed(ctx.mgr, False)
                    while not sess.should_stop() and not tf_feed.should_stop():
                        batch = tf_feed.next_batch(10)
                        if len(batch) > 0:
                            outputs = sess.run([sq], feed_dict={x: batch})
                            tf_feed.batch_results(outputs[0])

                # simulate post-feed actions that raise an exception
                time.sleep(2)
                raise Exception("FAKE exception after feeding")

        input = [[x] for x in range(1000)]  # set up input as tensors of shape [1] to match placeholder
        rdd = self.sc.parallelize(input, 10)
        with self.assertRaises(Exception):
            cluster = TFCluster.run(self.sc,
                                    _map_fun,
                                    tf_args={},
                                    num_executors=self.num_workers,
                                    num_ps=0,
                                    input_mode=TFCluster.InputMode.SPARK)
            cluster.inference(rdd).count()
            cluster.shutdown(grace_secs=5)  # note: grace_secs must be larger than the time needed for post-feed actions
Example #18
File: tfos.py  Project: linxigal/tfos
 def yolov3_train(self,
                  model_rdd,
                  data_dir,
                  batch_size,
                  epochs,
                  image_size,
                  model_dir,
                  weights_path=None,
                  freeze_body=2,
                  go_on=False):
     train_path = os.path.join(data_dir, 'train.txt')
     assert tf.io.gfile.exists(train_path), "train dataset path does not exist!"
     data_rdd = self.sc.textFile(train_path)
     n_samples = data_rdd.count()
     steps_per_epoch = math.ceil(n_samples / batch_size / self.num_workers)
     md = ModelDir(model_dir, 'train*')
     if go_on:
         md.create_model_dir()
     else:
         md = md.rebuild_model_dir()
     worker = YOLOV3ModelTrainWorker(model_rdd,
                                     data_dir,
                                     go_on=go_on,
                                     batch_size=batch_size,
                                     epochs=epochs,
                                     image_size=image_size,
                                     steps_per_epoch=steps_per_epoch,
                                     freeze_body=freeze_body,
                                     **md.to_dict())
     cluster = TFCluster.run(self.sc,
                             worker,
                             self.tf_args,
                             self.cluster_size,
                             self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd, num_epochs=epochs, feed_timeout=60000)
     cluster.shutdown()
     results = md.read_result()
     if results:
         return self.sqlc.createDataFrame(results)
Example #19
File: mtcnn.py  Project: linxigal/tfos
 def run(self, input_dir, output_dir, *args, **kwargs):
     out_text_dir = os.path.join(output_dir, 'text')
     out_image_dir = os.path.join(output_dir, 'images')
     out_result_dir = os.path.join(output_dir, 'result')
     if tf.io.gfile.exists(out_text_dir):
         tf.io.gfile.rmtree(out_text_dir)
     if tf.io.gfile.exists(out_image_dir):
         tf.io.gfile.rmtree(out_image_dir)
     tf.io.gfile.makedirs(out_text_dir)
     tf.io.gfile.makedirs(out_image_dir)
     tf.io.gfile.makedirs(out_result_dir)
     dataset = facenet.get_dataset(input_dir)
     data_rdd = self.sc.parallelize([(cls.name, cls.image_paths)
                                     for cls in dataset])
     worker = MTCNNWorker(out_text_dir, out_image_dir, out_result_dir,
                          *args, **kwargs)
     cluster = TFCluster.run(self.sc,
                             worker,
                             self.tf_args,
                             self.cluster_size,
                             self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd, feed_timeout=60000)
     cluster.shutdown()
Example #20
                for epoch in range(FLAGS.num_epoch):
                    train_batches = train_reader.yieldBatches()
                    print("Epoch: %d" % epoch)
                    step = 0
                    for dense_x, sparse_idx, sparse_values, y in train_batches:
                        start_time = datetime.now()
                        _, train_loss, train_auc, summ, _ = sess.run(
                            [train_op, loss, auc_op, summary_op, step_update_op],
                            feed_dict={dense_inputs: dense_x,
                                       sparse_inputs: (sparse_idx, sparse_values, shape),
                                       labels: y})
                        step += 1
                        assert not np.isnan(train_loss), 'Model diverged with loss = NaN'
                        time_used = datetime.now() - start_time
                        if step % FLAGS.display_step == 0:
                            g_step, = sess.run([global_step])
                            print("step: " + str(step) + ", global_step: " + str(g_step))
                            summary_writer.add_summary(summ,g_step)
                            print("Step = {}, Examples = {}, Time = {}, Minibatch Loss = {}, Auc = {}".format(
                                 g_step, g_step*FLAGS.batch_size, time_used, train_loss, train_auc))
                            sys.stdout.flush()
            total_time = datetime.now() - begin_time
            print("Training Done!!")
            print("Total time used: {}".format(total_time))


if __name__ == "__main__":
    sc = SparkContext(conf=SparkConf().setAppName("tfos_online_train_distributed"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 64
    tensorboard = False
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
import mnist_dist

sc = SparkContext(conf=SparkConf().setAppName("mnist_tf"))
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1
num_ps = 1

parser = argparse.ArgumentParser()
parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=0)
parser.add_argument("-f", "--format", help="example format: (csv|pickle|tfr)", choices=["csv","pickle","tfr"], default="tfr")
parser.add_argument("-i", "--images", help="HDFS path to MNIST images in parallelized format")
parser.add_argument("-l", "--labels", help="HDFS path to MNIST labels in parallelized format")
parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/test", default="mnist_model")
parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions")
parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1)
parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000)
parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true")
parser.add_argument("-X", "--mode", help="train|inference", default="train")
parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
args = parser.parse_args()
print("args:",args)


print("{0} ===== Start".format(datetime.now().isoformat()))
cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW)
cluster.shutdown()

print("{0} ===== Stop".format(datetime.now().isoformat()))

parser.add_argument("--rdma", help="use rdma connection", default=False)
parser.add_argument("--readers",
                    help="number of reader/enqueue threads per worker",
                    type=int,
                    default=10)
parser.add_argument("--shuffle_size",
                    help="size of shuffle buffer",
                    type=int,
                    default=1000)
parser.add_argument("--steps",
                    help="maximum number of steps",
                    type=int,
                    default=1000)
parser.add_argument("--tensorboard",
                    help="launch tensorboard process",
                    action="store_true")
args = parser.parse_args()
print("args:", args)

print("{0} ===== Start".format(datetime.now().isoformat()))
cluster = TFCluster.run(sc,
                        mnist_dist.map_fun,
                        args,
                        args.cluster_size,
                        args.num_ps,
                        args.tensorboard,
                        TFCluster.InputMode.TENSORFLOW,
                        driver_ps_nodes=args.driver_ps_nodes)
cluster.shutdown()
print("{0} ===== Stop".format(datetime.now().isoformat()))
      if not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_distributed_train.train(server.target, dataset, cluster_spec, ctx)

if __name__ == '__main__':
  # parse arguments needed by the Spark driver
  import argparse
  parser = argparse.ArgumentParser()
  parser.add_argument("--epochs", help="number of epochs", type=int, default=0)
  parser.add_argument("--input_data", help="HDFS path to input dataset")
  parser.add_argument("--input_mode", help="method to ingest data: (spark|tf)", choices=["spark","tf"], default="tf")
  parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")

  (args,rem) = parser.parse_known_args()

  input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW

  print("{0} ===== Start".format(datetime.now().isoformat()))
  sc = SparkContext(conf=SparkConf().setAppName('imagenet_distributed_train'))
  num_executors = int(sc._conf.get("spark.executor.instances"))
  num_ps = 1

  cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, args.tensorboard, input_mode)
  if input_mode == TFCluster.InputMode.SPARK:
    dataRDD = sc.newAPIHadoopFile(args.input_data, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                                keyClass="org.apache.hadoop.io.BytesWritable",
                                valueClass="org.apache.hadoop.io.NullWritable")
    cluster.train(dataRDD, args.epochs)
  cluster.shutdown()
  print("{0} ===== Stop".format(datetime.now().isoformat()))
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)


if __name__ == '__main__':
  sc = SparkContext(conf=SparkConf().setAppName("cifar10_train"))
  num_executors = int(sc._conf.get("spark.executor.instances"))
  num_ps = 0

  cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW)
  cluster.shutdown()
Example #25
    parser.add_argument("--epochs",
                        help="number of epochs",
                        type=int,
                        default=3)
    parser.add_argument("--model_dir",
                        help="path to save model/checkpoint",
                        default="mnist_model")
    parser.add_argument("--export_dir",
                        help="path to export saved_model",
                        default="mnist_export")
    parser.add_argument("--steps_per_epoch",
                        help="number of steps per epoch",
                        type=int,
                        default=469)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    args = parser.parse_args()
    print("args:", args)

    cluster = TFCluster.run(sc,
                            main_fun,
                            args,
                            args.cluster_size,
                            num_ps=0,
                            tensorboard=args.tensorboard,
                            input_mode=TFCluster.InputMode.TENSORFLOW,
                            master_node='chief')
    cluster.shutdown()
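The --steps_per_epoch default of 469 presumably corresponds to MNIST's 60,000 training examples at an assumed batch size of 128:

import math
assert math.ceil(60000 / 128) == 469   # assumed derivation; 60000 / 128 = 468.75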
Example #26
    # arguments for Spark and TFoS
    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
                        default=executors)
    parser.add_argument("--num_ps",
                        help="number of ps nodes",
                        type=int,
                        default=1)
    (args, remainder) = parser.parse_known_args()

    # construct an ARGV (with script name as first element) from remaining args and pass it to the TF processes on executors
    remainder.insert(0, __file__)
    print("spark args:", args)
    print("tf args:", remainder)

    num_workers = args.cluster_size - args.num_ps
    print("===== num_executors={}, num_workers={}, num_ps={}".format(
        args.cluster_size, num_workers, args.num_ps))

    cluster = TFCluster.run(sc,
                            main_fun,
                            remainder,
                            args.cluster_size,
                            args.num_ps,
                            False,
                            TFCluster.InputMode.TENSORFLOW,
                            master_node='master')
    cluster.shutdown()
Example #27
                        type=int,
                        default=3)
    parser.add_argument("--learning_rate",
                        help="learning rate",
                        type=float,
                        default=1e-4)
    parser.add_argument("--model_dir",
                        help="path to save checkpoint",
                        default="mnist_model")
    parser.add_argument("--export_dir",
                        help="path to export saved_model",
                        default="mnist_export")
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    args = parser.parse_args()
    print("args:", args)

    cluster = TFCluster.run(sc,
                            main_fun,
                            args,
                            args.cluster_size,
                            num_ps=0,
                            tensorboard=args.tensorboard,
                            input_mode=TFCluster.InputMode.TENSORFLOW,
                            log_dir=args.model_dir,
                            master_node='chief',
                            eval_node=True)
    cluster.shutdown(grace_secs=60)
Example #28
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Accuracy per epoch train vs test')
    plt.legend()
    plt.grid(True)
    plt.show()

    plt.plot(train_cost[zoom_point:], label='train')
    plt.plot(test_cost[zoom_point:], label='test')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Loss per epoch train vs test')
    plt.legend()
    plt.grid(True)
    plt.show()


if __name__ == '__main__':
    # tf.app.run()
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
    args, rem = parser.parse_known_args()

    sc = SparkContext(conf=SparkConf().setAppName("your_app_name"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1
    tensorboard = True

    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
Example #29
    plt.grid(True)
    plt.show()


def main_fun(argv, ctx):
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    print(f"Starting worker {worker_num} on task {job_name}")
    hype_random(worker_num)


if __name__ == '__main__':
    # tf.app.run()
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")
    args, rem = parser.parse_known_args()

    sc = SparkContext(conf=SparkConf().setAppName("lab4_task6"))
    # num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1
    num_workers = 4
    tensorboard = True

    cluster = TFCluster.run(sc, main_fun, [], num_workers, num_ps, tensorboard,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
Example #30
File: wdl.py  Project: crafet/pluto
                        help="number of ps nodes",
                        type=int,
                        default=1)
    parser.add_argument("--task_num",
                        help="number of worker nodes",
                        type=int,
                        default=1)
    parser.add_argument("--max_steps",
                        help="max number of steps to train",
                        type=int,
                        default=20000)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    args = parser.parse_args()
    print("args:", args)

    assert (args.num_ps + args.task_num == num_executors)

    cluster = TFCluster.run(sc,
                            main_func,
                            args,
                            args.cluster_size,
                            args.num_ps,
                            args.tensorboard,
                            TFCluster.InputMode.TENSORFLOW,
                            log_dir=args.model_dir,
                            master_node='master')
    cluster.shutdown()
Example #31
                        type=int,
                        default=num_executors)
    parser.add_argument(
        "--model",
        help="HDFS path to save/load model during train/inference",
        default="mnist_model")
    parser.add_argument("--output",
                        help="HDFS path to save test/inference output",
                        default="predictions")
    parser.add_argument("--num_ps",
                        help="number of PS nodes in cluster",
                        type=int,
                        default=1)
    parser.add_argument("--steps",
                        help="maximum number of steps",
                        type=int,
                        default=1000)
    args = parser.parse_args()
    print("args:", args)

    cluster = TFCluster.run(sc,
                            main,
                            args,
                            args.cluster_size,
                            args.num_ps,
                            tensorboard=False,
                            input_mode=TFCluster.InputMode.TENSORFLOW,
                            log_dir=args.model,
                            master_node='master')
    cluster.shutdown()
          train_tensor,
          logdir=FLAGS.train_dir,
          master=server.target,
          is_chief=(FLAGS.task == 0),
          init_fn=_get_init_fn(),
          summary_op=summary_op,
          number_of_steps=FLAGS.max_number_of_steps,
          log_every_n_steps=FLAGS.log_every_n_steps,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs,
          summary_writer=summary_writer,
          sync_optimizer=optimizer if FLAGS.sync_replicas else None)


if __name__ == '__main__':
  import argparse

  sc = SparkContext(conf=SparkConf().setAppName("train_image_classifier"))
  executors = sc._conf.get("spark.executor.instances")
  num_executors = int(executors) if executors is not None else 1

  parser = argparse.ArgumentParser()
  parser.add_argument("--num_ps_tasks", help="number of PS nodes", type=int, default=0)
  parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
  parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
  (args,rem) = parser.parse_known_args()

  assert(num_executors > args.num_ps_tasks)
  cluster = TFCluster.run(sc, main_fun, sys.argv, args.cluster_size, args.num_ps_tasks, args.tensorboard, TFCluster.InputMode.TENSORFLOW)
  cluster.shutdown()
Example #33
                        default="mnist_model")
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    args = parser.parse_args()
    print("args:", args)

    # create RDD of input data
    def parse(ln):
        vec = [int(x) for x in ln.split(',')]
        return (vec[1:], vec[0])

    stream = ssc.textFileStream(args.images_labels)
    images_labels = stream.map(parse)

    cluster = TFCluster.run(sc,
                            main_fun,
                            args,
                            args.cluster_size,
                            num_ps=1,
                            tensorboard=args.tensorboard,
                            input_mode=TFCluster.InputMode.SPARK,
                            log_dir=args.model_dir,
                            master_node='chief')
    cluster.train(images_labels, feed_timeout=86400)  # extend feed timeout to 24hrs for streaming data to arrive
    ssc.start()
    cluster.shutdown(ssc)
Example #34
    def toNumpy(bytestr):
        example = tf.train.Example()
        example.ParseFromString(bytestr)
        features = example.features.feature
        image = numpy.array(features['image'].int64_list.value)
        label = numpy.array(features['label'].int64_list.value)
        return (image, label)

    dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
    if args.format == "csv":
        images = sc.textFile(
            args.images).map(lambda ln: [int(x) for x in ln.split(',')])
        labels = sc.textFile(
            args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
    else:  # args.format == "pickle":
        images = sc.pickleFile(args.images)
        labels = sc.pickleFile(args.labels)
    print("zipping images and labels")
    dataRDD = images.zip(labels)

cluster = TFCluster.run(sc, cifar100_dist2.map_fun, args, args.cluster_size,
                        num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train":
    cluster.train(dataRDD, args.epochs)
else:
    labelRDD = cluster.inference(dataRDD)
    labelRDD.saveAsTextFile(args.output)
cluster.shutdown()

print("{0} ===== Stop".format(datetime.now().isoformat()))
Example #35
def main(args=None):

    spark = SparkSession \
      .builder \
      .appName("mitosis_spark") \
      .getOrCreate()
    sc = spark.sparkContext

    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    num_ps = 1
    logging.info("============= Num of executors: {0}".format(num_executors))

    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument("--appName",
                        default="mitosis_spark",
                        help="application name")
    parser.add_argument("--hdfs_host",
                        help="HDFS host",
                        type=str,
                        default="default")
    parser.add_argument("--hdfs_port",
                        help="HDFS port",
                        type=int,
                        default=8020)
    parser.add_argument("--mitosis_img_dir",
                        help="path to the mitosis image files")
    parser.add_argument(
        "--mitosis_img_csv",
        help="csv file that contain all the mitosis image files")
    parser.add_argument("--normal_img_dir",
                        required=True,
                        help="path to the normal image files")
    parser.add_argument(
        "--normal_img_csv",
        help="csv file that contain all the normal image files")

    parser.add_argument("--batch_size",
                        help="number of records per batch",
                        type=int,
                        default=32)
    parser.add_argument("--epochs",
                        help="number of epochs",
                        type=int,
                        default=1)
    parser.add_argument("--export_dir",
                        help="HDFS path to export saved_model",
                        default="mnist_export")
    parser.add_argument("--format",
                        help="example format: (csv|pickle|tfr)",
                        choices=["csv", "pickle", "tfr"],
                        default="csv")
    parser.add_argument(
        "--model",
        help="HDFS path to save/load model during train/inference",
        default="mnist_model")
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
                        default=num_executors)
    parser.add_argument("--output",
                        help="HDFS path to save test/inference output",
                        default="predictions")
    parser.add_argument("--readers",
                        help="number of reader/enqueue threads",
                        type=int,
                        default=1)
    parser.add_argument("--steps",
                        help="maximum number of steps",
                        type=int,
                        default=99)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")
    parser.add_argument("--mode", help="train|inference", default="train")
    parser.add_argument("--rdma", help="use rdma connection", default=False)
    args = parser.parse_args(args)

    if args.mitosis_img_dir is None and args.mitosis_img_csv is None:
        parser.error(
            "at least one of --mitosis_img_dir and --mitosis_img_csv required")

    if args.normal_img_dir is None and args.normal_img_csv is None:
        parser.error(
            "at least one of --normal_img_dir and --normal_img_csv required")

    if args.mitosis_img_csv is None:
        fs = get_hdfs(args.hdfs_host, args.hdfs_port)
        mitosis_img_pathes = fs.ls(args.mitosis_img_dir)
        mitosis_label_img_pathes = [(1, path) for path in mitosis_img_pathes]
        #mitosis_train_rdd = sc.parallelize(mitosis_img_pathes).map(lambda path : (1, path))
    else:
        mitosis_train_rdd = sc.textFile(args.mitosis_img_csv).map(lambda path: (1, path))

    if args.normal_img_csv is None:
        fs = get_hdfs(args.hdfs_host, args.hdfs_port)
        normal_img_pathes = fs.ls(args.normal_img_dir)
        normal_label_img_pathes = [(0, path) for path in normal_img_pathes]
        #normal_train_rdd = sc.parallelize(normal_img_pathes).map(lambda path : (0, path))
    else:
        normal_train_rdd = sc.textFile(args.normal_img_csv).map(lambda path: (0, path))

    # get the train data set with mitosis and normal images. In the output RDD,
    # each entry will be (label, img_arr)
    training_data = []
    training_data.extend(mitosis_label_img_pathes)
    training_data.extend(normal_label_img_pathes)
    print("+++++++++++ Training data size: {}".format(len(training_data)))
    data_RDD = sc.parallelize(training_data) \
      .repartition(int(len(training_data)/128/2000)) \
      .mapPartitions(lambda iter : read_images(get_hdfs(args.hdfs_host, args.hdfs_port), iter))

    cluster = TFCluster.run(sc,
                            mitosis_dist.map_fun,
                            args,
                            args.cluster_size,
                            num_ps,
                            args.tensorboard,
                            TFCluster.InputMode.SPARK,
                            log_dir=args.model)

    if args.mode == "train":
        cluster.train(data_RDD, args.epochs)
    else:
        labelRDD = cluster.inference(data_RDD)
        labelRDD.saveAsTextFile(args.output)

    cluster.shutdown(grace_secs=30)

    print("{0} ===== Stop".format(datetime.now().isoformat()))
Example #36
    label = numpy.array(features['label'].int64_list.value)
    return (image, label)
  dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
  if args.format == "csv": # HDFS==>numpy array
    images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
    labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
  else: # args.format == "pickle":  # HDFS==>numpy array
    images = sc.pickleFile(args.images)
    labels = sc.pickleFile(args.labels)

  print("zipping images and labels")
  # print(type(labels))
  # print(labels.count())
  dataRDD = images.zip(labels) # image+label

#cluster = TFCluster.reserve(sc, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
#cluster.start(mnist_dist.map_fun, args)
cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train" or args.mode == "retrain":
  cluster.train(dataRDD, args.epochs)
else:
  labelRDD = cluster.inference(dataRDD)
  labelRDD.saveAsTextFile(args.output)
cluster.shutdown()  # shut down the cluster

print("{0} ===== Stop".format(datetime.now().isoformat()))



Example #37
    parser.add_argument("--epochs", help="number of epochs",
                        type=int, default=1)
    parser.add_argument(
        "--steps", help="maximum number of steps", type=int, default=1000)

    args = parser.parse_args()

    data_loader = TextLoader(sc, args.data_dir, args.batch_size, args.seq_length)

    args.vocab_size = data_loader.vocab_size

    defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
    working_dir = os.getcwd()

    config_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'config.p'), defaultFS, working_dir)
    sc.parallelize([args]).saveAsPickleFile(config_file)

    chars_vocab_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'chars_vocab.p'), defaultFS, working_dir)
    sc.parallelize([data_loader.chars, data_loader.vocab]).saveAsPickleFile(chars_vocab_file)

    dataRDD = sc.parallelize(data_loader.get_data_for_feeder())

    cluster = TFCluster.run(sc, main_fun, args, num_executors,
                            args.num_ps_tasks, input_mode=TFCluster.InputMode.SPARK)

    cluster.train(dataRDD, args.epochs)

    cluster.shutdown()

    print("{0} ===== Stop".format(datetime.now().isoformat()))