def main_fun(argv, ctx):
    """Evaluate Inception on ImageNet; runs on each Spark executor."""
    import sys
    import tensorflow as tf
    from inception import inception_eval
    from inception.imagenet_data import ImagenetData

    print("argv:", argv)
    sys.argv = argv

    FLAGS = tf.app.flags.FLAGS
    FLAGS._parse_flags()
    print("FLAGS:", FLAGS.__dict__['__flags'])

    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)

    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

    inception_eval.evaluate(dataset)


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("grid_imagenet_eval"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 0  # evaluation needs no parameter servers

    # Older two-step API, superseded by TFCluster.run:
    #cluster = TFCluster.reserve(sc, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                            False, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
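# --- Note (illustration, not part of the job): with InputMode.TENSORFLOW,
# Spark only launches and monitors the TF processes; each worker reads its
# input directly from shared storage (here via dataset.data_files()), which
# is why there is no cluster.train(rdd) / cluster.inference(rdd) feeding
# step. Compare the streaming example below, which uses InputMode.SPARK and
# pushes RDD partitions to the workers instead.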
parser.add_argument("-c", "--rdma", help="use rdma connection", default=False) args = parser.parse_args() print("args:", args) print("{0} ===== Start".format(datetime.now().isoformat())) def parse(ln): lbl, img = ln.split('|') image = [int(x) for x in img.split(',')] label = numpy.zeros(10) label[int(lbl)] = 1.0 return (image, label) stream = ssc.textFileStream(args.images) imageRDD = stream.map(lambda ln: parse(ln)) cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK) if args.mode == "train": cluster.train(imageRDD) else: labelRDD = cluster.inference(imageRDD) labelRDD.saveAsTextFiles(args.output) ssc.start() cluster.shutdown(ssc) print("{0} ===== Stop".format(datetime.now().isoformat()))
help="method to ingest data: (spark|tf)", choices=["spark", "tf"], default="tf") parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") (args, rem) = parser.parse_known_args() input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW print("{0} ===== Start".format(datetime.now().isoformat())) sc = SparkContext( conf=SparkConf().setAppName('imagenet_distributed_train')) num_executors = int(sc._conf.get("spark.executor.instances")) num_ps = 1 #cluster = TFCluster.reserve(sc, num_executors, num_ps, args.tensorboard, input_mode) #cluster.start(main_fun, sys.argv) cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, args.tensorboard, input_mode) if input_mode == TFCluster.InputMode.SPARK: dataRDD = sc.newAPIHadoopFile( args.input_data, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", keyClass="org.apache.hadoop.io.BytesWritable", valueClass="org.apache.hadoop.io.NullWritable") cluster.train(dataRDD, args.epochs) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))
import sys
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from com.yahoo.ml.tf import TFCluster, TFNode
from datetime import datetime


def main_fun(argv, ctx):
    """Main function entrance for Spark.

    Make sure that all imports are done here: if they sit at module level,
    Spark will try to serialize the libraries into the closure it ships to
    each executor, and we don't want that! ~WFU
    """
    import tensorflow as tf


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    args, rem = parser.parse_known_args()

    sc = SparkContext(conf=SparkConf().setAppName("Nacho"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_processes = 1  # passed as num_ps (number of parameter server tasks)
    use_tensorboard = False

    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_processes,
                            use_tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
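# --- Minimal sketch of the executor-side import pattern the docstring above
# describes: a mapped function imports its library at call time, on the
# executor, so the driver never has to pickle the module itself.
def _uses_numpy(x):
    import numpy  # resolved on the executor when the task runs
    return float(numpy.sqrt(x))

# e.g. sc.parallelize([1, 4, 9]).map(_uses_numpy).collect() -> [1.0, 2.0, 3.0]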
if __name__ == '__main__':
    import argparse

    sc = SparkContext(conf=SparkConf().setAppName("train_image_classifier"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--num_ps_tasks", help="number of PS nodes", type=int, default=0)
    parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
    parser.add_argument("--cluster_size", help="number of nodes in the cluster",
                        type=int, default=num_executors)
    (args, rem) = parser.parse_known_args()

    # Need at least one worker beyond the parameter server tasks.
    assert (num_executors > args.num_ps_tasks)

    # Older two-step API, superseded by TFCluster.run:
    #cluster = TFCluster.reserve(sc, args.cluster_size, args.num_ps_tasks, args.tensorboard, TFCluster.InputMode.TENSORFLOW)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, args.cluster_size, args.num_ps_tasks,
                            args.tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
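# --- Usage sketch (file name, master, and executor count are assumptions;
# packaging flags such as --py-files are omitted):
#   spark-submit --master yarn --deploy-mode client \
#       --num-executors 4 \
#       train_image_classifier.py --num_ps_tasks 1 --tensorboard
# On YARN, --num-executors sets spark.executor.instances, so cluster_size
# defaults to the number of executors unless overridden.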