def test_inputmode_spark(self):
    """Distributed TF cluster w/ InputMode.SPARK"""
    def _map_fun(args, ctx):
        import tensorflow as tf
        cluster, server = TFNode.start_cluster_server(ctx)
        if ctx.job_name == "ps":
            server.join()
        elif ctx.job_name == "worker":
            with tf.device(tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % ctx.task_index,
                    cluster=cluster)):
                x = tf.placeholder(tf.int32, [None, 1])
                sq = tf.square(x)
                init_op = tf.global_variables_initializer()
            sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0), init_op=init_op)
            with sv.managed_session(server.target) as sess:
                tf_feed = TFNode.DataFeed(ctx.mgr, False)
                while not sv.should_stop() and not tf_feed.should_stop():
                    outputs = sess.run([sq], feed_dict={x: tf_feed.next_batch(10)})
                    tf_feed.batch_results(outputs[0])
            sv.stop()

    input = [[x] for x in range(1000)]  # set up input as tensors of shape [1] to match placeholder
    rdd = self.sc.parallelize(input, 10)
    cluster = TFCluster.run(self.sc, _map_fun, tf_args={}, num_executors=self.num_workers,
                            num_ps=0, input_mode=TFCluster.InputMode.SPARK)
    rdd_out = cluster.inference(rdd)
    rdd_sum = rdd_out.sum()
    self.assertEqual(sum([x * x for x in range(1000)]), rdd_sum)
    cluster.shutdown()
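# Note on the test above: TFNode.DataFeed(ctx.mgr, False) sets train_mode=False,
# i.e. the feed runs in inference mode, so batch_results() pushes each worker's
# outputs onto the queue that backs the RDD returned by cluster.inference().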
def test_basic_tf(self):
    """Single-node TF graph (w/ args) running independently on multiple executors."""
    def _map_fun(args, ctx):
        import tensorflow as tf
        x = tf.constant(args['x'])
        y = tf.constant(args['y'])
        sum = tf.add(x, y)
        with tf.Session() as sess:
            result = sess.run([sum])
            assert result[0] == 3

    args = {'x': 1, 'y': 2}
    cluster = TFCluster.run(self.sc, _map_fun, tf_args=args, num_executors=self.num_workers, num_ps=0)
    cluster.shutdown()
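# For reference, a hedged call template for TFCluster.run as used throughout
# these snippets (parameter names follow the TensorFlowOnSpark API; values are
# placeholders):
#
# cluster = TFCluster.run(
#     sc,                 # SparkContext
#     map_fun,            # function(args, ctx) executed on every executor
#     tf_args,            # dict or argparse Namespace passed through to map_fun
#     num_executors,      # total number of TF nodes (workers + PS)
#     num_ps,             # how many of those nodes act as parameter servers
#     tensorboard=False,  # optionally launch a TensorBoard process
#     input_mode=TFCluster.InputMode.TENSORFLOW)  # or InputMode.SPARK to feed RDDs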
classifier.export_saved_model(args.export_dir, serving_input_receiver_fn)


if __name__ == "__main__":
    # tf.app.run()
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    from tensorflowonspark import TFCluster
    import argparse

    sc = SparkContext(conf=SparkConf().setAppName("mnist_estimator"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64)
    parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000)
    parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
    parser.add_argument("--epochs", help="number of epochs", type=int, default=3)
    parser.add_argument("--learning_rate", help="learning rate", type=float, default=1e-4)
    parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model")
    parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export")
    parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
    args = parser.parse_args()
    print("args:", args)

    cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0,
                            tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW,
                            log_dir=args.model_dir, master_node='chief', eval_node=True)
    cluster.shutdown(grace_secs=120)
def main(args=None):
    spark = SparkSession \
        .builder \
        .appName("mitosis_spark") \
        .getOrCreate()
    sc = spark.sparkContext
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    num_ps = 1
    logging.info("============= Num of executors: {0}".format(num_executors))

    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument("--appName", default="mitosis_spark", help="application name")
    parser.add_argument("--hdfs_host", help="HDFS host", type=str, default="default")
    parser.add_argument("--hdfs_port", help="HDFS port", type=int, default=8020)
    parser.add_argument("--mitosis_img_dir", help="path to the mitosis image files")
    parser.add_argument("--mitosis_img_csv", help="csv file that contains all the mitosis image files")
    parser.add_argument("--normal_img_dir", required=True, help="path to the normal image files")
    parser.add_argument("--normal_img_csv", help="csv file that contains all the normal image files")
    parser.add_argument("--batch_size", help="number of records per batch", type=int, default=32)
    parser.add_argument("--epochs", help="number of epochs", type=int, default=1)
    parser.add_argument("--export_dir", help="HDFS path to export saved_model", default="mnist_export")
    parser.add_argument("--format", help="example format: (csv|pickle|tfr)", choices=["csv", "pickle", "tfr"], default="csv")
    parser.add_argument("--model", help="HDFS path to save/load model during train/inference", default="mnist_model")
    parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
    parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions")
    parser.add_argument("--readers", help="number of reader/enqueue threads", type=int, default=1)
    parser.add_argument("--steps", help="maximum number of steps", type=int, default=99)
    parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
    parser.add_argument("--mode", help="train|inference", default="train")
    parser.add_argument("--rdma", help="use rdma connection", default=False)
    args = parser.parse_args(args)

    if args.mitosis_img_dir is None and args.mitosis_img_csv is None:
        parser.error("at least one of --mitosis_img_dir and --mitosis_img_csv required")
    if args.normal_img_dir is None and args.normal_img_csv is None:
        parser.error("at least one of --normal_img_dir and --normal_img_csv required")

    if args.mitosis_img_csv is None:
        fs = get_hdfs(args.hdfs_host, args.hdfs_port)
        mitosis_img_pathes = fs.ls(args.mitosis_img_dir)
        mitosis_label_img_pathes = [(1, path) for path in mitosis_img_pathes]
        #mitosis_train_rdd = sc.parallelize(mitosis_img_pathes).map(lambda path: (1, path))
    else:
        # SparkContext exposes textFile() directly (sc.read does not exist);
        # collect so both branches yield a Python list of (label, path) pairs
        mitosis_label_img_pathes = sc.textFile(args.mitosis_img_csv).map(lambda path: (1, path)).collect()

    if args.normal_img_csv is None:
        fs = get_hdfs(args.hdfs_host, args.hdfs_port)
        normal_img_pathes = fs.ls(args.normal_img_dir)
        normal_label_img_pathes = [(0, path) for path in normal_img_pathes]
        #normal_train_rdd = sc.parallelize(normal_img_pathes).map(lambda path: (0, path))
    else:
        normal_label_img_pathes = sc.textFile(args.normal_img_csv).map(lambda path: (0, path)).collect()

    # get the train data set with mitosis and normal images. In the output RDD,
    # each entry will be (label, img_arr)
    training_data = []
    training_data.extend(mitosis_label_img_pathes)
    training_data.extend(normal_label_img_pathes)
    print("+++++++++++ Training data size: {}".format(len(training_data)))
    data_RDD = sc.parallelize(training_data) \
        .repartition(int(len(training_data) / 128 / 2000)) \
        .mapPartitions(lambda iter: read_images(get_hdfs(args.hdfs_host, args.hdfs_port), iter))

    cluster = TFCluster.run(sc, mitosis_dist.map_fun, args, args.cluster_size, num_ps,
                            args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model)
    if args.mode == "train":
        cluster.train(data_RDD, args.epochs)
    else:
        labelRDD = cluster.inference(data_RDD)
        labelRDD.saveAsTextFile(args.output)
    cluster.shutdown(grace_secs=30)
    print("{0} ===== Stop".format(datetime.now().isoformat()))
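# Hypothetical sketch of the read_images helper assumed above: it takes an HDFS
# client plus an iterator of (label, path) pairs and yields (label, img_arr)
# tuples; the actual image decoding depends on the file format.
def read_images(fs, label_path_iter):
    import numpy as np
    for label, path in label_path_iter:
        with fs.open(path, 'rb') as f:
            img_arr = np.frombuffer(f.read(), dtype=np.uint8)  # raw bytes; decode as needed
        yield (label, img_arr)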
parser.add_argument("--readers", help="number of reader/enqueue threads", type=int, default=1) parser.add_argument("--shuffle_size", help="size of shuffle buffer", type=int, default=1000) parser.add_argument("--steps", help="maximum number of steps", type=int, default=1000) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) print("{0} ===== Start".format(datetime.now().isoformat())) cluster = TFCluster.run(sc, mnist_dist_dataset.map_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW, driver_ps_nodes=args.driver_ps_nodes) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))
train_df = read_train_data(hiveContext, label_name, args, feature_alias)

# extract the number of label classes
label_type, type_count = extract_label_species(train_df, label_name)
args.label_count = type_count

# apply one-hot encoding
dataRDD = train_df.rdd.map(label_one_hot)
trainRDD, testRDD = dataRDD.randomSplit([1 - float(args.sample_ratio), float(args.sample_ratio)], seed=args.seed)

# build the TensorFlowOnSpark cluster
cluster = TFCluster.run(sc,
                        softmax_dist.map_fun,
                        args,
                        args.cluster_size,  # number of nodes in the cluster
                        num_ps,
                        args.tensorboard,
                        TFCluster.InputMode.SPARK,
                        log_dir=args.model)
print("{0} ===== Train Start".format(datetime.now().isoformat()))
# train the model
cluster.train(trainRDD, args.epochs)
# shut down the TensorFlowOnSpark cluster gracefully
cluster.shutdown(grace_secs=30)
print("{0} ===== Train Stop".format(datetime.now().isoformat()))

# build a new TensorFlowOnSpark cluster for inference
args.mode = "inference"
cluster = TFCluster.run(sc, softmax_dist.map_fun, args,
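# Hypothetical sketch of the label_one_hot mapper assumed above (the actual
# feature extraction depends on the DataFrame schema): it emits
# (features, one_hot_label) pairs sized by args.label_count, where
# feature_alias is assumed to list the feature column names.
def label_one_hot(row):
    one_hot = [0.0] * args.label_count
    one_hot[int(row[label_name])] = 1.0
    features = [float(row[c]) for c in feature_alias]
    return (features, one_hot)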
import mnist_dist

sc = SparkContext(conf=SparkConf().setAppName("mnist_tf"))
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1
num_ps = 1

parser = argparse.ArgumentParser()
parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=0)
parser.add_argument("-f", "--format", help="example format: (csv|pickle|tfr)", choices=["csv", "pickle", "tfr"], default="tfr")
parser.add_argument("-i", "--images", help="HDFS path to MNIST images in parallelized format")
parser.add_argument("-l", "--labels", help="HDFS path to MNIST labels in parallelized format")
parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/test", default="mnist_model")
parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions")
parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1)
parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000)
parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true")
parser.add_argument("-X", "--mode", help="train|inference", default="train")
parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
args = parser.parse_args()
print("args:", args)

print("{0} ===== Start".format(datetime.now().isoformat()))
cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps,
                        args.tensorboard, TFCluster.InputMode.TENSORFLOW)
cluster.shutdown()
print("{0} ===== Stop".format(datetime.now().isoformat()))
            duration = time.time() - self._start_time
            loss_value = run_values.results
            if self._step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), self._step, loss_value,
                                    examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.train_dir,
            hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                   tf.train.NanTensorHook(loss),
                   _LoggerHook()],
            config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(train_op)


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("cifar10_train"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 0
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, False,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
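# For context, a sketch of the surrounding _LoggerHook (assumed, following the
# stock TensorFlow CIFAR-10 example): the timing/logging body shown above is
# its after_run method.
class _LoggerHook(tf.train.SessionRunHook):
    def begin(self):
        self._step = -1
        self._start_time = time.time()

    def before_run(self, run_context):
        self._step += 1
        return tf.train.SessionRunArgs(loss)  # request the loss value each step

    def after_run(self, run_context, run_values):
        ...  # timing/logging logic as shown above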
parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions") parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1) parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000) parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true") parser.add_argument("-X", "--mode", help="train|inference", default="train") parser.add_argument("-c", "--rdma", help="use rdma connection", default=False) args = parser.parse_args() print("args:", args) print("{0} ===== Start".format(datetime.now().isoformat())) cluster = TFCluster.run(sc, mnist_dist_dataset.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))
import resnet_cifar_dist

if __name__ == '__main__':
    # tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # absl_app.run(main)
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    from tensorflowonspark import TFCluster
    import argparse

    sc = SparkContext(conf=SparkConf().setAppName("resnet_cifar"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
    parser.add_argument("--num_ps", help="number of parameter servers", type=int, default=0)
    parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
    args, rem = parser.parse_known_args()

    cluster = TFCluster.run(sc, resnet_cifar_dist.main_fun, rem, args.cluster_size, args.num_ps,
                            args.tensorboard, TFCluster.InputMode.TENSORFLOW, master_node='chief')
    cluster.shutdown()
        train_tensor,
        logdir=FLAGS.train_dir,
        master=server.target,
        is_chief=(FLAGS.task == 0),
        init_fn=_get_init_fn(),
        summary_op=summary_op,
        number_of_steps=FLAGS.max_number_of_steps,
        log_every_n_steps=FLAGS.log_every_n_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        summary_writer=summary_writer,
        sync_optimizer=optimizer if FLAGS.sync_replicas else None)


if __name__ == '__main__':
    import argparse

    sc = SparkContext(conf=SparkConf().setAppName("train_image_classifier"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--num_ps_tasks", help="number of PS nodes", type=int, default=0)
    parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
    parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
    (args, rem) = parser.parse_known_args()

    assert num_executors > args.num_ps_tasks

    cluster = TFCluster.run(sc, main_fun, sys.argv, args.cluster_size, args.num_ps_tasks,
                            args.tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
help="HDFS path to save/load model during train/inference", default="mnist_model") parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions") parser.add_argument("--num_ps", help="number of PS nodes in cluster", type=int, default=1) parser.add_argument("--steps", help="maximum number of steps", type=int, default=1000) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) cluster = TFCluster.run(sc, main, args, args.cluster_size, args.num_ps, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, log_dir=args.model, master_node='master') cluster.shutdown()
        self.epochs = conf["NN.epochs"]
        self.cluster_size = conf["NN.cluster_size"]
        self.steps = conf["NN.steps"]

args = argsClass()
print("{0} ===== Start".format(datetime.now().isoformat()))

if args.mode == "train" or args.mode == "inference":
    datafile = sqlContext.read.format("parquet").load(args.input_file)
    labelRDD = datafile.rdd.map(lambda row: row._2._1)
    featureRDD = datafile.rdd.map(lambda row: row._2._2)
    dataRDD = featureRDD.zip(labelRDD)
else:
    datafile = sqlContext.read.format("parquet").load(args.input_file)
    sha256RDD = datafile.rdd.map(lambda row: row._1)
    featureRDD = datafile.rdd.map(lambda row: row._2._2)
    dataRDD = featureRDD.zip(sha256RDD)

cluster = TFCluster.run(sc, NN_dist.map_fun, args, args.cluster_size, num_ps, False,
                        TFCluster.InputMode.SPARK)
if args.mode == "train":
    cluster.train(dataRDD, args.epochs)
else:
    labelRDD = cluster.inference(dataRDD)
    labelRDD.saveAsTextFile(args.output)
cluster.shutdown()
print("{0} ===== Stop".format(datetime.now().isoformat()))
sep=",", inferSchema=True, header=False) else: print('Wrong value for parameter mode. Allowed: all, nlp or train') quit() if (args.mode == 'all') or (args.mode == 'train'): # Cache the dataset before training train_rdd = train_df.rdd.cache() # Train the model in the Spark cluster print('{} Starting model training'.format(datetime.now())) num_ps = 1 cluster = TFCluster.run(sc, main_fun, args, num_executors, num_ps, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model_folder, master_node='master', reservation_timeout=60) cluster.train(train_rdd, args.epochs, feed_timeout=4800) print('{} End model training'.format(datetime.now())) cluster.shutdown() print('{} End program'.format(datetime.now()))
help="number of ps nodes", type=int, default=1) parser.add_argument("--task_num", help="number of worker nodes", type=int, default=1) parser.add_argument("--max_steps", help="max number of steps to train", type=int, default=2000000) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) assert (args.num_ps + args.task_num == num_executors) cluster = TFCluster.run(sc, main_func, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='master') cluster.shutdown()
# arguments for Spark and TFoS
parser = argparse.ArgumentParser()
parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=executors)
parser.add_argument("--num_ps", help="number of ps nodes", type=int, default=1)
(args, remainder) = parser.parse_known_args()

# construct an ARGV (with script name as first element) from remaining args
# and pass it to the TF processes on executors
remainder.insert(0, __file__)
print("spark args:", args)
print("tf args:", remainder)

num_workers = args.cluster_size - args.num_ps
print("===== num_executors={}, num_workers={}, num_ps={}".format(
    args.cluster_size, num_workers, args.num_ps))

cluster = TFCluster.run(sc, main_fun, remainder, args.cluster_size, args.num_ps, False,
                        TFCluster.InputMode.TENSORFLOW, master_node='master')
cluster.shutdown()
def unit2():
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    import argparse
    import os
    import numpy
    import sys
    import tensorflow as tf
    import threading
    from datetime import datetime

    from hops import util
    from hops import hdfs
    from tensorflowonspark import TFCluster

    sc = spark.sparkContext
    num_executors = util.num_executors(spark)
    num_ps = util.num_param_servers(spark)

    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=0)
    parser.add_argument("-f", "--format", help="example format: (csv|pickle|tfr)", choices=["csv", "pickle", "tfr"], default="csv")
    parser.add_argument("-i", "--images", help="HDFS path to MNIST images in parallelized format",
                        default='/Projects/' + hdfs.project_name() + '/mnist/train/images')
    parser.add_argument("-l", "--labels", help="HDFS path to MNIST labels in parallelized format",
                        default='/Projects/' + hdfs.project_name() + '/mnist/train/labels')
    parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/test", default="mnist_model")
    parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
    parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions")
    parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1)
    parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000)
    parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true")
    parser.add_argument("-X", "--mode", help="train|inference", default="train")
    parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
    args = parser.parse_args()
    print("args:", args)

    print("{0} ===== Start".format(datetime.now().isoformat()))
    cluster = TFCluster.run(sc, mnist_fun, args, args.cluster_size, num_ps,
                            args.tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
    print("{0} ===== Stop".format(datetime.now().isoformat()))
    args.cluster_size, num_workers, args.num_ps))
sys.argv = args

# generate type_constrain.txt file
n_n()

if args.mode == 'train':
    if is_new_batch():
        feed_batch()
    try:
        os.remove(os.path.join(args.output_path, "stop.txt"))
    except OSError:
        pass

if args.debug:
    print("Launching jobs...")
elapsed_time = time.time()
cluster = TFCluster.run(sc, distribute_training.main_fun, args, args.cluster_size, args.num_ps,
                        True, TFCluster.InputMode.TENSORFLOW)
cluster.shutdown(timeout=-1)
elapsed_time = time.time() - elapsed_time
with open(os.path.join(args.output_path, 'time.txt'), 'w') as f:
    f.write("Elapsed time: " + str(elapsed_time) + "\n")

if args.mode == 'train':
    if is_new_batch():
        remove_batch_files()
    if args.debug:
        print("Restoring the best model found during training...")
    if path.exists(os.path.join(args.output_path, "stop.txt")):
        step = None
        with open(os.path.join(args.output_path, "stop.txt"), "r") as f:
            step = int(f.readline().strip())
"--tensorboardlogdir", help= "Tensorboard log directory. It should on hdfs. Thus, it must be prefixed with hdfs://default" ) args = parser.parse_args() print("args:", args) print("{0} ===== Start".format(datetime.now().isoformat())) dataRDD = sc.textFile( args.data).map(lambda ln: [x for x in ln.split('\t')]) cluster = TFCluster.run(sc, criteo_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model) if args.mode == "train": cluster.train(dataRDD, args.epochs) else: labelRDD = cluster.inference(dataRDD) labelRDD.saveAsTextFile(args.output) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))
"--num_layers", help="Number of LSTM hidden layers.", type=int, default=2) parser.add_argument("-hu", "--hidden_units", help="Number of units in LSTM hidden layer.", type=int, default=128) args = parser.parse_args() redis_logger_handler.logging_setup(args.redis) logging.info("===== Start") images, labels = parseFile(args.images, args.labels, args.format) dataRDD = images.zip(labels) args.train_size = labels.count() - args.test_size logging.info(args) cluster = TFCluster.run(sc, lstm_ctc_ocr_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK) if args.mode == "train": cluster.train(dataRDD, args.epochs) else: labelRDD = cluster.inference(dataRDD) labelRDD.saveAsTextFile(args.output) cluster.shutdown() logger.info("===== Stop")
    return audio


# Trial data paths
paths = 'data/local/bird,data/local/bed,data/local/cat'

# Loading data into memory
baseAudio = sc.binaryFiles(paths)

parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", help="number of records per batch", type=int, default=1)
parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=1)
# parser.add_argument("--epochs", help="number of epochs", type=int, default=3)
# parser.add_argument("--images_labels", help="path to MNIST images and labels in parallelized format")
# parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model")
parser.add_argument("--export_dir", help="path to export saved_model", default="speechModel")
# parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
args = parser.parse_args()

# Applying transformations to data
convertedAndLabeledAudio = baseAudio.map(lambda x: [carveClassName(x[0]), binaryToNumerical(x[1])])
transformedAudio = convertedAndLabeledAudio.map(lambda x: [x[0], fourierTransformation(x[1])])

# Defining cluster
cluster = TFCluster.run(sc, mainFun, args, num_executors=1, num_ps=0, tensorboard=True,
                        input_mode=TFCluster.InputMode.SPARK, master_node='chief')

# Training on cluster
cluster.train(transformedAudio, num_epochs=3)

# Shutting down cluster after training is complete
cluster.shutdown()
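# Hypothetical sketches of the helpers used above (they are not shown in this
# snippet): carveClassName extracts the label from the file path produced by
# sc.binaryFiles, and binaryToNumerical decodes the raw audio bytes (WAV header
# handling omitted; 16-bit PCM assumed).
def carveClassName(path):
    # e.g. 'data/local/bird/0a7c2a8d.wav' -> 'bird'
    return path.split('/')[-2]

def binaryToNumerical(raw_bytes):
    import numpy as np
    return np.frombuffer(raw_bytes, dtype=np.int16)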
            # hooks = [tf.train.StopAtStepHook(
            #     last_step=int(int(self.dataset_size * (1 - self.test_percent) * n_epochs / self.batch_size)))]) as sess:


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_epoch", help="number of current epoch", type=int)
    parser.add_argument("--main_path", help="Path to '../CatDog-CNN-Tensorflow-OnSpark'", required=True, type=str)
    parser.add_argument("--dataset_size", help="Training size to use", type=str)
    parser.add_argument("--batch_size", help="batch size to use", type=int)
    args = parser.parse_args()

    sys.path.append(args.main_path + "/CatDog-CNN-Tensorflow-OnSpark")
    import conv_net

    sc = SparkContext(conf=SparkConf().setAppName("catdog_spark"))
    num_executors = 1
    num_ps = 0
    cluster = TFCluster.run(sc, main_fun, args, num_executors, num_ps, False,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Accuracy per epoch train vs test')
    plt.legend()
    plt.grid(True)
    plt.show()

    plt.plot(train_cost[zoom_point:])
    plt.plot(test_cost[zoom_point:])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Loss per epoch train vs test')
    plt.legend()
    plt.grid(True)
    plt.show()


if __name__ == '__main__':
    # tf.app.run()
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
    args, rem = parser.parse_known_args()

    sc = SparkContext(conf=SparkConf().setAppName("lab4_task6"))
    # num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1
    tensorboard = True
    cluster = TFCluster.run(sc, main_fun, [], 2, num_ps, tensorboard,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
# preds = estimator.predict(input_fn=test_input_fn)
# for pred in preds:
#     print(pred)

from pyspark.ml.linalg import Vectors

if __name__ == '__main__':
    from tensorflowonspark import TFCluster

    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 2
    num_ps = 1
    args = edict({
        "cluster_size": num_executors,
        "num_ps": num_ps,
        "tensorboard": False,
        "model_dir": "/spark/data",
        "epochs": 1,
        "steps": 2000
    })

    # iris RDD
    iris = datasets.load_iris()
    df = spark.createDataFrame(
        [(int(target), Vectors.dense(data)) for target, data in zip(iris.target, iris.data)],
        ['label', 'features'])
    rdd = df.select('features', 'label').rdd.map(tuple)

    cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard,
                            TFCluster.InputMode.SPARK, master_node='master')
    cluster.train(rdd, args.epochs)
    cluster.shutdown()
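# Note: edict above is assumed to be easydict's EasyDict
# (from easydict import EasyDict as edict), which gives attribute-style access
# (args.cluster_size) over a plain dict.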
    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
        num_batches = FLAGS.max_num_batches
    else:
        # This ensures that we make a single pass over all of the data.
        num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))

    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
        checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
        checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Evaluating %s' % checkpoint_path)

    slim.evaluation.evaluate_once(
        master=FLAGS.master,
        checkpoint_path=checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        variables_to_restore=variables_to_restore)


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("eval_image_classifier"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, 0, False,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
parser.add_argument("--input_data", help="HDFS path to input dataset") parser.add_argument("--input_mode", help="method to ingest data: (spark|tf)", choices=["spark", "tf"], default="tf") parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") (args, rem) = parser.parse_known_args() input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW print("{0} ===== Start".format(datetime.now().isoformat())) sc = SparkContext( conf=SparkConf().setAppName('imagenet_distributed_train')) num_executors = int(sc._conf.get("spark.executor.instances")) num_ps = 1 cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, args.tensorboard, input_mode) if input_mode == TFCluster.InputMode.SPARK: dataRDD = sc.newAPIHadoopFile( args.input_data, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", keyClass="org.apache.hadoop.io.BytesWritable", valueClass="org.apache.hadoop.io.NullWritable") cluster.train(dataRDD, args.epochs) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))
sc = SparkContext(conf=SparkConf().setAppName("mnist_tf")) executors = sc._conf.get("spark.executor.instances") num_executors = int(executors) if executors is not None else 1 num_ps = 1 parser = argparse.ArgumentParser() parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=0) parser.add_argument("-f", "--format", help="example format: (csv|pickle|tfr)", choices=["csv","pickle","tfr"], default="tfr") parser.add_argument("-i", "--images", help="HDFS path to MNIST images in parallelized format") parser.add_argument("-l", "--labels", help="HDFS path to MNIST labels in parallelized format") parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/test", default="mnist_model") parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors) parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions") parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1) parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000) parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true") parser.add_argument("-X", "--mode", help="train|inference", default="train") parser.add_argument("-c", "--rdma", help="use rdma connection", default=False) parser.add_argument("-p", "--driver_ps_nodes", help="run tensorflow PS node on driver locally", default=False) args = parser.parse_args() print("args:",args) print("{0} ===== Start".format(datetime.now().isoformat())) cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW, driver_ps_nodes=args.driver_ps_nodes, log_dir=args.model) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))
default="mnist_model") parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) # create RDD of input data def parse(ln): vec = [int(x) for x in ln.split(',')] return (vec[1:], vec[0]) stream = ssc.textFileStream(args.images_labels) images_labels = stream.map(parse) cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=1, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='chief') cluster.train( images_labels, feed_timeout=86400 ) # extend feed timeout to 24hrs for streaming data to arrive ssc.start() cluster.shutdown(ssc)
        while True:
            eval_once(saver, summary_writer, top_k_op, summary_op)
            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)

    # cifar10.maybe_download_and_extract()
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)
    evaluate()


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("cifar10_eval"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 2
    num_ps = 0

    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
    args, unknown = parser.parse_known_args()

    cluster = TFCluster.run(sc, main_fun, sys.argv, args.cluster_size, num_ps, False,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Accuracy per epoch train vs test')
    plt.legend()
    plt.grid(True)
    plt.show()

    plt.plot(train_cost[zoom_point:])
    plt.plot(test_cost[zoom_point:])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Loss per epoch train vs test')
    plt.legend()
    plt.grid(True)
    plt.show()


if __name__ == '__main__':
    # tf.app.run()
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
    args, rem = parser.parse_known_args()

    sc = SparkContext(conf=SparkConf().setAppName("your_app_name"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1
    tensorboard = True
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, tensorboard,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("read hdfs save to hdfs"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    num_ps = 1

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="input hdfs path")
    parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/inference", default="mnist_model")
    parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", default=False)
    parser.add_argument("-b", "--batch_size", help="number of records per batch", type=int, default=100)
    parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=1)
    parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000)
    parser.add_argument("-X", "--mode", help="train|inference", default="train")
    parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
    args = parser.parse_args()
    print("args:", args)

    # read data
    input_data = sc.textFile(args.input).map(lambda ln: [float(x) for x in ln.split(',')])

    cluster = TFCluster.run(sc, map_fun, args, num_executors, num_ps, args.tensorboard,
                            TFCluster.InputMode.SPARK)
    cluster.train(input_data, 1)
    cluster.shutdown()
parser.add_argument("--epochs", help="number of epochs", type=int, default=1) parser.add_argument( "--steps", help="maximum number of steps", type=int, default=1000) args=parser.parse_args() data_loader=TextLoader( sc, args.data_dir, args.batch_size, args.seq_length) args.vocab_size = data_loader.vocab_size defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS") working_dir = os.getcwd() config_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'config.p'), defaultFS, working_dir) sc.parallelize([args]).saveAsPickleFile(config_file) chars_vocab_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'chars_vocab.p'), defaultFS, working_dir) sc.parallelize([data_loader.chars, data_loader.vocab]).saveAsPickleFile(chars_vocab_file) dataRDD=sc.parallelize(data_loader.get_data_for_feeder()) cluster=TFCluster.run(sc, main_fun, args, num_executors, args.num_ps_tasks, TFCluster.InputMode.SPARK) cluster.train(dataRDD, args.epochs) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))
        for epoch in range(FLAGS.num_epoch):
            train_batches = train_reader.yieldBatches()
            print("Epoch: %d" % epoch)
            step = 0
            for dense_x, sparse_idx, sparse_values, y in train_batches:
                start_time = datetime.now()
                _, train_loss, train_auc, summ, _ = sess.run(
                    [train_op, loss, auc_op, summary_op, step_update_op],
                    feed_dict={dense_inputs: dense_x,
                               sparse_inputs: (sparse_idx, sparse_values, shape),
                               labels: y})
                step += 1
                assert not np.isnan(train_loss), 'Model diverged with loss = NaN'
                time_used = datetime.now() - start_time
                if step % FLAGS.display_step == 0:
                    g_step, = sess.run([global_step])
                    print("step: " + str(step) + ", global_step: " + str(g_step))
                    summary_writer.add_summary(summ, g_step)
                    print("Step = {}, Examples = {}, Time = {}, Minibatch Loss = {}, Auc = {}".format(
                        g_step, g_step * FLAGS.batch_size, time_used, train_loss, train_auc))
                    sys.stdout.flush()

        total_time = datetime.now() - begin_time
        print("Training Done!!")
        print("Total time used: {}".format(total_time))


if __name__ == "__main__":
    sc = SparkContext(conf=SparkConf().setAppName("tfos_online_train_distributed"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 64
    tensorboard = False
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, tensorboard,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
parser.add_argument("--epochs", help="number of epochs", type=int, default=3) parser.add_argument("--model_dir", help="path to save model/checkpoint", default="mnist_model") parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=469) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, master_node='chief') cluster.shutdown()
parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=300) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) if args.input_mode == 'tf': cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir) else: # args.input_mode == 'spark': cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model_dir) images = sc.textFile( args.images).map(lambda ln: [float(x) for x in ln.split(',')]) labels = sc.textFile(
if args.format == "tfr": images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", keyClass="org.apache.hadoop.io.BytesWritable", valueClass="org.apache.hadoop.io.NullWritable") def toNumpy(bytestr): example = tf.train.Example() example.ParseFromString(bytestr) features = example.features.feature image = numpy.array(features['image'].int64_list.value) label = numpy.array(features['label'].int64_list.value) return (image, label) dataRDD = images.map(lambda x: toNumpy(str(x[0]))) else: if args.format == "csv": images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')]) labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')]) else: # args.format == "pickle": images = sc.pickleFile(args.images) labels = sc.pickleFile(args.labels) print("zipping images and labels") dataRDD = images.zip(labels) cluster = TFCluster.run(sc, mnist_dist2.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK) if args.mode == "train": cluster.train(dataRDD, args.epochs) else: labelRDD = cluster.inference(dataRDD) labelRDD.saveAsTextFile(args.output) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))
    if not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_distributed_train.train(server.target, dataset, cluster_spec, ctx)


if __name__ == '__main__':
    # parse arguments needed by the Spark driver
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", help="number of epochs", type=int, default=0)
    parser.add_argument("--input_data", help="HDFS path to input dataset")
    parser.add_argument("--input_mode", help="method to ingest data: (spark|tf)", choices=["spark", "tf"], default="tf")
    parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
    (args, rem) = parser.parse_known_args()

    input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW

    print("{0} ===== Start".format(datetime.now().isoformat()))
    sc = SparkContext(conf=SparkConf().setAppName('imagenet_distributed_train'))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1

    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, args.tensorboard, input_mode)
    if input_mode == TFCluster.InputMode.SPARK:
        dataRDD = sc.newAPIHadoopFile(args.input_data,
                                      "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                                      keyClass="org.apache.hadoop.io.BytesWritable",
                                      valueClass="org.apache.hadoop.io.NullWritable")
        cluster.train(dataRDD, args.epochs)
    cluster.shutdown()
    print("{0} ===== Stop".format(datetime.now().isoformat()))
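# Note on the InputMode.SPARK branch above: the records delivered to each
# worker's DataFeed are the (BytesWritable, NullWritable) pairs produced by
# TFRecordFileInputFormat, i.e. serialized tf.train.Example protos that the
# map_fun is expected to parse itself.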