def main(_):
    """Train a toy linear regression under AutoDist with the AllReduce strategy.

    Synthetic targets are generated as ``inputs * 3.0 + 2.0 + noise``. The
    graph (two scalar variables, a mean-squared-error loss and one gradient
    step) is built inside ``autodist.scope()`` and then executed for a fixed
    number of epochs through AutoDist's distributed session, printing the loss
    and the current bias after every run.

    Args:
        _: Ignored.
    """
    autodist = AutoDist(resource_spec_file, AllReduce(128))

    true_w, true_b = 3.0, 2.0
    num_examples = 1000
    num_epochs = 10

    # Keep the draw order (inputs, then noise) so seeded runs stay comparable.
    inputs = np.random.randn(num_examples)
    noises = np.random.randn(num_examples)
    outputs = inputs * true_w + true_b + noises

    class MyIterator:
        """Minimal iterator look-alike that always hands back the full input array."""

        def initialize(self):
            return tf.zeros(1)

        def get_next(self):
            # a fake one
            return inputs

    inputs_iterator = MyIterator()
    print('I am going to a scope.')
    with tf.Graph().as_default(), autodist.scope():
        W = tf.Variable(5.0, name='W', dtype=tf.float64)
        b = tf.Variable(0.0, name='b', dtype=tf.float64)

        def train_step(batch):
            """Build loss and update ops for one batch; returns (loss, train_op, b)."""

            def predict(x):
                return W * x + b

            def mean_squared_error(predicted_y, desired_y):
                return tf.reduce_mean(tf.square(predicted_y - desired_y))

            # The version string is expected to have exactly three dot-separated parts.
            major_version, _minor, _patch = tf.version.VERSION.split('.')
            optimizer = (tf.train.GradientDescentOptimizer(0.01)
                         if major_version == '1'
                         else tf.optimizers.SGD(0.01))

            with tf.GradientTape() as tape:
                loss = mean_squared_error(predict(batch), outputs)
                variables_to_train = [W, b]
                # Gradients come from tf.gradients, not from the tape opened above.
                gradients = tf.gradients(loss, variables_to_train)
                train_op = optimizer.apply_gradients(zip(gradients, variables_to_train))
            return loss, train_op, b

        fetches = train_step(inputs_iterator.get_next())
        session = autodist.create_distributed_session()
        for _epoch in range(num_epochs):
            loss_value, _, b_value = session.run(fetches)
            print('node: {}, loss: {}\nb:{}'.format(
                autodist._cluster.get_local_address(), loss_value, b_value))
    print('I am out of scope')
def train_main(args):
    """Train the graph model on this worker's partition through an AutoDist session.

    Reads ``meta.yml`` from ``args.path``, initialises the parameter-server
    communication for this worker's rank, builds the placeholders and model
    inside ``autodist.scope()``, and then cycles over the prepared sub-graphs
    until ``args.num_epoch`` epochs have been counted. An "epoch" is counted
    each time the accumulated number of masked nodes exceeds this rank's node
    count from the metadata.

    Args:
        args: Parsed arguments; this function reads ``path``, ``num_epoch``
            and ``batch_size``.
    """
    autodist = AutoDist(resource_spec_file, Parallaxx())
    # NOTE(review): FullLoader can construct arbitrary Python objects; if
    # meta.yml can ever come from an untrusted source, use yaml.safe_load.
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = ad.get_worker_communicate().nrank()
    distributed.ps_init(rank, nrank)
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx, epoch, nnodes = 0, 0, 0
    graph_len = graphs[0][0].y.shape[0]
    with tf.Graph().as_default() as g, autodist.scope():
        norm_adj = tf.compat.v1.sparse.placeholder(tf.float32, name="norm_adj")
        sparse_feature = tf.placeholder(tf.int32, [graph_len, meta["feature"] - 1])
        y_ = tf.placeholder(tf.int32, [graph_len], name="y_")
        train_mask = tf.placeholder(tf.float32, [graph_len], name="train_mask")
        loss, y, train_op = model(norm_adj, sparse_feature, y_, train_mask)
        sess = autodist.create_distributed_session()
        acc_stat = []
        start = time.time()
        while True:
            g_sample, mp_val, mask, mask_eval = graphs[idx]
            idx = (idx + 1) % ngraph
            feed_dict = {
                norm_adj: mp_val,
                # The last feature column is dropped, matching the placeholder width.
                sparse_feature: g_sample.x[:, 0:-1],
                y_: g_sample.y,
                train_mask: mask
            }
            print("Before training")
            loss_val = sess.run([loss, y, y_, train_op], feed_dict=feed_dict)
            print(loss_val)
            pred_val = loss_val[1]
            true_val = loss_val[2]
            # np.float was only an alias of the builtin float and was removed
            # in NumPy 1.24; np.float64 gives the same dtype on every version.
            acc_val = np.equal(np.argmax(pred_val, 1), true_val).astype(np.float64)
            acc_stat.append(acc_val)
            nnodes += mask.sum() + mask_eval.sum()
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Acc : ", np.mean(acc_stat), "Time : ", time.time() - start)
                start = time.time()
                acc_stat = []
                if epoch >= num_epoch:
                    break
def train_and_save():
    """Train the sentiment model on IMDB with PartitionedPS and save a checkpoint.

    The IMDB reviews are padded to 256 tokens, streamed through a shuffled,
    batched, repeating one-shot iterator, and the model's train op is run for
    ``max_steps`` steps inside an AutoDist distributed session. Throughput and
    loss are printed every ``log_frequency`` steps; afterwards the AutoDist
    saver writes a checkpoint and the embedding and first gradient are printed.
    """
    spec_path = os.path.join(os.path.dirname(__file__), 'resource_spec.yml')
    autodist = AutoDist(spec_path, PartitionedPS())

    imdb_splits = tf.keras.datasets.imdb.load_data(num_words=vocab_size)
    (train_data, train_labels), (test_data, test_labels) = imdb_splits

    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    train_data = pad_sequences(train_data, value=0, padding='post', maxlen=256)
    test_data = pad_sequences(test_data, value=0, padding='post', maxlen=256)
    train_labels = train_labels.astype(np.float32)

    with tf.Graph().as_default(), autodist.scope():  # AutoDist code
        dataset = tf.compat.v1.data.Dataset.from_tensor_slices((train_data, train_labels))
        dataset = dataset.shuffle(25000).batch(batch_size).repeat()
        my_iterator = dataset.make_one_shot_iterator().get_next()

        model = SimpleModel()
        prev_time = time.time()
        # fetch train_op and loss
        loss_fn, train_op, gradients = model.train_fn(my_iterator)

        saver = autodist_saver()
        sess = autodist.create_distributed_session()
        for local_step in range(max_steps):
            run_options = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.NO_TRACE)
            loss, _ = sess.run(fetches=[loss_fn, train_op], options=run_options)
            if local_step % log_frequency != 0:
                continue
            cur_time = time.time()
            elapsed_time = cur_time - prev_time
            num_sentences = batch_size * log_frequency
            wps = float(num_sentences) / elapsed_time
            print("Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f"
                  % (local_step, cur_time - prev_time, wps, loss))
            prev_time = cur_time

        saver.save(sess, checkpoint_dir + "sentiment", global_step=local_step)
        print(sess.run(model.emb))
        print(sess.run(gradients[0]))
    print('ending...')
def run():
    """This wrapper will handle the AutoDist destructor and garbage collections.

    Exit handlers registered so far are cleared, the case is run against a
    fresh AutoDist instance, and the exit handlers registered during the run
    are executed on the way out, whether or not the case raised.
    """
    try:
        # TensorFlow also uses atexit, but running its exitfuncs cause some issues
        atexit._clear()
        autodist_instance = AutoDist(resource_spec_file=r, strategy_builder=s)  # Fixtures in the future
        c.main(autodist_instance)
    finally:
        atexit._run_exitfuncs()
def train_criteo(model, args):
    """Train a CTR model on Criteo data through an AutoDist distributed session.

    Two modes are selected by ``args.all``:

    * full data: 32 passes, each covering roughly a tenth of the batches,
      logging loss / accuracy / AUC per pass to ``logs/tf_plx_<model>.log``;
    * sampled data: 50 epochs over all full batches, printing loss and
      accuracy per epoch and the wall time measured from epoch 5 onwards.

    Args:
        model: Callable ``model(dense_input, sparse_input, y_, partitioner)``
            returning ``(loss, prediction, optimizer)``.
        args: Parsed arguments; this function reads ``all`` and ``model``.
    """
    resource_spec_file = os.path.join(os.path.dirname(__file__), 'settings',
                                      'plx_dist_spec.yml')
    # For a local run, point this at 'settings/plx_local_spec.yml' instead.
    autodist = AutoDist(resource_spec_file, Parallaxx())
    respec = ResourceSpec(resource_spec_file)
    if args.all:
        from models.load_data import process_all_criteo_data
        dense, sparse, all_labels = process_all_criteo_data()
        dense_feature, val_dense = dense
        sparse_feature, val_sparse = sparse
        labels, val_labels = all_labels
    else:
        from models.load_data import process_sampled_criteo_data
        dense_feature, sparse_feature, labels = process_sampled_criteo_data()

    # autodist will split the feeding data
    batch_size = 128
    with tf.Graph().as_default() as g, autodist.scope():
        dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
        sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
        y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])
        # Partition embeddings across nodes only when there is more than one node.
        embed_partitioner = tf.fixed_size_partitioner(len(
            respec.nodes), 0) if len(respec.nodes) > 1 else None
        loss, y, opt = model(dense_input, sparse_input, y_, embed_partitioner)
        train_op = opt.minimize(loss)

        sess = autodist.create_distributed_session()

        # The feed buffers are allocated once and overwritten in place per batch.
        my_feed_dict = {
            dense_input: np.empty(shape=(batch_size, 13)),
            sparse_input: np.empty(shape=(batch_size, 26)),
            y_: np.empty(shape=(batch_size, 1)),
        }

        if args.all:
            raw_log_file = os.path.join(
                os.path.split(os.path.abspath(__file__))[0], 'logs',
                'tf_plx_%s.log' % (args.model))
            print('Processing all data, log to', raw_log_file)
            iterations = dense_feature.shape[0] // batch_size
            total_epoch = 32
            start_index = 0
            # The context manager closes the log even if training raises.
            with open(raw_log_file, 'w') as log_file:
                for ep in range(total_epoch):
                    print("epoch %d" % ep)
                    st_time = time.time()
                    train_loss, train_acc, train_auc = [], [], []
                    # Every pass runs a tenth of the batches; each 10th pass
                    # also picks up the remainder batches.
                    for it in tqdm(
                            range(iterations // 10 +
                                  (ep % 10 == 9) * (iterations % 10))):
                        my_feed_dict[dense_input][:] = dense_feature[
                            start_index:start_index + batch_size]
                        my_feed_dict[sparse_input][:] = sparse_feature[
                            start_index:start_index + batch_size]
                        my_feed_dict[y_][:] = labels[start_index:start_index +
                                                     batch_size]
                        start_index += batch_size
                        # Wrap around before a batch would run past the data.
                        if start_index + batch_size > dense_feature.shape[0]:
                            start_index = 0
                        loss_val = sess.run([loss, y, y_, train_op],
                                            feed_dict=my_feed_dict)
                        pred_val = loss_val[1]
                        true_val = loss_val[2]
                        acc_val = np.equal(true_val, pred_val > 0.5)
                        train_loss.append(loss_val[0])
                        train_acc.append(acc_val)
                        train_auc.append(
                            metrics.roc_auc_score(true_val, pred_val))
                    tra_accuracy = np.mean(train_acc)
                    tra_loss = np.mean(train_loss)
                    tra_auc = np.mean(train_auc)
                    en_time = time.time()
                    train_time = en_time - st_time
                    printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
                        % (tra_loss, tra_accuracy, tra_auc, train_time)
                    print(printstr)
                    log_file.write(printstr + '\n')
                    log_file.flush()
        else:
            iteration = dense_feature.shape[0] // batch_size
            epoch = 50
            for ep in range(epoch):
                print('epoch', ep)
                # The overall timer starts at epoch 5, so 'all time' below
                # excludes the first five epochs.
                if ep == 5:
                    start = time.time()
                ep_st = time.time()
                train_loss = []
                train_acc = []
                for idx in range(iteration):
                    start_index = idx * batch_size
                    my_feed_dict[dense_input][:] = dense_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[sparse_input][:] = sparse_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[y_][:] = labels[start_index:start_index +
                                                 batch_size]
                    loss_val = sess.run([loss, y, y_, train_op],
                                        feed_dict=my_feed_dict)
                    pred_val = loss_val[1]
                    true_val = loss_val[2]
                    if pred_val.shape[1] == 1:  # for criteo case
                        acc_val = np.equal(true_val, pred_val > 0.5)
                    else:
                        # np.float was removed in NumPy 1.24; np.float64 is
                        # the same dtype on every NumPy version.
                        acc_val = np.equal(np.argmax(pred_val, 1),
                                           np.argmax(true_val,
                                                     1)).astype(np.float64)
                    train_loss.append(loss_val[0])
                    train_acc.append(acc_val)
                tra_accuracy = np.mean(train_acc)
                tra_loss = np.mean(train_loss)
                ep_en = time.time()
                print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" %
                      (tra_loss, tra_accuracy, ep_en - ep_st))
            print('all time:', (time.time() - start))
import os import sys import numpy as np import tensorflow as tf from autodist.strategy import PS ############################################################ # Step 1: Construct AutoDist with ResourceSpec from autodist import AutoDist autodist = AutoDist( resource_spec_file='resource_spec.yml', strategy_builder=PS(local_proxy_variable=False, sync=True, staleness=3) ) ############################################################ fashion_mnist = tf.keras.datasets.fashion_mnist (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data() train_images = train_images[:512, :, :, None] test_images = test_images[:512, :, :, None] train_labels = train_labels[:512] test_labels = test_labels[:512] train_images = train_images / np.float32(255) test_images = test_images / np.float32(255) BATCH_SIZE = 64 EPOCHS = 1 ############################################################# # Step 2: Build with Graph mode, and put it under AutoDist scope with tf.Graph().as_default(), autodist.scope(): #############################################################
import os import time import tensorflow as tf import numpy as np from absl import app from tensorflow.core.protobuf import config_pb2 from autodist import AutoDist from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax resource_spec_file = os.path.join(os.path.dirname(__file__), 'resource_spec.yml') autodist = AutoDist(resource_spec_file, PartitionedPS()) vocab_size = 10000 embedding_size = 16 hidden_dim = 16 max_steps = 101 batch_size = 128 log_frequency = 100 class SimpleModel(): def __init__(self): self.emb = tf.Variable(tf.random.uniform([vocab_size, embedding_size]), name='emb', trainable=True, dtype=tf.float32) self.w1 = tf.Variable(tf.random.uniform([embedding_size, hidden_dim]), name='w1', trainable=True,
def train_and_save():
    """
    Train the model and save the serialized model and its weights.

    A linear model is fitted with AllReduce inside an AutoDist scope, then
    exported as a SavedModel to ``EXPORT_DIR`` (any existing directory there is
    removed first) with a predict signature mapping ``input_data`` to ``loss``
    and ``prediction``.
    """
    autodist = AutoDist(resource_spec_file, AllReduce(128))
    print('I am going to a scope.')
    with tf.Graph().as_default(), autodist.scope():
        x = tf.compat.v1.placeholder(shape=[NUM_EXAMPLES], dtype=tf.float64)
        W = tf.Variable(5.0, name='W', dtype=tf.float64)
        b = tf.Variable(0.0, name='b', dtype=tf.float64)

        def linear_model():
            return W * x + b

        def mean_squared_error(predicted_y, desired_y):
            return tf.reduce_mean(tf.square(predicted_y - desired_y))

        # The version string is expected to have exactly three dot-separated parts.
        major_version, _minor, _patch = tf.version.VERSION.split('.')
        optimizer = (tf.train.GradientDescentOptimizer(0.01)
                     if major_version == '1'
                     else tf.optimizers.SGD(0.01))

        with tf.GradientTape() as tape:
            prediction = linear_model()
            loss = mean_squared_error(prediction, outputs)
            trainable = [W, b]
            gradients = tf.gradients(loss, trainable)
            train_op = optimizer.apply_gradients(zip(gradients, trainable))

        ops.add_to_collection(TRAIN_OP_KEY, train_op)
        fetches = [loss, train_op, b, prediction]
        feeds = [x]

        # NOTE: The AutoDist saver should be declared before the wrapped session.
        saver = autodist_saver()
        session = autodist.create_distributed_session()
        for _ in range(EPOCHS):
            loss_value, _, b_value, _ = session.run(fetches, feed_dict={feeds[0]: inputs})
            print('node: {}, loss: {}\nb:{}'.format(
                autodist._cluster.get_local_address(), loss_value, b_value))
        print('I am out of scope')

        tensor_info = saved_model.utils.build_tensor_info
        inputs_info = {"input_data": tensor_info(feeds[0])}
        outputs_info = {
            "loss": tensor_info(fetches[0]),
            "prediction": tensor_info(fetches[3]),
        }
        serving_signature = saved_model.signature_def_utils.build_signature_def(
            inputs=inputs_info,
            outputs=outputs_info,
            method_name=saved_model.signature_constants.PREDICT_METHOD_NAME)
        default_key = saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
        signature_map = {default_key: serving_signature}

        # Remove any previous export before building into EXPORT_DIR.
        if os.path.exists(EXPORT_DIR):
            shutil.rmtree(EXPORT_DIR)
        builder = SavedModelBuilder(EXPORT_DIR)
        builder.add_meta_graph_and_variables(
            sess=session,
            tags=[TAG_NAME],
            saver=saver,
            signature_def_map=signature_map)
        builder.save()
def run(flags_obj):
    """
    Run an ImageNet CNN training loop with Keras models under AutoDist.

    Builds the selected Keras application model and an Adam optimizer inside
    ``autodist.scope()``, then runs ``train_op`` through an AutoDist
    distributed session, reporting per-batch loss to a ``TimeHistory`` summary.

    Args:
        flags_obj: Parsed flags. Read here: ``autodist_patch_tf``,
            ``cnn_model``, ``autodist_strategy``, ``proxy``, ``enable_xla``,
            ``use_tensor_lr``, ``batch_size``, ``data_dir``, ``train_epochs``,
            ``train_steps`` and the input-pipeline options passed to
            ``input_fn``.

    Raises:
        ValueError: If the strategy or model name is not supported, or if
            fp16 is requested on TensorFlow 1.

    Returns:
        None.
    """
    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    # NOTE(review): other call sites in this code base set this variable to
    # 'True'/'False' rather than '1'/'0'; confirm which encoding AutoDist parses.
    if flags_obj.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = '1'
    else:
        os.environ['AUTODIST_PATCH_TF'] = '0'

    # Per-model chunk size handed to the AllReduce / Parallax builders.
    chunk = {
        'vgg16': 25,
        'resnet101': 200,
        'inceptionv3': 30,
    }.get(flags_obj.cnn_model, 512)

    if flags_obj.autodist_strategy == 'PS':
        autodist = AutoDist(resource_spec_file,
                            PS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PSLoadBalancing':
        autodist = AutoDist(
            resource_spec_file,
            PSLoadBalancing(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PartitionedPS':
        autodist = AutoDist(
            resource_spec_file,
            PartitionedPS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'AllReduce':
        autodist = AutoDist(resource_spec_file, AllReduce(chunk_size=chunk))
    elif flags_obj.autodist_strategy == 'Parallax':
        autodist = AutoDist(
            resource_spec_file,
            Parallax(chunk_size=chunk, local_proxy_variable=flags_obj.proxy))
    else:
        raise ValueError(
            'the strategy can be only from PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax'
        )
    #########################################################################

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        # Reject the unsupported combination before the process-wide
        # mixed-precision policy is created and installed.
        if not keras_utils.is_v2_0():
            raise ValueError('--dtype=fp16 is not supported in TensorFlow 1.')
        loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale=loss_scale)
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)
    elif dtype == tf.bfloat16:
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)

    input_fn = imagenet_preprocessing.input_fn

    drop_remainder = flags_obj.enable_xla

    # Constant learning rate by default; replaced by a schedule below if requested.
    if 'vgg' in flags_obj.cnn_model:
        lr_schedule = 0.01
    else:
        lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)

    #########################################################################
    # Build with Graph mode, and put all under AutoDist scope.
    with tf.Graph().as_default(), autodist.scope():
        ##########################################################################
        train_input_dataset = input_fn(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=flags_obj.batch_size,
            num_epochs=flags_obj.train_epochs,
            parse_record_fn=imagenet_preprocessing.parse_record,
            datasets_num_private_threads=flags_obj.
            datasets_num_private_threads,
            dtype=dtype,
            drop_remainder=drop_remainder,
            tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
            training_dataset_cache=flags_obj.training_dataset_cache,
        )

        if flags_obj.cnn_model == 'resnet101':
            model = tf.keras.applications.ResNet101(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'vgg16':
            model = tf.keras.applications.VGG16(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'inceptionv3':
            model = tf.keras.applications.InceptionV3(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'densenet121':
            model = tf.keras.applications.DenseNet121(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        else:
            raise ValueError('Other Model Undeveloped')

        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule,
                                             beta_1=0.9,
                                             beta_2=0.999,
                                             epsilon=1e-08)

        train_input_iterator = tf.compat.v1.data.make_one_shot_iterator(
            train_input_dataset)
        train_input, train_target = train_input_iterator.get_next()

        steps_per_epoch = (imagenet_preprocessing.NUM_IMAGES['train'] //
                           flags_obj.batch_size)
        train_epochs = flags_obj.train_epochs

        # The train_steps flag only applies to single-epoch runs.
        if train_epochs <= 1 and flags_obj.train_steps:
            steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
            train_epochs = 1

        train_output = model(train_input, training=True)
        scc_loss = tf.keras.losses.SparseCategoricalCrossentropy()
        loss = scc_loss(train_target, train_output)
        var_list = variables.trainable_variables() + \
            ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        grad = optimizer.get_gradients(loss, var_list)
        train_op = optimizer.apply_gradients(zip(grad, var_list))

        #####################################################################
        # Create distributed session.
        #   Instead of using the original TensorFlow session for graph execution,
        #   let's use AutoDist's distributed session, in which a computational
        #   graph for distributed training is constructed.
        #
        # [original line]
        # >>> sess = tf.compat.v1.Session()
        #
        sess = autodist.create_distributed_session()
        #####################################################################

        summary = TimeHistory(flags_obj.batch_size, steps_per_epoch)
        for epoch_id in range(train_epochs):
            summary.on_epoch_begin(epoch_id)
            for batch_id in range(steps_per_epoch):
                summary.on_batch_begin(batch_id)
                loss_v, _ = sess.run([loss, train_op])
                summary.on_batch_end(batch_id, loss_v)
            summary.on_epoch_end(epoch_id)
        summary.on_train_end()

    return
def main(_):
    """Configure AutoDist from flags and launch BERT pre-training.

    Selects the strategy builder named by ``FLAGS.autodist_strategy``, reads
    the node and GPU counts from the resource spec file, routes absl logging
    to a file under ``/tmp/logs`` whose name encodes the configuration, and
    calls ``run_bert_pretrain``.

    Args:
        _: Ignored.

    Raises:
        ValueError: If the strategy name is unknown, or if the resource spec
            lacks the ``nodes`` list or the first node's ``gpus`` list.
    """
    assert tf.version.VERSION.startswith('2.')

    if not FLAGS.model_dir:
        FLAGS.model_dir = '/tmp/bert/'

    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    if FLAGS.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = 'True'
    else:
        os.environ['AUTODIST_PATCH_TF'] = 'False'
    resource_spec_file = os.path.join(os.path.dirname(__file__),
                                      '../resource_spec.yml')

    if FLAGS.autodist_strategy == 'PS':
        strategy = AutoDist(resource_spec_file,
                            PS(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'PSLoadBalancing':
        strategy = AutoDist(resource_spec_file,
                            PSLoadBalancing(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'PartitionedPS':
        strategy = AutoDist(resource_spec_file,
                            PartitionedPS(local_proxy_variable=FLAGS.proxy))
    elif FLAGS.autodist_strategy == 'AllReduce':
        strategy = AutoDist(resource_spec_file,
                            AllReduce(chunk_size=FLAGS.chunk_size))
    elif FLAGS.autodist_strategy == 'Parallax':
        strategy = AutoDist(
            resource_spec_file,
            Parallax(chunk_size=FLAGS.chunk_size,
                     local_proxy_variable=FLAGS.proxy))
    else:
        raise ValueError(
            'the strategy can be only from PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax'
        )

    strategy.num_replicas_in_sync = strategy._resource_spec.num_gpus

    if strategy:
        print('***** Number of cores used : ', strategy.num_replicas_in_sync)

    # Close the spec file deterministically instead of leaking the handle.
    with open(resource_spec_file, 'r') as spec_file:
        resource_info = yaml.safe_load(spec_file)

    # A missing key raises KeyError (TypeError/IndexError for an empty or
    # malformed document), never ValueError, so catch those and fail with the
    # intended hint instead of a later NameError on node_num / gpu_num.
    try:
        node_num = len(resource_info['nodes'])
    except (KeyError, TypeError) as err:
        raise ValueError("nodes need to be set in specficiation file") from err

    try:
        gpu_num = len(resource_info['nodes'][0]['gpus'])
    except (KeyError, IndexError, TypeError) as err:
        raise ValueError("gpus need to be set in specficiation file") from err
    #########################################################################

    logdir = '/tmp/logs'
    # exist_ok avoids the check-then-create race between concurrent workers.
    os.makedirs(logdir, exist_ok=True)

    logname = 'bert_strategy_{}_node_{}_gpu_{}_patch_{}_proxy_{}'.format(
        FLAGS.autodist_strategy, node_num, gpu_num, FLAGS.autodist_patch_tf,
        FLAGS.proxy)

    logging.get_absl_handler().use_absl_log_file(logname, logdir)
    # start running
    run_bert_pretrain(strategy, gpu_num, node_num)
def train_main(args):
    """Train the graph model on this worker's partition, tracking masked accuracy.

    Reads ``meta.yml`` from ``args.path``, initialises parameter-server
    communication for this worker's rank, builds the model on ``gpu:0`` inside
    ``autodist.scope()``, and cycles over the prepared sub-graphs until
    ``args.num_epoch`` reporting periods have elapsed. A period ends each time
    the accumulated masked-node count exceeds a tenth of this rank's node
    count; evaluation accuracy and training accuracy are printed and reset.

    Args:
        args: Parsed arguments; this function reads ``path``, ``num_epoch``
            and ``batch_size``.
    """
    resource_spec_file = os.path.join(os.path.dirname(__file__),
                                      '../../examples/ctr/settings',
                                      'plx_dist_spec.yml')
    autodist = AutoDist(resource_spec_file, Parallaxx())
    # NOTE(review): FullLoader can construct arbitrary Python objects; if
    # meta.yml can ever come from an untrusted source, use yaml.safe_load.
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = ad.get_worker_communicate().nrank()
    distributed.ps_init(rank, nrank)
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx, epoch, nnodes = 0, 0, 0
    worker_device = "gpu:0"
    graph_len = graphs[0][0].y.shape[0]
    with tf.Graph().as_default() as g, autodist.scope():
        with tf.device(worker_device):
            norm_adj = tf.compat.v1.sparse.placeholder(tf.float32,
                                                       name="norm_adj")
            sparse_feature = tf.placeholder(tf.int32,
                                            [graph_len, meta["feature"] - 1])
            y_ = tf.placeholder(tf.int32, [graph_len], name="y_")
            train_mask = tf.placeholder(tf.float32, [graph_len],
                                        name="train_mask")
            loss, y, train_op = model(norm_adj, sparse_feature, y_, train_mask)

        # The AutoDist session replaces a plain tf.Session plus an explicit
        # global-variables initializer run.
        sess = autodist.create_distributed_session()

        acc_cnt, total_cnt = 0, 0
        train_acc, train_cnt = 0, 0
        start = time.time()
        while True:
            g_sample, mp_val, mask, mask_eval = graphs[idx]
            idx = (idx + 1) % ngraph
            feed_dict = {
                norm_adj: mp_val,
                # The last feature column is dropped, matching the placeholder width.
                sparse_feature: g_sample.x[:, 0:-1],
                y_: g_sample.y,
                train_mask: mask
            }
            loss_val = sess.run([loss, y, train_op], feed_dict=feed_dict)
            pred_val = loss_val[1]
            # np.float was only an alias of the builtin float and was removed
            # in NumPy 1.24; np.float64 gives the same dtype on every version.
            acc_val = np.equal(np.argmax(pred_val, 1),
                               g_sample.y).astype(np.float64)
            acc_cnt += (acc_val * mask_eval).sum()
            total_cnt += mask_eval.sum()
            nnodes += mask.sum() + mask_eval.sum()
            train_acc += (acc_val * mask).sum()
            train_cnt += mask.sum()
            if nnodes > meta["partition"]["nodes"][rank] // 10:
                nnodes = 0
                epoch += 1
                print("Acc : ", acc_cnt / total_cnt, train_acc / train_cnt,
                      "Time : ",
                      time.time() - start)
                start = time.time()
                acc_cnt, total_cnt = 0, 0
                train_acc, train_cnt = 0, 0
                if epoch >= num_epoch:
                    break
import language_model from autodist import AutoDist from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax FLAGS = flags.FLAGS flags.DEFINE_string("logdir", "/tmp/lm1b", "Logging directory.") flags.DEFINE_string("datadir", "/tmp/dataset/lm1b", "Data directory.") flags.DEFINE_integer("eval_steps", 70, "Number of eval steps.") flags.DEFINE_integer('max_steps', 1000000, """Number of iterations to run for each workers.""") flags.DEFINE_integer('log_frequency', 100, """How many steps between two runop logs.""") resource_spec_file = os.path.join(os.path.dirname(__file__), '../resource_spec.yml') autodist = AutoDist(resource_spec_file, PS()) def gen_lm1b_train_dataset(file_pattern, num_step): """ Returns: The training dataset (tf.data.Dataset) that has been repeated and shuffled """ file_names = [] for file_name in glob.glob(file_pattern): file_names.append(file_name) if not file_names: raise ValueError # create dataset ops BUFFER_SIZE = 100000
def main():
    """Train the neural matrix-factorisation model on MovieLens-25M via AutoDist.

    Loads the ``ml-25m`` data, builds ``neural_mf`` inside ``autodist.scope()``
    and trains it for 7 epochs in batches of 1024 through an AutoDist
    distributed session, writing the mean training loss and wall time of each
    epoch, and the total time, to ``logs/tfplx.txt``.
    """
    resource_spec_file = os.path.join(os.path.dirname(__file__),
                                      '../ctr/settings', 'plx_dist_spec.yml')
    autodist = AutoDist(resource_spec_file, Parallaxx())
    respec = ResourceSpec(resource_spec_file)

    def validate():
        """Return mean hit ratio and NDCG over the test users (100 items per user)."""
        hits, ndcgs = [], []
        for idx in range(num_users):
            start_index = idx * 100
            my_feed_dict = {
                user_input: testUserInput[start_index:start_index + 100],
                item_input: testItemInput[start_index:start_index + 100],
            }
            predictions = sess.run([y], feed_dict=my_feed_dict)
            map_item_score = {
                testItemInput[start_index + i]: predictions[0][i]
                for i in range(100)
            }

            # Evaluate top rank list
            ranklist = heapq.nlargest(topK,
                                      map_item_score,
                                      key=map_item_score.get)
            # The first item of each 100-item slice is used as the target item.
            hr = getHitRatio(ranklist, testItemInput[start_index])
            ndcg = getNDCG(ranklist, testItemInput[start_index])
            hits.append(hr)
            ndcgs.append(ndcg)
        hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hr, ndcg

    from movielens import getdata
    trainData, testData = getdata('ml-25m', 'datasets')
    testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32),
                              100)
    testItemInput = testData.reshape((-1, ))
    num_users, num_items = {
        'ml-1m': (6040, 3706),
        'ml-20m': (138493, 26744),
        'ml-25m': (162541, 59047),
    }['ml-25m']
    batch_size = 1024
    num_negatives = 4
    topK = 10

    with tf.Graph().as_default() as g, autodist.scope():
        user_input = tf.compat.v1.placeholder(tf.int32, [
            None,
        ])
        item_input = tf.compat.v1.placeholder(tf.int32, [
            None,
        ])
        y_ = tf.compat.v1.placeholder(tf.float32, [
            None,
        ])

        loss, y, opt = neural_mf(user_input, item_input, y_, num_users,
                                 num_items)
        train_op = opt.minimize(loss)

        # The AutoDist session replaces a plain tf.compat.v1.Session plus an
        # explicit global-variables initializer run.
        sess = autodist.create_distributed_session()

        log = Logging(
            path=os.path.join(os.path.dirname(__file__), 'logs', 'tfplx.txt'))
        epoch = 7
        iterations = trainData['user_input'].shape[0] // batch_size

        start = time.time()
        for ep in range(epoch):
            ep_st = time.time()
            log.write('epoch %d' % ep)
            train_loss = []
            for idx in range(iterations):
                start_index = idx * batch_size
                my_feed_dict = {
                    user_input:
                    trainData['user_input'][start_index:start_index +
                                            batch_size],
                    item_input:
                    trainData['item_input'][start_index:start_index +
                                            batch_size],
                    y_:
                    trainData['labels'][start_index:start_index + batch_size],
                }

                loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict)
                train_loss.append(loss_val[0])

            tra_loss = np.mean(train_loss)
            ep_en = time.time()

            # Validation is currently disabled; call validate() here to log HR / NDCG.
            printstr = "train_loss: %.4f, train_time: %.4f" % (tra_loss,
                                                               ep_en - ep_st)
            log.write(printstr)
        # Every other log.write call passes one pre-formatted string; this one
        # was written print-style with two positional arguments.
        log.write('all time: %.4f' % (time.time() - start))
import tensorflow as tf
import tensorflow.contrib.layers as layers
import numpy as np
import os
from tensorflow.python import debug as tf_debug
from tensorflow.python.ops import math_ops
from autodist import AutoDist
from autodist.strategy import PS

# Build the AutoDist instance from the resource spec that sits next to this
# file, using the PS strategy builder with its default arguments.
resource_spec_file = os.path.join(os.path.dirname(__file__), 'resource_spec.yml')
autodist = AutoDist(resource_spec_file, PS())

# Start from a clean default graph before any model ops are created.
tf.reset_default_graph()

# Network parameters
n_input = 2000  # input size for a single sample (2000 words), train3000, test1000

# Hyperparameters
batch_size = 256
eta = 0.001  # learning rate
max_epoch = 27

# 1. get data (using same dataset as keras example)
# NOTE(review): this standalone-keras import is not used by the call below,
# which goes through tf.keras.datasets.imdb instead.
from keras.datasets import imdb
# n_input doubles as the num_words argument of the loader.
(train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data(
    num_words=n_input)
import os import numpy as np import tensorflow as tf ############################################################ # Step 1: Construct AutoDist with ResourceSpec from autodist import AutoDist filepath = os.path.join(os.path.dirname(__file__), 'resource_spec.yml') autodist = AutoDist(resource_spec_file=filepath) ############################################################ fashion_mnist = tf.keras.datasets.fashion_mnist (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data() train_images = train_images[:512, :, :, None] test_images = test_images[:512, :, :, None] train_labels = train_labels[:512] test_labels = test_labels[:512] train_images = train_images / np.float32(255) test_images = test_images / np.float32(255) BATCH_SIZE = 64 EPOCHS = 1 ############################################################# # Step 2: Build with Graph mode, and put it under AutoDist scope with tf.Graph().as_default(), autodist.scope(): ############################################################# train_dataset = tf.data.Dataset.from_tensor_slices( (train_images, train_labels)).repeat(EPOCHS).shuffle(
def test_single_process(tmp_resource_spec):
    """Constructing a second AutoDist in the same process must raise NotImplementedError."""
    from autodist import AutoDist

    spec_kwargs = {'resource_spec_file': tmp_resource_spec}

    # The first construction is expected to succeed.
    AutoDist(**spec_kwargs)

    # A second one in the same process is rejected.
    with pytest.raises(NotImplementedError):
        AutoDist(**spec_kwargs)
def run_ncf(FLAGS):
    """Run NCF training and eval with Keras under AutoDist.

    Builds the AutoDist instance for the strategy named by
    ``FLAGS.autodist_strategy``, prepares the input pipeline, model and
    optimizer inside ``autodist.scope()``, and delegates the loop to
    ``run_ncf_custom_training``.

    Args:
        FLAGS: Parsed flags object.

    Raises:
        ValueError: If the resource spec lacks the ``nodes`` list or the first
            node's ``gpus`` list, or if the strategy or optimizer name is not
            supported.

    Returns:
        Whatever ``run_ncf_custom_training`` returns.
    """
    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    resource_spec_file = os.path.join(
        os.path.dirname(__file__),
        '../resource_spec.yml')
    # Close the spec file deterministically instead of leaking the handle.
    with open(resource_spec_file, 'r') as spec_file:
        resource_info = yaml.safe_load(spec_file)

    # A missing key raises KeyError (TypeError/IndexError for an empty or
    # malformed document), never ValueError, so catch those and fail with the
    # intended hint.
    try:
        node_num = len(resource_info['nodes'])
    except (KeyError, TypeError) as err:
        raise ValueError("nodes need to be set in specficiation file") from err

    try:
        gpu_num = len(resource_info['nodes'][0]['gpus'])
    except (KeyError, IndexError, TypeError) as err:
        raise ValueError("gpus need to be set in specficiation file") from err

    # NOTE(review): other call sites in this code base set this variable to
    # 'True'/'False' rather than '1'/'0'; confirm which encoding AutoDist parses.
    if FLAGS.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = '1'
    else:
        os.environ['AUTODIST_PATCH_TF'] = '0'

    local_proxy_variable = bool(FLAGS.proxy)

    if FLAGS.autodist_strategy == 'PS':
        autodist = AutoDist(
            resource_spec_file, PS(
                local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'PSLoadBalancing':
        autodist = AutoDist(resource_spec_file, PSLoadBalancing(
            local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'PartitionedPS':
        autodist = AutoDist(resource_spec_file, PartitionedPS(
            local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'AllReduce':
        autodist = AutoDist(resource_spec_file, AllReduce(chunk_size=256))
    elif FLAGS.autodist_strategy == 'Parallax':
        autodist = AutoDist(
            resource_spec_file,
            Parallax(
                chunk_size=256,
                local_proxy_variable=local_proxy_variable))
    else:
        raise ValueError(
            'the strategy can be only from PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax')
    #########################################################################

    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    model_helpers.apply_clean(FLAGS)

    if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
        policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16",
            loss_scale=flags_core.get_loss_scale(
                FLAGS, default_for_fp16="dynamic"))
        tf.keras.mixed_precision.experimental.set_policy(policy)

    params = ncf_common.parse_flags(FLAGS)
    params["distribute_strategy"] = None

    batch_size = params["batch_size"]
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]

    producer, input_meta_data = None, None
    # Without a pre-built training dataset, inputs are generated by a producer.
    generate_input_online = params["train_dataset_path"] is None

    if generate_input_online:
        num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]

    params["num_users"], params["num_items"] = num_users, num_items

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)

    with tf.Graph().as_default(), autodist.scope():
        (train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps) = (
            ncf_input_pipeline.create_ncf_input_data(params, producer, input_meta_data, None))
        steps_per_epoch = None if generate_input_online else num_train_steps
        keras_model = _get_keras_model(params)
        if FLAGS.optimizer == 'adam':
            optimizer = tf.keras.optimizers.Adam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
        elif FLAGS.optimizer == 'sgd':
            optimizer = tf.keras.optimizers.SGD(
                learning_rate=params["learning_rate"])
        elif FLAGS.optimizer == 'lazyadam':
            optimizer = LazyAdam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
        else:
            raise ValueError('Do not support other optimizers...')

        if FLAGS.fp16_implementation == "graph_rewrite":
            optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer,
                loss_scale=flags_core.get_loss_scale(FLAGS,
                                                     default_for_fp16="dynamic"))
        elif FLAGS.dtype == "fp16" and params["keras_use_ctl"]:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer,
                tf.keras.mixed_precision.experimental.global_policy().loss_scale)

        return run_ncf_custom_training(
            params,
            autodist,
            keras_model,
            optimizer,
            callbacks,
            train_input_dataset,
            eval_input_dataset,
            num_train_steps,
            num_eval_steps,
            generate_input_online=generate_input_online,
            return_simulation=FLAGS.simulation_strategy_id is not None)
import tensorflow.compat.v1 as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model
from tensorflow.keras.datasets import imdb
import numpy as np
from autodist import AutoDist
from autodist.strategy import PS

# Build AutoDist from 'resource_spec.yml' (a path relative to the working
# directory) with an explicitly configured PS strategy builder.
autodist = AutoDist(resource_spec_file='resource_spec.yml',
                    strategy_builder=PS(local_proxy_variable=False, sync=True, staleness=1)
                    # Alternative: strategy_builder=PS() for the builder's defaults.
                    )
# Short alias for the AutoDist instance.
d = autodist

# Network parameters
n_input = 800  # input size for a single sample (800 words)

# Hyperparameters
batch_size = 128
eta = 0.001  # learning rate
max_epoch = 20

# 1. get data
# NOTE(review): the literal 800 duplicates n_input above; keep the two in sync.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=800)
def run_test(resource, strategy, case):
    """Run one distributed test case with the given resource spec and strategy.

    Args:
        resource: Path of the resource spec file handed to AutoDist.
        strategy: Key into ``STRATEGIES_FOR_DISTRIBUTED_TESTS`` selecting the
            strategy builder.
        case: Name of the module under the ``cases`` package whose ``main`` is
            called with the AutoDist instance.
    """
    banner = "\n>>>>>>>> Running Test: Case:{}, Strategy:{}, ResourceSpec:{} >>>>>>>>\n"
    print(banner.format(case, strategy, resource))

    strategy_builder = STRATEGIES_FOR_DISTRIBUTED_TESTS[strategy]
    autodist_instance = AutoDist(resource_spec_file=resource,
                                 strategy_builder=strategy_builder)

    case_module = importlib.import_module("cases." + case)
    case_module.main(autodist_instance)

    print('<<<<<<<<<< Test Case Finished. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')