def init(self, task_id):
    ip = "127.0.0.1"
    port = "12345"
    port1 = "12346"

    if 0 == task_id:
        os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    elif 1 == task_id:
        os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
    else:
        raise RuntimeError("task_id can only be one of [0, 1].")

    os.environ['TF_CONFIG'] = json.dumps({
        "cluster": {
            "worker": [ip + ":" + port, ip + ":" + port1]
        },
        "task": {
            "type": "worker",
            "index": task_id
        }
    })

    resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
    self.strategy = tf.distribute.MultiWorkerMirroredStrategy(resolver)
    with self.strategy.scope():
        init_re = sok.Init()
def main():
    global_batch_size = 1024
    slot_num = 10
    nnz_per_slot = 5

    policy = tf.keras.mixed_precision.Policy("mixed_float16")
    tf.keras.mixed_precision.set_global_policy(policy)

    strategy = tf.distribute.MirroredStrategy()

    dataset = utility.get_dataset(global_batch_size, read_batchsize=global_batch_size)
    dataset = strategy.experimental_distribute_dataset(dataset)

    with strategy.scope():
        sok.Init(global_batch_size=global_batch_size)

        model = utility.SOKDenseDemo(max_vocabulary_size_per_gpu=1024,
                                     embedding_vec_size=8,
                                     slot_num=slot_num,
                                     nnz_per_slot=nnz_per_slot,
                                     num_dense_layers=0)

        optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        labels = tf.cast(labels, logits.dtype)
        loss = loss_fn(labels, logits)
        dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss, global_batch_size=global_batch_size)
        return tf.cast(loss, dtype)

    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit = model(inputs, training=True)
            loss = _replica_loss(labels, logit)
            scaled_loss = optimizer.get_scaled_loss(loss)
        emb_vars, other_vars = \
            sok.split_embedding_variable_from_others(model.trainable_variables)
        scaled_emb_grads, scaled_other_grads = tape.gradient(
            scaled_loss, [emb_vars, other_vars])
        emb_grads = optimizer.get_unscaled_gradients(scaled_emb_grads)
        other_grads = optimizer.get_unscaled_gradients(scaled_other_grads)
        with sok.OptimizerScope(emb_vars):
            optimizer.apply_gradients(zip(emb_grads, emb_vars),
                                      experimental_aggregate_gradients=False)
        optimizer.apply_gradients(zip(other_grads, other_vars))
        return loss

    for step, (inputs, labels) in enumerate(dataset):
        replica_loss = strategy.run(train_step, args=(inputs, labels))
        total_loss = strategy.reduce("sum", replica_loss, axis=None)
        print("[INFO]: step {}, loss {}".format(step, total_loss))
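# --- Standalone sketch (not part of the demo above) ---
# Illustrates why _replica_loss passes the *global* batch size:
# tf.nn.compute_average_loss sums the per-example losses on this replica and
# divides by global_batch_size, so summing the per-replica results across all
# replicas yields the true global mean loss. Values here are made up.
import tensorflow as tf

per_example_loss = tf.constant([1.0, 2.0, 3.0, 4.0])   # local batch of 4 examples
avg = tf.nn.compute_average_loss(per_example_loss, global_batch_size=8)
print(float(avg))   # (1 + 2 + 3 + 4) / 8 = 1.25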
def main(args):
    comm_options = None

    if "mirrored" == args.distribute_strategy:
        available_cuda_devices = ",".join(
            [str(gpu_id) for gpu_id in range(args.gpu_num)])
        os.environ["CUDA_VISIBLE_DEVICES"] = available_cuda_devices

        strategy = tf.distribute.MirroredStrategy()
        args.task_id = 0
    elif "multiworker" == args.distribute_strategy:
        args.task_id = int(os.getenv("OMPI_COMM_WORLD_RANK"))
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.task_id)
        args.gpu_num = int(os.getenv("OMPI_COMM_WORLD_SIZE"))

        comm_options = tf.distribute.experimental.CommunicationOptions(
            bytes_per_pack=0,
            timeout_seconds=None,
            implementation=tf.distribute.experimental.CommunicationImplementation.NCCL)

        import json
        port = 12345
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": {
                "worker": [
                    "localhost" + ":" + str(port + i) for i in range(args.gpu_num)
                ]
            },
            "task": {
                "type": "worker",
                "index": args.task_id
            }
        })

        strategy = tf.distribute.MultiWorkerMirroredStrategy(
            communication_options=comm_options)
    elif "horovod" == args.distribute_strategy:
        import horovod.tensorflow as hvd

        hvd.init()
        args.task_id = hvd.local_rank()
        args.gpu_num = hvd.size()
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.task_id)
        strategy = utils.NullStrategy()
    else:
        raise ValueError("Unsupported distribute_strategy. "
                         "Can only be one of ['mirrored', 'multiworker', 'horovod'], "
                         f"but got {args.distribute_strategy}")

    with strategy.scope():
        if args.embedding_layer == "SOK":
            sok.Init(global_batch_size=args.global_batch_size)

        model = DLRM(vocab_size=args.vocab_size_list,
                     num_dense_features=args.num_dense_features,
                     embedding_layer=args.embedding_layer,
                     embedding_vec_size=args.embedding_vec_size,
                     bottom_stack_units=args.bottom_stack,
                     top_stack_units=args.top_stack,
                     TF_MP=args.TF_MP,
                     comm_options=comm_options)

        lr_callable = utils.get_lr_callable(
            global_batch_size=args.global_batch_size,
            decay_exp=args.decay_exp,
            learning_rate=args.learning_rate,
            warmup_steps=args.warmup_steps,
            decay_steps=args.decay_steps,
            decay_start_steps=args.decay_start_steps)

        embedding_optimizer = utils.get_optimizer(args.embedding_optimizer)
        embedding_optimizer.learning_rate = lr_callable
        dense_optimizer = utils.get_optimizer("Adam")

    batch_size = args.global_batch_size if args.distribute_strategy == "mirrored" \
        else args.global_batch_size // args.gpu_num

    if args.distribute_strategy != "mirrored":
        args.train_file_pattern = utils.shard_filenames(args.train_file_pattern,
                                                        args.gpu_num, args.task_id)
        args.test_file_pattern = utils.shard_filenames(args.test_file_pattern,
                                                       args.gpu_num, args.task_id)

    train_dataset = CriteoTsvReader(file_pattern=args.train_file_pattern,
                                    num_dense_features=args.num_dense_features,
                                    vocab_sizes=args.vocab_size_list,
                                    batch_size=batch_size)
    val_dataset = CriteoTsvReader(file_pattern=args.test_file_pattern,
                                  num_dense_features=args.num_dense_features,
                                  vocab_sizes=args.vocab_size_list,
                                  batch_size=batch_size)

    distribute_dataset = (args.distribute_strategy == "mirrored" and args.gpu_num > 1)
    train_dataset = utils.get_distribute_dataset(train_dataset,
                                                 strategy,
                                                 distribute_dataset=distribute_dataset)
    val_dataset = utils.get_distribute_dataset(val_dataset,
                                               strategy,
                                               distribute_dataset=distribute_dataset)
    val_dataset = iter(val_dataset)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(loss,
                                          global_batch_size=args.global_batch_size)

    metrics = [
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Mean("prediction_mean"),
        tf.keras.metrics.Mean("label_mean")
    ]
    metrics_threshold = {"auc": 0.8025}

    @tf.function
    def _train_step(features, labels, first_batch=False):
        with tf.GradientTape() as tape:
            logits = model(features, training=True)
            loss = _replica_loss(labels, logits)

        emb_vars, other_vars = utils.split_embedding_variables_from_others(model)
        emb_grads, other_grads = tape.gradient(loss, [emb_vars, other_vars])

        with tf.control_dependencies([logits] + emb_grads):
            utils.apply_gradients(embedding_optimizer,
                                  emb_vars,
                                  emb_grads,
                                  args.embedding_layer == "SOK",
                                  aggregate_gradients=(not args.TF_MP))

            other_grads = utils.all_reduce(other_grads,
                                           combiner="sum",
                                           comm_options=comm_options)
            utils.apply_gradients(dense_optimizer, other_vars, other_grads, False)

            if first_batch:
                utils.broadcast_variables(other_vars)
                utils.broadcast_variables(dense_optimizer.variables())

                if args.embedding_layer == "TF":
                    utils.broadcast_variables(emb_vars)
                    utils.broadcast_variables(embedding_optimizer.variables())

            total_loss = utils.all_reduce(loss,
                                          combiner="sum",
                                          comm_options=comm_options)
        return total_loss

    @tf.function
    def _val_step(features, labels, metrics):
        val_logits = model(features, training=False)
        val_loss = _replica_loss(labels, val_logits)
        val_loss = utils.all_reduce(val_loss,
                                    combiner="sum",
                                    comm_options=comm_options)

        labels = tf.identity(labels)
        val_logits = utils.all_gather(val_logits, axis=0, comm_options=comm_options)
        labels = utils.all_gather(labels, axis=0, comm_options=comm_options)

        return val_logits, labels, val_loss

    stopper = utils.EarlyStopper()

    begin_time = time.time()
    start_time = begin_time
    for i, (features, labels) in enumerate(train_dataset):
        if i >= args.train_steps:
            break
        if stopper.should_stop():
            print(stopper.stop_reason)
            break

        total_loss = strategy.run(_train_step, args=(features, labels, i == 0))

        if i % args.validation_interval == 0 and i != 0:
            val_features, val_labels = next(val_dataset)
            val_logits, val_labels, val_loss = \
                strategy.run(_val_step, args=(val_features, val_labels, metrics))

            if hasattr(val_labels, "values"):
                val_labels = val_labels.values[0]
                val_logits = val_logits.values[0]

            update_metrics_states(y_true=val_labels, y_pred=val_logits, metrics=metrics)
            val_logs = train_loop_end(metrics,
                                      total_loss,
                                      val_loss,
                                      embedding_optimizer,
                                      dense_optimizer,
                                      global_step=i)

            elapsed_time = time.time() - begin_time
            steps_sec = args.validation_interval / elapsed_time
            utils.show_logs(val_logs, strategy, elapsed_time, steps_sec,
                            metrics_threshold, stopper)
            begin_time = time.time()

    end_time = time.time()
    if args.task_id == 0:
        print(f"With {args.distribute_strategy} + {args.embedding_layer} embedding layer, "
              f"on {args.gpu_num} GPUs, and global_batch_size is {args.global_batch_size}, "
              f"it takes {end_time - start_time} seconds to "
              f"finish {args.train_steps} steps training for DLRM.")
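# --- Standalone sketch (not part of the DLRM script above) ---
# The validation path updates stock Keras metrics with the gathered labels and
# predictions. This minimal example shows the update_state()/result() pattern in
# isolation; the tensors here are made up for illustration.
import tensorflow as tf

auc = tf.keras.metrics.AUC(name="auc")
accuracy = tf.keras.metrics.BinaryAccuracy(name="accuracy")

y_true = tf.constant([[1.0], [0.0], [1.0], [0.0]])
y_pred = tf.constant([[0.9], [0.2], [0.7], [0.4]])   # probabilities in [0, 1]
for metric in (auc, accuracy):
    metric.update_state(y_true, y_pred)
print("auc:", float(auc.result()), "accuracy:", float(accuracy.result()))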
def test_sok_dense_demo(args, init_tensors, *random_samples):
    port = 12345
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": {
            "worker": [args.ips[i] + ":" + str(port + i) for i in range(args.worker_num)]
        },
        "task": {
            "type": "worker",
            "index": args.task_id
        }
    })
    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    with strategy.scope():
        sok.Init(global_batch_size=args.global_batch_size)

        sok_dense_demo = SOKDenseDemo(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            use_hashtable=args.use_hashtable)

        emb_opt = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(emb_opt,
                                                                  initial_scale=1024)

    sok_saver = sok.Saver()
    if 1 == args.restore_params:
        filepath = r"./embedding_variables"
        sok_saver.restore_from_file(sok_dense_demo.embedding_layer.embedding_variable,
                                    filepath)
    else:
        sok_saver.load_embedding_values(sok_dense_demo.embedding_layer.embedding_variable,
                                        init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss,
                                          global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, embedding_vector = sok_dense_demo(inputs, training=True)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
        embedding_variables, other_variable = \
            sok.split_embedding_variable_from_others(sok_dense_demo.trainable_variables)
        grads, emb_grads = tape.gradient(_loss, [other_variable, embedding_variables])
        if args.mixed_precision:
            grads = emb_opt.get_unscaled_gradients(grads)
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(embedding_variables):
                emb_opt.apply_gradients(zip(emb_grads, embedding_variables),
                                        experimental_aggregate_gradients=False)
        else:
            emb_opt.apply_gradients(zip(emb_grads, embedding_variables),
                                    experimental_aggregate_gradients=False)
        dense_opt.apply_gradients(zip(grads, other_variable))
        return loss, embedding_vector

    sok_results = list()

    def _dataset_fn(input_context):
        replica_batch_size = input_context.get_per_replica_batch_size(
            args.global_batch_size)
        dataset = utils.tf_dataset(*random_samples,
                                   batchsize=replica_batch_size,
                                   to_sparse_tensor=False,
                                   repeat=1)
        return dataset

    dataset = strategy.distribute_datasets_from_function(_dataset_fn)

    for i, (input_tensors, replica_labels) in enumerate(dataset):
        print("-" * 30, "step ", str(i), "-" * 30)
        loss, embedding_vector = strategy.run(_train_step,
                                              args=(input_tensors, replica_labels))
        loss = strategy.reduce("sum", loss, axis=None)
        print("[INFO]: iteration {}, loss {}".format(i, loss))
        sok_results.append(embedding_vector)

    # save params to file.
    if 1 == args.save_params:
        filepath = r"./embedding_variables"
        utils.try_make_dirs(filepath, chief=(True if args.task_id == 0 else False))
        sok_saver.dump_to_file(sok_dense_demo.embedding_layer.embedding_variable,
                               filepath)

    return sok_results, sok_dense_demo.embedding_layer.embedding_variable.values[0].m_var_name
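# --- Standalone sketch (not part of the test above) ---
# The mixed-precision branch follows the standard LossScaleOptimizer pattern:
# scale the loss before differentiation, then unscale the gradients before
# applying them. Shown here with a single toy variable instead of the model.
import tensorflow as tf

opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.Adam(learning_rate=0.1), initial_scale=1024)
var = tf.Variable(1.0)
with tf.GradientTape() as tape:
    loss = var * 2.0
    scaled_loss = opt.get_scaled_loss(loss)
scaled_grads = tape.gradient(scaled_loss, [var])
grads = opt.get_unscaled_gradients(scaled_grads)   # gradients back in real scale
opt.apply_gradients(zip(grads, [var]))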
def run_sok_model(args, dense_variables, vocabulary_tensors, samples, labels):
    # split samples and labels
    assert (args.global_batch_size % hvd.size() == 0)
    local_batch_size = args.global_batch_size // hvd.size()
    local_id = hvd.local_rank()
    samples = samples[local_id * local_batch_size:(local_id + 1) * local_batch_size]
    labels = labels[local_id * local_batch_size:(local_id + 1) * local_batch_size]

    sok.Init(global_batch_size=args.global_batch_size)

    model = SOKDenseDemo(
        max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
        embedding_vec_size=args.embedding_vec_size,
        slot_num=args.slot_num,
        nnz_per_slot=args.nnz_per_slot,
        num_dense_layers=args.num_dense_layers,
        num_dense_units=args.num_dense_units)

    # build the model by calling it once, so the dense variables exist before assignment.
    # model.build(input_shape=(local_batch_size, args.slot_num * args.nnz_per_slot * args.embedding_vec_size))
    model(samples, training=False)
    for i in range(args.num_dense_layers):
        model.dense_layers[i].trainable_variables[0].assign(dense_variables[0][i])
        model.dense_layers[i].trainable_variables[1].assign(dense_variables[1][i])

    sok_saver = sok.Saver()
    init_tensors = [tensor.numpy() for tensor in vocabulary_tensors]
    sok_saver.load_embedding_values(model.embedding_layer.embedding_variable,
                                    init_tensors)

    embedding_optimizer = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
    dense_optimizer = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
    if args.mixed_precision:
        embedding_optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
            embedding_optimizer, initial_scale=1024)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss,
                                          global_batch_size=args.global_batch_size)
        return tf.cast(loss, dtype=_dtype)

    @tf.function
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape, tf.GradientTape() as emb_tape:
            logit = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = embedding_optimizer.get_scaled_loss(replica_loss)
            else:
                _loss = replica_loss

        # wrap only the dense-variable tape so its gradients are all-reduced by Horovod
        tape = hvd.DistributedGradientTape(tape)

        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        emb_grads = emb_tape.gradient(_loss, emb_variable)
        grads = tape.gradient(_loss, other_variable)
        if args.mixed_precision:
            emb_grads = embedding_optimizer.get_unscaled_gradients(emb_grads)
            grads = embedding_optimizer.get_unscaled_gradients(grads)

        if 'plugin' not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(zip(emb_grads, emb_variable),
                                                    experimental_aggregate_gradients=False)
        else:
            embedding_optimizer.apply_gradients(zip(emb_grads, emb_variable),
                                                experimental_aggregate_gradients=False)
        dense_optimizer.apply_gradients(zip(grads, other_variable))

        # Note: broadcast should be done after the first gradient step to ensure
        # optimizer initialization.
        if first_batch:
            hvd.broadcast_variables(other_variable, root_rank=0)
            hvd.broadcast_variables(dense_optimizer.variables(), root_rank=0)
        return replica_loss

    loss_list = []
    for i in range(args.iter_num):
        loss = _train_step(samples, labels, i == 0)
        loss_list.append(loss)
        print("[INFO]: Iteration: {}, loss={}".format(i, loss))
    return loss_list
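# --- Standalone sketch (not part of the Horovod demo above) ---
# hvd.allreduce is the collective behind DistributedGradientTape; by default it
# averages a tensor across all ranks. Launch with, e.g.:
#   horovodrun -np 2 python this_script.py        # script name is illustrative
import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()
local_value = tf.constant(float(hvd.rank()))    # a different value on every rank
averaged = hvd.allreduce(local_value)           # same averaged value on every rank
print(f"rank {hvd.rank()}: averaged value = {float(averaged)}")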
def main():
    global_batch_size = 1024
    slot_num = 10
    nnz_per_slot = 5

    from tensorflow.python.keras.engine import base_layer_utils
    base_layer_utils.enable_v2_dtype_behavior()

    policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
    tf.keras.mixed_precision.experimental.set_policy(policy)

    dataset = utility.get_dataset(global_batch_size // hvd.size(),
                                  read_batchsize=global_batch_size // hvd.size())

    sok_init_op = sok.Init(global_batch_size=global_batch_size)

    model = utility.SOKDenseDemo(max_vocabulary_size_per_gpu=1024,
                                 embedding_vec_size=8,
                                 slot_num=slot_num,
                                 nnz_per_slot=nnz_per_slot,
                                 num_dense_layers=0)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
    optimizer = sok.tf.keras.mixed_precision.LossScaleOptimizer(optimizer, 1024)

    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction='none')

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss, global_batch_size=global_batch_size)
        return tf.cast(loss, dtype)

    def train_step(inputs, labels):
        logit = model(inputs, training=True)
        loss = _replica_loss(labels, logit)
        scaled_loss = optimizer.get_scaled_loss(loss)
        scaled_gradients = tf.gradients(scaled_loss, model.trainable_variables)
        emb_vars, other_vars = \
            sok.split_embedding_variable_from_others(model.trainable_variables)
        scaled_emb_grads, scaled_other_grads = \
            scaled_gradients[:len(emb_vars)], scaled_gradients[len(emb_vars):]
        emb_grads = optimizer.get_unscaled_gradients(scaled_emb_grads)
        other_grads = optimizer.get_unscaled_gradients(scaled_other_grads)
        other_grads = [hvd.allreduce(grad) for grad in other_grads]
        with sok.OptimizerScope(emb_vars):
            emb_train_op = optimizer.apply_gradients(zip(emb_grads, emb_vars))
        other_train_op = optimizer.apply_gradients(zip(other_grads, other_vars))

        total_loss = hvd.allreduce(loss)
        with tf.control_dependencies([emb_train_op, other_train_op]):
            return tf.identity(total_loss)

    train_iterator = dataset.make_initializable_iterator()
    iterator_init = train_iterator.initializer
    inputs, labels = train_iterator.get_next()

    loss = train_step(inputs, labels)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    with tf.Session() as sess:
        sess.run(sok_init_op)
        sess.run([init_op, iterator_init])

        for step in range(10):
            loss_v = sess.run(loss)
            if hvd.local_rank() == 0:
                print("[INFO]: step {}, loss {}".format(step, loss_v))
    my_affinity = affinity_map[rank]
    os.sched_setaffinity(0, my_affinity)


if __name__ == '__main__':
    if args.amp:
        print('[Info] use amp mode')
        policy = tf.keras.mixed_precision.Policy('mixed_float16')
        tf.keras.mixed_precision.set_global_policy(policy)

    hvd.init()
    # set_affinity(hvd.rank())

    global_batch_size = args.global_batch_size
    sok.Init(global_batch_size=global_batch_size)

    with open(os.path.join(args.data_dir, 'train/metadata.json'), 'r') as f:
        metadata = json.load(f)
    print(metadata)

    model = DLRM(
        metadata['vocab_sizes'],
        num_dense_features=13,
        embedding_vec_size=128,
        bottom_stack_units=[512, 256, 128],
        top_stack_units=[1024, 1024, 512, 256, 1],
        num_gpus=hvd.size(),
        use_cuda_interact=args.custom_interact,
        compress=args.compress,
    )
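# --- Standalone sketch (not part of the script above) ---
# os.sched_setaffinity pins a process to a set of CPU cores (pid 0 means the
# calling process). It is Linux-only, hence the guard. Core ids are illustrative.
import os

if hasattr(os, "sched_setaffinity"):
    os.sched_setaffinity(0, {0, 1})       # restrict this process to cores 0 and 1
    print(os.sched_getaffinity(0))        # -> {0, 1}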
def test_sok_multi_dense_emb(args):
    comm_options = tf.distribute.experimental.CommunicationOptions(
        bytes_per_pack=0,
        timeout_seconds=None,
        implementation=tf.distribute.experimental.CommunicationImplementation.NCCL)

    if args.worker_num == 1:
        strategy = tf.distribute.MirroredStrategy()
    else:
        port = 12345
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": {
                "worker": [
                    "localhost" + ":" + str(port + i) for i in range(args.worker_num)
                ]
            },
            "task": {
                "type": "worker",
                "index": args.task_id
            }
        })
        strategy = tf.distribute.MultiWorkerMirroredStrategy(
            communication_options=comm_options)

    replica_batch_size = args.global_batch_size // (args.worker_num * 1)
    dataset = utility.TFDataset(filename=args.file_prefix + str(args.task_id) + ".file",
                                batchsize=replica_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    dynamic_input = True if args.dynamic_input == 1 else False

    with strategy.scope():
        sok.Init(global_batch_size=args.global_batch_size)

        model = SOKDenseModel(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size_list=args.embedding_vec_size_list,
            slot_num_list=args.slot_num_list,
            nnz_per_slot_list=[
                args.nnz_per_slot for _ in range(len(args.slot_num_list))
            ],
            num_dense_layers=args.num_dense_layers,
            dynamic_input=dynamic_input)

        emb_opt = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(emb_opt,
                                                                  initial_scale=1024)

    # set initial value to embedding variables.
    sok_saver = sok.Saver()
    for i, layer in enumerate(model.embedding_layers):
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size_list[i],
            num=args.worker_num)
        sok_saver.load_embedding_values(layer.embedding_variable, init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss,
                                          global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, all_vectors = model(inputs, training=True)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        grads, emb_grads = tape.gradient(_loss, [other_variable, emb_variable])
        if args.mixed_precision:
            grads = emb_opt.get_unscaled_gradients(grads)
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                emb_opt.apply_gradients(zip(emb_grads, emb_variable),
                                        experimental_aggregate_gradients=False)
        else:
            emb_opt.apply_gradients(zip(emb_grads, emb_variable),
                                    experimental_aggregate_gradients=False)

        with tf.control_dependencies(emb_grads):
            # manually all-reduce dense gradients
            replica_context = tf.distribute.get_replica_context()
            grads = replica_context.all_reduce("sum", grads, options=comm_options)
            dense_opt.apply_gradients(zip(grads, other_variable),
                                      experimental_aggregate_gradients=False)

            # manually all-reduce the loss; this is OK because replica_loss has
            # already been used to update the local variables.
            loss = replica_context.all_reduce(tf.distribute.ReduceOp.SUM,
                                              loss,
                                              options=comm_options)
        return loss, all_vectors, logit

    # save its results
    sok_results = list()

    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_iter >= 0 and i >= args.stop_iter:
            break

        total_loss, all_vectors, logit = strategy.run(_train_step, args=(inputs, labels))
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))

        with tf.device("CPU:0"):
            sok_results.append(all_vectors)

    return sok_results
def test_sok_demo(args, init_tensors, *random_samples):
    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        result = sok.Init(global_batch_size=args.global_batch_size)

        embedding_initializer = tf.keras.initializers.Ones() if args.use_tf_initializer else None

        plugin_demo = SOKDemo(
            combiner=args.combiner,
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            slot_num=args.slot_num,
            max_nnz=args.max_nnz,
            embedding_vec_size=args.embedding_vec_size,
            use_hashtable=args.use_hashtable,
            key_dtype=args.key_dtype,
            embedding_initializer=embedding_initializer)

        emb_opt = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(emb_opt,
                                                                  initial_scale=1024)

    plugin_saver = sok.Saver()

    if (1 == args.restore_params):  # restore from trained parameters
        filepath = r"./embedding_variables"
        plugin_saver.restore_from_file(plugin_demo.embedding_layer.embedding_variable,
                                       filepath)
    else:
        # initialize using randomized initial value
        if not args.use_tf_initializer and init_tensors:
            status = plugin_saver.load_embedding_values(
                plugin_demo.embedding_layer.embedding_variable, init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss,
                                          global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, embedding_vector = plugin_demo(inputs, training=True)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
        embedding_variables, other_variable = sok.split_embedding_variable_from_others(
            plugin_demo.trainable_variables)
        grads, emb_grads = tape.gradient(_loss, [other_variable, embedding_variables])
        if args.mixed_precision:
            grads = emb_opt.get_unscaled_gradients(grads)
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

        with tf.control_dependencies([*emb_grads]):
            # in case NCCL runs concurrently via SOK and TF
            if 'plugin' not in args.optimizer:
                with sok.OptimizerScope(embedding_variables):
                    emb_opt.apply_gradients(zip(emb_grads, embedding_variables),
                                            experimental_aggregate_gradients=False)
            else:
                emb_opt.apply_gradients(zip(emb_grads, embedding_variables),
                                        experimental_aggregate_gradients=False)
            dense_opt.apply_gradients(zip(grads, other_variable))
        return loss, embedding_vector

    sok_results = list()

    def _dataset_fn(input_context):
        replica_batch_size = input_context.get_per_replica_batch_size(
            args.global_batch_size)
        dataset = utils.tf_dataset(*random_samples,
                                   batchsize=replica_batch_size,
                                   to_sparse_tensor=True,
                                   repeat=1,
                                   args=args)
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)
        return dataset

    dataset = strategy.distribute_datasets_from_function(_dataset_fn)

    for i, (sparse_tensors, replica_labels) in enumerate(dataset):
        print("-" * 30, "step ", str(i), "-" * 30)
        loss, embedding_vector = strategy.run(_train_step,
                                              args=(sparse_tensors, replica_labels))
        loss = strategy.reduce("sum", loss, axis=None)
        print("[INFO]: iteration {}, loss {}".format(i, loss))
        sok_results.append(embedding_vector)

    # save params to file.
    if 1 == args.save_params:
        filepath = r"./embedding_variables/"
        utils.try_make_dirs(filepath)
        plugin_saver.dump_to_file(plugin_demo.embedding_layer.embedding_variable,
                                  filepath)

    return sok_results, plugin_demo.embedding_layer.embedding_variable.values[0].m_var_name
def get_sok_results(args, init_tensors, *random_samples):
    if args.distributed_tool == "onedevice":
        strategy = strategy_wrapper.OneDeviceStrategy()
    elif args.distributed_tool == "horovod":
        import horovod.tensorflow as hvd
        hvd.init()
        strategy = strategy_wrapper.HorovodStrategy()
    else:
        raise ValueError(f"{args.distributed_tool} is not supported.")

    with strategy.scope():
        sok_init_op = sok.Init(global_batch_size=args.global_batch_size)

        embedding_initializer = tf.keras.initializers.Ones() if args.use_tf_initializer else None

        sok_dense_demo = SOKDemo(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            use_hashtable=args.use_hashtable,
            dynamic_input=args.dynamic_input,
            num_of_dense_layers=0,
            key_dtype=args.key_dtype,
            embedding_initializer=embedding_initializer)

        emb_opt = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            emb_opt = sok.tf.keras.mixed_precision.LossScaleOptimizer(emb_opt, 1024)

    sok_saver = sok.Saver()
    restore_op = list()
    for i, embedding_layer in enumerate(sok_dense_demo.embedding_layers):
        control_inputs = [restore_op[-1]] if restore_op else None
        with tf.control_dependencies(control_inputs):
            if args.restore_params:
                filepath = r"./embedding_variables"
                op = sok_saver.restore_from_file(embedding_layer.embedding_variable,
                                                 filepath)
            else:
                if not args.use_tf_initializer:
                    op = sok_saver.load_embedding_values(embedding_layer.embedding_variable,
                                                         init_tensors[i])
                else:
                    op = tf.constant(1.0)
            restore_op.append(op)

    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction='none')

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss,
                                          global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    def _train_step(inputs, labels, training):
        def _step_fn(inputs, labels):
            logit, embedding_vector = sok_dense_demo(inputs, training=training)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
            emb_var, other_var = sok.split_embedding_variable_from_others(
                sok_dense_demo.trainable_variables)
            grads = tf.gradients(_loss,
                                 emb_var + other_var,
                                 colocate_gradients_with_ops=True,
                                 unconnected_gradients=tf.UnconnectedGradients.NONE)
            emb_grads, other_grads = grads[:len(emb_var)], grads[len(emb_var):]
            if args.mixed_precision:
                other_grads = emb_opt.get_unscaled_gradients(other_grads)
                emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

            if "plugin" in args.optimizer:
                emb_train_op = emb_opt.apply_gradients(zip(emb_grads, emb_var))
            else:
                with sok.OptimizerScope(emb_var):
                    emb_train_op = emb_opt.apply_gradients(zip(emb_grads, emb_var))

            with tf.control_dependencies([*emb_grads]):
                # in case NCCL runs concurrently via SOK and horovod
                other_grads = strategy.reduce("sum", other_grads)
                other_train_op = dense_opt.apply_gradients(zip(other_grads, other_var))

            with tf.control_dependencies([emb_train_op, other_train_op]):
                total_loss = strategy.reduce("sum", loss)
                total_loss = tf.identity(total_loss)
                return total_loss, embedding_vector

        return strategy.run(_step_fn, inputs, labels)

    replica_batch_size = args.global_batch_size // args.gpu_num
    dataset = utils.tf_dataset(*random_samples,
                               batchsize=replica_batch_size,
                               to_sparse_tensor=False,
                               repeat=1,
                               args=args)
    train_iterator = dataset.make_initializable_iterator()
    iterator_init = train_iterator.initializer

    inputs, labels = train_iterator.get_next()
    graph_results = _train_step(inputs, labels, training=True)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    if "plugin" in args.optimizer:
        init_op = tf.group(init_op, emb_opt.initializer)

    save_op = list()
    for i, embedding_layer in enumerate(sok_dense_demo.embedding_layers):
        control_inputs = [save_op[-1]] if save_op else None
        with tf.control_dependencies(control_inputs):
            if args.save_params:
                filepath = r"./embedding_variables/"
                utils.try_make_dirs(filepath)
                op = sok_saver.dump_to_file(embedding_layer.embedding_variable, filepath)
            else:
                op = tf.constant(1.0)
            save_op.append(op)

    sok_results = list()

    config = tf.ConfigProto()
    config.log_device_placement = False
    with tf.Session(config=config) as sess:
        sess.run(sok_init_op)
        sess.run([init_op, iterator_init])
        sess.run(restore_op)
        sess.graph.finalize()

        for step in range(args.iter_num):
            loss_v, emb_vector_v = sess.run([*graph_results])
            print("*" * 80)
            print(f"Step: {step}, loss: {loss_v}")  # ", embedding_vector:\n{emb_vector_v}")
            sok_results.append(emb_vector_v)

        sess.run(save_op)

    name = list()
    for embedding_layer in sok_dense_demo.embedding_layers:
        name.append(embedding_layer.embedding_variable.m_var_name)

    return sok_results, name
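# --- Standalone sketch (not part of the test above) ---
# The function above uses the TF1 graph/session workflow: build ops first, then
# run them in a Session. This minimal compat.v1 example shows the same pattern
# without SOK or Horovod.
import tensorflow as tf

tf.compat.v1.disable_eager_execution()
x = tf.compat.v1.placeholder(tf.float32, shape=())
y = x * 2.0
with tf.compat.v1.Session() as sess:
    print(sess.run(y, feed_dict={x: 3.0}))   # -> 6.0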
def __init__(self, **kwargs):
    print("[INFO]: single worker testing.")

    self.strategy = tf.distribute.MirroredStrategy()
    with self.strategy.scope():
        init_re = sok.Init(**kwargs)
def test_sok_multi_dense_emb(args):
    assert (args.global_batch_size % args.worker_num == 0)
    replica_batch_size = args.global_batch_size // (args.worker_num)

    dataset = utility.TFDataset(filename=args.file_prefix + str(args.task_id) + ".file",
                                batchsize=replica_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    dynamic_input = True if args.dynamic_input == 1 else False

    # SOK initialize
    sok.Init(global_batch_size=args.global_batch_size)

    model = SOKDenseModel(
        max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
        embedding_vec_size_list=args.embedding_vec_size_list,
        slot_num_list=args.slot_num_list,
        nnz_per_slot_list=[
            args.nnz_per_slot for _ in range(len(args.slot_num_list))
        ],
        num_dense_layers=args.num_dense_layers,
        dynamic_input=dynamic_input,
        use_hashtable=args.use_hashtable)

    emb_opt = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
    dense_opt = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
    if args.mixed_precision:
        emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(emb_opt,
                                                              initial_scale=1024)

    sok_saver = sok.Saver()
    for i, layer in enumerate(model.embedding_layers):
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size_list[i],
            num=args.worker_num)
        sok_saver.load_embedding_values(layer.embedding_variable, init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss,
                                          global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape:
            logit, all_vectors = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(replica_loss)
            else:
                _loss = replica_loss
        emb_var, other_var = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        emb_grads, grads = tape.gradient(_loss, [emb_var, other_var])
        if args.mixed_precision:
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)
            grads = emb_opt.get_unscaled_gradients(grads)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_var):
                emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                        experimental_aggregate_gradients=False)
        else:
            emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                    experimental_aggregate_gradients=False)

        with tf.control_dependencies(emb_grads):
            grads = [hvd.allreduce(grad) for grad in grads]
            dense_opt.apply_gradients(zip(grads, other_var))

            if first_batch:
                hvd.broadcast_variables(other_var, root_rank=0)
                hvd.broadcast_variables(dense_opt.variables(), root_rank=0)

            total_loss = hvd.allreduce(replica_loss)
        return total_loss, all_vectors

    sok_results = list()
    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_iter >= 0 and i >= args.stop_iter:
            break

        total_loss, all_vectors = _train_step(inputs, labels, 0 == i)
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))

        sok_results.append(all_vectors)
    return sok_results
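# --- Standalone sketch (not part of the test above) ---
# The first-batch broadcast copies rank 0's dense variables (and optimizer slots)
# to all other ranks so every worker starts from identical state. Minimal form:
import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()
w = tf.Variable(float(hvd.rank()))          # deliberately different on each rank
hvd.broadcast_variables([w], root_rank=0)   # now w == 0.0 on every rank
print(f"rank {hvd.rank()}: w = {w.numpy()}")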