def __init__(self, batch_size, gpus, init_value, name_, embedding_type,
             optimizer_type, max_vocabulary_size_per_gpu, opt_hparams,
             update_type, atomic_update, scaler, slot_num, max_nnz,
             max_feature_num, embedding_vec_size, combiner, num_dense_layers,
             input_buffer_reset=False):
    super(PluginSparseModel, self).__init__()

    self.num_dense_layers = num_dense_layers
    self.input_buffer_reset = input_buffer_reset
    self.batch_size = batch_size
    self.slot_num = slot_num
    self.embedding_vec_size = embedding_vec_size
    self.gpus = gpus

    # Make sure init() is only called once. It creates the resource manager
    # for the embedding plugin.
    hugectr_tf_ops_v2.init(visible_gpus=gpus, seed=0, key_type='int64',
                           value_type='float', batch_size=batch_size,
                           batch_size_eval=len(gpus))

    # Create one embedding layer; its embedding_name stays unique even when
    # there is more than one embedding layer.
    self.embedding_name = hugectr_tf_ops_v2.create_embedding(
        init_value=init_value, name_=name_, embedding_type=embedding_type,
        optimizer_type=optimizer_type,
        max_vocabulary_size_per_gpu=max_vocabulary_size_per_gpu,
        opt_hparams=opt_hparams, update_type=update_type,
        atomic_update=atomic_update, scaler=scaler, slot_num=slot_num,
        max_nnz=max_nnz, max_feature_num=max_feature_num,
        embedding_vec_size=embedding_vec_size, combiner=combiner)

    # Define the dense part of this DNN model.
    self.dense_layers = []
    for _ in range(self.num_dense_layers - 1):
        self.dense_layers.append(
            tf.keras.layers.Dense(units=1024, activation='relu'))
    self.out_layer = tf.keras.layers.Dense(
        units=1, activation='sigmoid', use_bias=True,
        kernel_initializer='glorot_normal', bias_initializer='glorot_normal')
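# NOTE: the rest of PluginSparseModel is not reproduced in this section. The
# build()/call() below are only a hypothetical sketch of how the forward pass
# could be wired, based on the members defined in __init__ above and on the
# fprop_experimental signature used in the test further down; the actual
# implementation may use a different fprop variant or reshaping.
def build(self, _):
    # bp_trigger is a dummy trainable weight that lets tf.GradientTape drive
    # the plugin's backward pass.
    self.bp_trigger = self.add_weight(name="bp_trigger", shape=(1,),
                                      dtype=tf.float32, trainable=True)

@tf.function
def call(self, row_offset, values, nnz, training=True):
    replica_ctx = tf.distribute.get_replica_context()
    # Embedding lookup on this replica; the output is assumed to have shape
    # [batch_size, slot_num, embedding_vec_size].
    embedding = hugectr_tf_ops_v2.fprop_experimental(
        self.embedding_name, replica_ctx.replica_id_in_sync_group,
        row_offset, values, nnz, self.bp_trigger,
        input_buffer_reset=self.input_buffer_reset)
    hidden = tf.reshape(
        embedding,
        [self.batch_size, self.slot_num * self.embedding_vec_size])
    for dense_layer in self.dense_layers:
        hidden = dense_layer(hidden)
    return self.out_layer(hidden)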
def _v2_fprop_v1_test():
    print("[INFO]: Testing plugin_v2 fprop_experimental vs tf..")
    if vocabulary_size < slot_num:
        raise ValueError("vocabulary_size must be >= slot_num.")

    # generate initial values
    init_value, input_keys = generate_embedding_init_value_and_inputs()

    # -------------------------------- hugectr ops ------------------------------------ #
    class TestModel(tf.keras.models.Model):
        def __init__(self, init_value, name_, embedding_type, optimizer_type,
                     max_vocabulary_size_per_gpu, opt_hparams, update_type,
                     atomic_update, scaler, slot_num, max_nnz, max_feature_num,
                     embedding_vec_size, combiner):
            super(TestModel, self).__init__()

            self.input_buffer_reset = True if "distributed" == embedding_type else False

            self.embedding_name = hugectr_tf_ops_v2.create_embedding(
                init_value=init_value, name_=name_,
                embedding_type=embedding_type, optimizer_type=optimizer_type,
                max_vocabulary_size_per_gpu=max_vocabulary_size_per_gpu,
                opt_hparams=opt_hparams, update_type=update_type,
                atomic_update=atomic_update, scaler=scaler, slot_num=slot_num,
                max_nnz=max_nnz, max_feature_num=max_feature_num,
                embedding_vec_size=embedding_vec_size, combiner=combiner)

        def build(self, _):
            self.bp_trigger = self.add_weight(name="bp_trigger", shape=(1,),
                                              dtype=tf.float32, trainable=True)

        @tf.function
        def call(self, row_offset, values, nnz, training=True):
            replica_ctx = tf.distribute.get_replica_context()
            result = hugectr_tf_ops_v2.fprop_experimental(
                self.embedding_name, replica_ctx.replica_id_in_sync_group,
                row_offset, values, nnz, self.bp_trigger,
                input_buffer_reset=self.input_buffer_reset)
            return result

    hugectr_tf_ops_v2.init(visible_gpus=gpus, seed=0, key_type='int64',
                           value_type='float', batch_size=batch_size,
                           batch_size_eval=len(gpus))

    strategy = tf.distribute.MirroredStrategy(
        devices=['/GPU:' + str(i) for i in gpus])
    with strategy.scope():
        hugectr_model = TestModel(
            init_value=init_value, name_='test_embedding',
            embedding_type=embedding_type, optimizer_type='Adam',
            max_vocabulary_size_per_gpu=(vocabulary_size // len(gpus)) * 2 + 1,
            opt_hparams=[0.1, 0.9, 0.99, 1e-5], update_type='Global',
            atomic_update=True, scaler=1.0, slot_num=slot_num,
            max_nnz=max_nnz, max_feature_num=slot_num * max_nnz,
            embedding_vec_size=embedding_vec_size, combiner='sum')

        opt = tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9,
                                       beta_2=0.99, epsilon=1e-5)

    # preprocess inputs
    dataset_utils = CreateDataset(dataset_names=None, feature_desc=None,
                                  batch_size=batch_size, n_epochs=None,
                                  slot_num=slot_num, max_nnz=max_nnz,
                                  convert_to_csr=None, gpu_count=len(gpus),
                                  embedding_type=embedding_type,
                                  get_row_indices=None)
    if "distributed" == embedding_type:
        row_offsets, value_tensors, nnz_array = \
            dataset_utils._distribute_keys_for_distributed(input_keys)
    elif "localized" == embedding_type:
        row_offsets, value_tensors, nnz_array = \
            dataset_utils._distribute_keys_for_localized(input_keys)
    else:
        raise ValueError("Not supported embedding_type %s." % embedding_type)

    # forward function
    @tf.function
    def hugectr_train_step(row_offset, values, nnz):
        with tf.GradientTape() as tape:
            forward_result = hugectr_model(row_offset, values, nnz)
        grads = tape.gradient(forward_result, hugectr_model.trainable_weights)
        opt.apply_gradients(zip(grads, hugectr_model.trainable_weights))
        return forward_result

    # -------------------------------- tf ops ------------------------------------------- #
    reshape_input_keys = np.reshape(input_keys, [-1, max_nnz])
    tf_indices = tf.where(reshape_input_keys != -1)
    tf_values = tf.gather_nd(reshape_input_keys, tf_indices)
    sparse_tensor = tf.sparse.SparseTensor(tf_indices, tf_values,
                                           reshape_input_keys.shape)

    tf_embedding_layer = OriginalEmbedding(
        vocabulary_size=vocabulary_size,
        embedding_vec_size=embedding_vec_size,
        initializer=init_value, combiner='sum', gpus=gpus)

    tf_opt = tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9,
                                      beta_2=0.99, epsilon=1e-5)

    @tf.function
    def tf_train_step(sparse_tensor):
        with tf.GradientTape() as tape:
            tf_forward = tf_embedding_layer(
                sparse_tensor,
                output_shape=[batch_size, slot_num, embedding_vec_size])
        grads = tape.gradient(tf_forward, tf_embedding_layer.trainable_weights)
        tf_opt.apply_gradients(zip(grads, tf_embedding_layer.trainable_weights))
        return tf_forward

    # ------------------ comparison ---------------------------------------------------- #
    for iteration in range(2):
        replica_row_offsets = PerReplica(row_offsets)
        replica_values = PerReplica(value_tensors)
        replica_nnz = PerReplica(nnz_array)

        hugectr_forward = strategy.run(hugectr_train_step,
                                       args=(replica_row_offsets,
                                             replica_values, replica_nnz))
        if len(gpus) > 1:
            hugectr_forward = tf.concat(hugectr_forward.values, axis=0)

        tf_forward = tf_train_step(sparse_tensor)

        try:
            tf.debugging.assert_near(hugectr_forward, tf_forward,
                                     rtol=1e-4, atol=1e-5)
        except tf.errors.InvalidArgumentError as error:
            raise error
        else:
            print("[INFO]: The results from HugeCTR and tf in iteration %d are the same"
                  % (iteration + 1))

    # --------------------- release resources -------------------------------------- #
    hugectr_tf_ops_v2.reset()
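# NOTE: generate_embedding_init_value_and_inputs() is defined elsewhere in the
# tutorial. The stand-in below is only a hypothetical sketch that matches how
# its outputs are used above: init_value is a dense
# [vocabulary_size, embedding_vec_size] table, and input_keys is a
# [batch_size, slot_num, max_nnz] array of int64 keys padded with -1 for
# missing values; the real helper may draw keys differently.
def generate_embedding_init_value_and_inputs():
    init_value = np.float32(
        np.random.normal(size=[vocabulary_size, embedding_vec_size]))
    # Random keys in [0, vocabulary_size); mask roughly half of the positions
    # with -1 to mimic a variable number of valid keys per slot.
    input_keys = np.random.randint(
        low=0, high=vocabulary_size,
        size=[batch_size, slot_num, max_nnz]).astype(np.int64)
    mask = np.random.choice([True, False], size=input_keys.shape)
    input_keys[mask] = -1
    return init_value, input_keys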
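# NOTE: the module-level hyper-parameters referenced above (gpus, batch_size,
# vocabulary_size, slot_num, max_nnz, embedding_vec_size, embedding_type) are
# set elsewhere in the tutorial. The entry point below is only an illustrative
# configuration for running the comparison test as a script; adjust the values
# to your own setup.
if __name__ == "__main__":
    gpus = [0, 1]
    batch_size = 16
    slot_num = 10
    max_nnz = 4
    embedding_vec_size = 16
    vocabulary_size = 1024
    embedding_type = "distributed"
    _v2_fprop_v1_test()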