def allreduce(val):
    if mpi_size() == 1:
        return val
    return nccl.allreduce(val, num_comms=num_comms(), prereduce=prereduce_size())
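# Illustrative usage sketch (not from the original source; assumes the
# mpi_size()/allreduce helpers above): NCCL reduces with a sum, so averaging a
# value across ranks needs an explicit 1/mpi_size() scale, exactly as the
# model functions below do with their losses.
def allreduce_mean(val):
    # sum val across all ranks, then rescale the sum to a mean
    return allreduce(val) * (1.0 / mpi_size())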
def model(xs, ys):

    with tf.variable_scope("model"):
        with tf.device("/cpu:0"):
            global_step   = tf.Variable(1.0, trainable=False)
            learning_rate = tf.minimum(global_step * tf.constant(1.0 / hps.warmup_iters), 1.0) * tf.constant(hps.lr)

        with tf.device("/gpu:0"):

            # embed discrete inputs to continuous space and add learned position embeddings
            with tf.variable_scope('embed'):
                x_embed = fp16(tf.get_variable(
                    "x", [hps.n_vocab, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.02)))
                pos_embed = fp16(tf.get_variable(
                    'pos', [1, hps.n_timesteps, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.01)))
                h = embedding_lookup(x_embed, xs) + pos_embed

            for l in range(hps.n_layer):
                h = transformer_block(h, 'layer_%d' % l, hps.n_head, hps.n_timesteps)

            # project transformer features to vocab logits with the tied input embedding
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            labels = tf.reshape(ys, [-1])

            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=fp32(logits), labels=tf.cast(labels, tf.int32))
            loss = tf.reduce_mean(loss)

            params = tf.trainable_variables()
            grads  = tf.gradients(loss * hps.cost_scale, params)

            mpi_scale = 1.0 / mpi_size

            if mpi_size > 1:
                loss = allreduce(loss) * mpi_scale

                # all-reduce grads in groups, ordered to match the backward pass
                group_allreduce(grads, params,
                                search_strings=["layer_%d" % l for l in range(hps.n_layer - 1, -1, -1)] + ["embed"],
                                cast_all=tf.float16)

            # for tuning fp16 cost scaling
            if hps.log_stats and mpi_rank == 0:
                for i, (grad, param) in enumerate(zip(grads, params)):
                    name = param.op.name + "_" + "_".join(str(x) for x in param.shape.as_list())
                    grads[i] = log_stats(grad, tf.cast(global_step, tf.int32),
                                         logfile="scale_stats.txt", name=name)

            # use adafactor for most params and adam for embeddings
            fact_grads = list()
            adam_grads = list()
            for grad, param in zip(grads, params):
                if "embed" in param.op.name:
                    # For the input embedding, only update the param + running stats when an
                    # embedding vector was selected by the input: more stable learning for
                    # rarely used embedding entries.
                    # if "x" in param.op.name:
                    #     grad.lazy = True
                    adam_grads.append((grad, param))
                else:
                    fact_grads.append((grad, param))

            fact = AdafactorOptimizer(learning_rate=learning_rate, grad_scale=mpi_scale / hps.cost_scale, sat_infs=True)
            adam = AdamOptimizer(learning_rate=learning_rate, grad_scale=mpi_scale / hps.cost_scale, sat_infs=True)
            train_op = tf.group(fact.apply_gradients(fact_grads), adam.apply_gradients(adam_grads))

        # update global step after we're done using it for this update
        with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
            update_op = tf.assign_add(global_step, 1.0)

    return loss, tf.group(train_op, update_op)
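# Why the loss is multiplied by hps.cost_scale above: a minimal sketch of fp16
# loss (cost) scaling in plain numpy, for illustration only. Gradients of a
# small loss can underflow fp16's ~6e-8 subnormal floor; scaling the loss up
# scales every gradient by the same factor, and the optimizers' grad_scale =
# mpi_scale / hps.cost_scale divides that factor back out at update time.
import numpy as np

cost_scale = 2.0 ** 16
tiny_grad  = np.float32(1e-9)                      # true gradient value
assert np.float16(tiny_grad) == 0.0                # underflows to zero in fp16
scaled     = np.float16(tiny_grad * cost_scale)    # survives in fp16
restored   = np.float32(scaled) * (1.0 / cost_scale)
print(restored)                                    # ~1e-9, recovered at update time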
def model(X, Y, hps):

    # tf Variable of random ints of size (3 * GPU_SMs * 1024)
    # tf doesn't support int32 variables? Hack around it with a float32 view.
    entropy_init = np.random.randint(-(1 << 31), (1 << 31), size=80*3*1024, dtype=np.int32).view(np.float32)

    if hps.tag != "none":
        qspec_e4f11 = QuantizeSpec(
            ebits      = 4,
            fbits      = 11,
            stochastic = 2,
            denorm     = True,
            frequency  = 512,
            bias_pad   = 1,
            logfile    = "qspec_e4f11.%s.b.txt" % hps.tag,
        )
        qspec_e5f10 = QuantizeSpec(
            ebits      = 5,
            fbits      = 10,
            stochastic = 2,
            denorm     = True,
            frequency  = 512,
            bias_pad   = 4,
            logfile    = "qspec_e5f10.%s.b.txt" % hps.tag,
        )
    else:
        qspec_e4f11 = None
        qspec_e5f10 = None

    xs = tf.split(X, mpi_size, 0)
    ys = tf.split(Y, mpi_size, 0)

    with tf.device("/gpu:0"), tf.variable_scope("model"):

        entropy = tf.get_variable("entropy", initializer=entropy_init, trainable=False)
        set_entropy(entropy)

        h = embed_input(xs[mpi_rank], hps)
        for l in range(hps.n_layer):
            h = transformer_block(h, 'layer_%d' % l, hps.n_head)
        logits = output(h, hps)

        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ys[mpi_rank])
        loss = tf.reduce_mean(loss)

        params = tf.trainable_variables()
        grads  = tf.gradients(loss, params)

        # for p in params:
        #     print(p.op.name + "_" + "_".join(str(x) for x in p.shape.as_list()))

        # count of correct top-1 predictions, for test accuracy
        test = tf.reduce_sum(tf.cast(tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), ys[mpi_rank]), tf.float32))

        grad_scale = 1.0 / mpi_size

        # all reduce grads
        if mpi_size > 1:
            group_allreduce(grads, params,
                            search_strings=["classifier"] + ["layer_%d" % l for l in range(hps.n_layer-1, -1, -1)])

            loss = allreduce(loss) * grad_scale
            test = allreduce(test)

        train = Adam(grads, params, grad_scale=grad_scale,
                     param_qspec=qspec_e4f11, mean_qspec=qspec_e5f10, var_qspec=qspec_e5f10)

    return loss, train, test
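# A standalone sketch of the float32-view hack above (illustrative): viewing an
# int32 buffer as float32 relabels the same bits without converting any values,
# so the random entropy ints round-trip exactly through a float32 tf.Variable.
import numpy as np

ints   = np.random.randint(-(1 << 31), (1 << 31), size=8, dtype=np.int32)
floats = ints.view(np.float32)                        # same memory, new dtype label
assert np.array_equal(floats.view(np.int32), ints)    # bit-exact round trip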
def model(xs, ys, cost_scale, grad_scale):

    with tf.variable_scope("model"):
        with tf.device("/cpu:0"):
            global_step   = tf.Variable(1.0, trainable=False)
            learning_rate = tf.minimum(global_step * tf.constant(1.0 / hps.warmup_iters), tf.constant(1.0)) * tf.constant(hps.lr)

        with tf.device("/gpu:0"):

            # Scope/var_name substrings used to group gradients for all-reduce.
            # You'll want groupings that tensorflow schedules uniquely, otherwise the
            # allreduce can hang. The groups should be listed in the order the
            # all-reduces are called; any gradients not matching a substring are
            # appended to the last group.
            grad_groups = []

            # embed discrete inputs to continuous space and add learned position embeddings
            with tf.variable_scope('embed'):
                x_embed = fp16(tf.get_variable(
                    "x", [hps.n_vocab, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.02)))
                pos_embed = fp16(tf.get_variable(
                    'pos', [1, hps.n_timesteps, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.01)))
                h = embedding_lookup(x_embed, xs) + pos_embed
            grad_groups.insert(0, 'embed')

            for l in range(hps.n_layer):
                layer_name = 'layer_%d' % l
                h = transformer_block(h, layer_name, hps.n_head, hps.n_timesteps)
                grad_groups.insert(0, layer_name)

            # project transformer features to vocab logits with the tied input embedding
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            # labels = tf.reshape(ys, [-1])
            # loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=fp32(logits), labels=tf.cast(labels, tf.int32))
            loss = softmax_cross_entropy(logits=logits, labels=ys)
            loss = tf.reduce_mean(loss)

            params = tf.trainable_variables()

            # use scale_tensor so we can keep cost_scale as a host side placeholder
            grads = tf.gradients(scale_tensor(loss, cost_scale), params)

            if mpi_size > 1:
                loss = allreduce(loss) * tf.constant(1.0 / mpi_size)
                group_allreduce(grads, params, search_strings=grad_groups)

            global_norm, norm_scale = ClipGlobalNorm(grads, grad_scale=grad_scale, clip_norm=hps.clip_norm)

            # for tuning fp16 cost scaling
            if hps.log_stats and mpi_rank == 0:
                for i, (grad, param) in enumerate(zip(grads, params)):
                    name = param.op.name + "_" + "_".join(str(x) for x in param.shape.as_list())
                    grads[i] = log_stats(grad, tf.cast(global_step, tf.int32),
                                         logfile="scale_stats.txt", name=name)

            # use adafactor for most params and adam for embeddings
            fact_grads = list()
            adam_grads = list()
            for grad, param in zip(grads, params):
                if "embed" in param.op.name:
                    # For the input embedding, we could update the param + running stats only
                    # when an embedding vector was selected by the input (more stable learning
                    # for rarely used embedding entries). But since x_embed doubles as the
                    # output logits projection, there's little value in lazy updates here.
                    # if "x" in param.op.name:
                    #     grad.lazy = True
                    adam_grads.append((grad, param))
                else:
                    fact_grads.append((grad, param))

            fact = AdafactorOptimizer(learning_rate=learning_rate, norm_scale=norm_scale, grad_scale=grad_scale)
            adam = AdamOptimizer(learning_rate=learning_rate, norm_scale=norm_scale, grad_scale=grad_scale)
            train_op = tf.group(fact.apply_gradients(fact_grads), adam.apply_gradients(adam_grads))

        # update global step after we're done using it for this update
        with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
            update_op = tf.assign_add(global_step, 1.0)

    return loss, tf.group(train_op, update_op), global_norm, norm_scale
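# Sketch of what ClipGlobalNorm is assumed to compute: standard global-norm
# clipping, written in plain numpy for illustration (the real op presumably
# also folds grad_scale in to undo the fp16 cost scaling before measuring the
# norm, which is what this sketch does).
import numpy as np

def clip_global_norm(grads, clip_norm, grad_scale=1.0):
    # global L2 norm over all (rescaled) gradients
    global_norm = np.sqrt(sum(np.sum((grad_scale * g) ** 2) for g in grads))
    # norm_scale < 1 only when the norm exceeds clip_norm; the optimizers then
    # multiply each gradient by norm_scale before applying the update
    norm_scale = clip_norm / max(global_norm, clip_norm)
    return global_norm, norm_scale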