def quantize_post(x, name, tag):
    """Quantize `x` with the e6f7 forward / e5f2 backward specs.

    A tag of "none" disables quantization and returns `x` unchanged.
    Only MPI rank 0 attaches logfiles to its specs, so quantization
    statistics are written once rather than by every worker.
    """
    if tag == "none":
        return x

    if mpi_rank == 0:
        # Rank 0 clones the shared specs with per-tag logfiles attached.
        fwd_spec = QuantizeSpec(copy=qspec_e6f7, logfile="qspec_e6f07.f.%s.txt" % tag)
        bwd_spec = QuantizeSpec(copy=qspec_e5f2, logfile="qspec_e5f02.b.%s.txt" % tag)
    else:
        # All other ranks use the shared, non-logging specs directly.
        fwd_spec, bwd_spec = qspec_e6f7, qspec_e5f2

    return quantize(x, fwd_spec, bwd_spec, name=name)
def quantize_pre(x, name, tag):
    """Quantize `x` with the e4f3 forward / e6f7 backward specs.

    A tag of "none" disables quantization and returns `x` unchanged.
    Only MPI rank 0 attaches logfiles to its specs, so quantization
    statistics are written once rather than by every worker.
    """
    if tag == "none":
        return x

    if mpi_rank == 0:
        # Rank 0 clones the shared specs with per-tag logfiles attached.
        fwd_spec = QuantizeSpec(copy=qspec_e4f3, logfile="qspec_e4f03.f.%s.txt" % tag)
        bwd_spec = QuantizeSpec(copy=qspec_e6f7, logfile="qspec_e6f07.b.%s.txt" % tag)
    else:
        # All other ranks use the shared, non-logging specs directly.
        fwd_spec, bwd_spec = qspec_e4f3, qspec_e6f7

    return quantize(x, fwd_spec, bwd_spec, name=name)
def model(X, Y, hps):
    """Build this rank's transformer graph; returns (loss, train_op, test).

    X, Y hold the global batch; each MPI rank trains on its own slice.
    `test` is the count of correct argmax predictions on this rank's slice
    (summed across ranks when mpi_size > 1).
    """
    # tf Variable of random ints of size (3 * GPU_SMs * 1024)
    # tf doesn't support int32 variables? Hack with float32 view.
    entropy_init = np.random.randint(-(1<<31), (1<<31), size=80*3*1024, dtype=np.int32).view(np.float32)

    if hps.tag != "none":
        # Quantization specs for the Adam optimizer state, with per-tag
        # logfiles (param_qspec / mean_qspec / var_qspec below).
        qspec_e4f11 = QuantizeSpec(
            ebits      = 4,
            fbits      = 11,
            stochastic = 2,
            denorm     = True,
            frequency  = 512,
            bias_pad   = 1,
            logfile    = "qspec_e4f11.%s.b.txt" % hps.tag,
        )
        qspec_e5f10 = QuantizeSpec(
            ebits      = 5,
            fbits      = 10,
            stochastic = 2,
            denorm     = True,
            frequency  = 512,
            bias_pad   = 4,
            logfile    = "qspec_e5f10.%s.b.txt" % hps.tag,
        )
    else:
        # No tag: optimizer state stays unquantized.
        qspec_e4f11 = None
        qspec_e5f10 = None

    # Split the global batch along dim 0; rank i consumes xs[i] / ys[i].
    xs = tf.split(X, mpi_size, 0)
    ys = tf.split(Y, mpi_size, 0)

    with tf.device("/gpu:0"), tf.variable_scope("model"):

        # Persistent random-entropy buffer for stochastic rounding; registered
        # via set_entropy before any quantized ops are built — presumably the
        # quantize kernels read it (TODO confirm against blocksparse docs).
        entropy = tf.get_variable("entropy", initializer=entropy_init, trainable=False)
        set_entropy(entropy)

        h = embed_input(xs[mpi_rank], hps)
        for l in range(hps.n_layer):
            h = transformer_block(h, 'layer_%d' % l, hps.n_head)

        logits = output(h, hps)

        # Mean cross-entropy over this rank's examples.
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ys[mpi_rank])
        loss = tf.reduce_mean(loss)

        params = tf.trainable_variables()
        grads  = tf.gradients(loss, params)

        # for p in params:
        #     print(p.op.name + "_" + "_".join(str(x) for x in p.shape.as_list()))

        # Number of correct predictions (argmax == label) on this rank's slice.
        test = tf.reduce_sum(tf.cast(tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), ys[mpi_rank]), tf.float32))

        # Summed gradients are rescaled to an average across ranks.
        grad_scale = 1.0 / mpi_size

        # all reduce grads
        if mpi_size > 1:
            # Grouping order (classifier first, then layers last-to-first)
            # matches the order gradients become available during backprop.
            group_allreduce(grads, params, search_strings=["classifier"] + ["layer_%d" % l for l in range(hps.n_layer-1, -1, -1)])

            loss = allreduce(loss) * grad_scale
            test = allreduce(test)

        # Adam applies the (already summed) grads with grad_scale, quantizing
        # params/moments when the qspecs are not None.
        train = Adam(grads, params, grad_scale=grad_scale, param_qspec=qspec_e4f11, mean_qspec=qspec_e5f10, var_qspec=qspec_e5f10)

    return loss, train, test
from tqdm import tqdm from mpi4py import MPI from tensorflow.examples.tutorials.mnist import input_data from blocksparse.transformer import transpose_0213, masked_softmax from blocksparse.norms import layer_norm from blocksparse.optimize import Adam from blocksparse.embed import embedding_lookup from blocksparse.quantize import QuantizeSpec, quantize, set_entropy from blocksparse.ewops import bias_relu from blocksparse.nccl import allreduce, group_allreduce, sync_variables_op qspec_e4f3 = QuantizeSpec( ebits = 4, fbits = 3, denorm = True, frequency = 512, bias_pad = 1, ) qspec_e5f2 = QuantizeSpec( ebits = 5, fbits = 2, stochastic = 2, denorm = True, frequency = 512, bias_pad = 8, ) qspec_e6f7 = QuantizeSpec( ebits = 6, fbits = 7, stochastic = 0,