def testBiasRelu(self):
    config = tf.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1)

    with self.test_session(config=config) as sess:
        for shape in shapes:
            # shape[0] //= 24
            # shape[0] *= 512

            if ones:
                cpuX = np.ones(shape, dtype=np.float32)
                cpuE = np.ones(shape, dtype=np.float32)
                cpuB = np.ones(shape[1:], dtype=np.float32)
            else:
                cpuX = np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)
                cpuE = np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)
                cpuB = np.random.uniform(-1.0, 1.0, shape[1:]).astype(np.float32)

            for relu in (True, False):
                for dtype in (tf.float32, ):  # tf.float16, tf.bfloat16

                    results = []
                    for device in ("gpu", "cpu"):
                        if bench and device == "cpu":
                            break

                        cast = device == "gpu" and dtype is not tf.float32

                        with tf.device("/%s:0" % device), tf.name_scope(device):

                            x = tf.placeholder(tf.float32, cpuX.shape)
                            e = tf.placeholder(tf.float32, cpuE.shape)
                            b = tf.placeholder(tf.float32, cpuB.shape)

                            feed_dict = {x: cpuX, e: cpuE, b: cpuB}

                            xc = ew.float_cast(x, dtype=dtype) if cast else x

                            y = ew.bias_relu(xc, b, relu=relu, atomics=atomics, bench=bench)

                            if cast:
                                y = ew.float_cast(y, dtype=tf.float32)

                            dx, db = tf.gradients(y, [x, b], e)

                            results.append(sess.run([y, dx, db], feed_dict))

                    if not bench:
                        for op, dev, cpu in zip(["y", "dx", "db"], results[0], results[1]):

                            dif     = np.abs(cpu - dev)
                            avgval  = np.average(abs(cpu))
                            maxdif  = dif.max()
                            max_err = maxdif if avgval == 0 else maxdif / avgval
                            l2_err  = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum())

                            print("%s, shape:%14s, op:%3s(%d), err:%17.12f, l2_err:%17.12f" % (dtype.name, str(cpu.shape), op, relu, maxdif, l2_err))
def group_allreduce(grads, parms, search_strings=None, cast_map=None, cast_all=None, num_comms=2, prereduce=0):

    # if no grouping specified, create one group to reduce at the end (no overlap with compute)
    if search_strings is None:
        search_strings = ["group_allreduce_all"]

    groups = [(name, list(), list()) for name in search_strings]

    for i, (grad, param) in enumerate(zip(grads, parms)):
        for name, group16, group32 in groups:
            if name == search_strings[-1] or name in param.name:

                if cast_all is not None:
                    grad = float_cast(grad, dtype=cast_all)
                elif cast_map is not None and name in cast_map:
                    grad = float_cast(grad, dtype=cast_map[name])

                if grad.dtype.base_dtype is tf.float16:
                    group16.append((i, grad, param))
                else:
                    group32.append((i, grad, param))
                break

    for name, group16, group32 in groups:
        count = 0
        for group in (group16, group32):
            count += len(group)
            if len(group) > 0:

                if len(group) == 1:
                    concated = group[0][1]
                else:
                    concated = tf.concat([tf.reshape(grad, [-1]) for _, grad, _ in group], 0, name="concat_" + name)

                reduced = allreduce(concated, num_comms=num_comms, prereduce=prereduce)

                if len(group) == 1:
                    grads[group[0][0]] = reduced
                else:
                    offset = 0
                    for i, grad, param in group:
                        size = param.shape.num_elements()
                        grads[i] = tf.reshape(reduced[offset:offset + size], param.shape)
                        offset += size

        if count == 0:
            print("Warning: no grads found for all_reduce group: ", name)
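# Hedged usage sketch for group_allreduce (not part of the original file).
# `loss` and `opt` are assumed to exist in the surrounding training script;
# only the group_allreduce signature defined above is taken from this source.
def _example_group_allreduce(loss, opt):
    params = tf.trainable_variables()
    grads  = tf.gradients(loss, params)

    # Grads whose parameter name contains "embed" form one group (reduced in fp16);
    # the last search string acts as a catch-all for every remaining gradient.
    # group_allreduce rewrites `grads` in place with the reduced tensors.
    group_allreduce(grads, params,
                    search_strings=["embed", "all_remaining"],
                    cast_map={"embed": tf.float16},
                    num_comms=2)

    return opt.apply_gradients(zip(grads, params))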
def testBlocksparseSoftmax(self): with self.test_session(config=config) as sess, tf.device("/gpu:0"): for bsize in ( 8, 16, 32, 64, ): # 16, 32, 64 # define outer block structure for blocksparse matmul layout = np.ones([1, ctx, ctx], dtype=np.bool) for q, k in np.ndindex(ctx, ctx): if k > q: layout[:, q, k] = 0 #print(layout[0]) bst = trans.BlocksparseTransformer(layout, heads=heads, block_size=bsize, mask_callback=mask_callback) shape = (batch, heads, bst.blocks, bsize, bsize) if ones: cpuX = np.ones(shape, dtype=np.float32) cpuE = np.ones(shape, dtype=np.float32) else: cpuX = np.random.uniform(-1.0, 1.0, shape).astype( np.float16).astype(np.float32) cpuE = np.random.uniform(-1.0, 1.0, shape).astype( np.float16).astype(np.float32) x = tf.placeholder(tf.float32, cpuX.shape) e = tf.placeholder(tf.float32, cpuE.shape) feed_dict = {x: cpuX, e: cpuE} xf = ew.float_cast(x, dtype=tf.bfloat16) y = bst.masked_softmax(xf, scale=scale) y = ew.float_cast(y, dtype=tf.float32) dx, = tf.gradients(y, [x], e) y, dx = sess.run([y, dx], feed_dict) Y = bst.masked_softmax_test(cpuX, scale=scale) DX = bst.masked_softmax_grad_test(cpuE, Y, scale=scale) print("testBlocksparseSoftmax", bsize) for op, dev, cpu in [ [" Y", y, Y], ["DX", dx, DX], ]: self.compare_results(op, dev, cpu)
def testMatMul(self): config = tf.ConfigProto( intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) with self.test_session(config=config) as sess: for shape in shapes: np.random.seed(int(time())) cpuX = np.random.normal(loc=0.1, scale=1.0, size=shape).astype(np.float16).astype(np.float32) cpuE = np.random.normal(loc=0.2, scale=1.0, size=shape).astype(np.float16).astype(np.float32) cpuU = np.dot(cpuX.astype(np.float64).T, cpuE.astype(np.float64)).astype(np.float32) for dtype in (tf.float32, tf.float16): #tf.float16, tf.bfloat16 with tf.device("/gpu:0"): x = tf.placeholder(tf.float32, cpuX.shape, name="x") e = tf.placeholder(tf.float32, cpuE.shape, name="e") feed_dict = { x : cpuX, e : cpuE } if dtype is not tf.float32: xf = ew.float_cast(x, dtype=dtype) ef = ew.float_cast(e, dtype=dtype) else: xf, ef = x, e u0 = dw_matmul_large_n(xf, ef) u1 = tf.matmul(xf, ef, transpose_a=True, transpose_b=False) if dtype is not tf.float32: u1 = ew.float_cast(u1, dtype=tf.float32, dx_dtype=dtype) u0, u1 = sess.run( [ u0, u1 ], feed_dict ) for op, dev, cpu in [ ("custom", u0, cpuU), ("cublas", u1, cpuU), ]: dif = np.abs(cpu - dev) avgval = np.average(abs(cpu)) maxdif = dif.max() max_err = maxdif if avgval == 0 else maxdif / avgval l2_err = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum()) print("%s, depth:%8d shape:%12s, op:%s, err:%17.12f, l2_err:%17.12f" % (dtype.name, shape[0], str(cpu.shape), op, maxdif, l2_err))
def testBiasRelu(self): config = tf.ConfigProto( intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) with self.test_session(config=config) as sess: for shapeX in shapes: axis = len(shapeX)-2 shapeY = list(shapeX) shapeY[axis] = 1 np.random.seed(int(time())) cpuX = np.random.uniform(-2**14, 2**14, shapeX).astype(np.float16).astype(np.float32) cpuE = np.random.uniform(-2**14, 2**14, shapeY).astype(np.float16).astype(np.float32) for dtype in (tf.float16, ): #tf.float16, tf.float32 results = [] for device in ("gpu", "cpu"): cast = device == "gpu" and dtype is not tf.float32 with tf.device("/%s:0" % device), tf.name_scope(device): x = tf.placeholder(tf.float32, cpuX.shape, name="x") e = tf.placeholder(tf.float32, cpuE.shape, name="e") feed_dict = { x : cpuX, e : cpuE } xf = ew.float_cast(x, dtype=dtype) if cast else x y = ew.reduce_max(xf, axis=axis, keepdims=True) if cast: y = ew.float_cast(y, dtype=tf.float32) dx, = tf.gradients(y, [x], e) results.append( sess.run( [ y, dx ], feed_dict ) ) for op, dev, cpu in zip(["y", "dx"], results[0], results[1]): dif = np.abs(cpu - dev) sum_err = (dif > .01).sum() pct_err = 100*sum_err / cpu.size l2_err = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum()) print("%s, shape:%22s, op:%3s, sum_err: %4d, pct_err: %.4f, l2_err:%17.12f" % (dtype.name, str(cpu.shape), op, sum_err, pct_err, l2_err))
def grouped_lstm(inputs, width, timesteps, initial_state, scope="grouped_lstm", reuse=None, lstm_id=0, layernorm=True):

    fp16 = inputs.dtype is tf.float16

    if layernorm:
        from blocksparse.norms import layer_norm
    if fp16:
        from blocksparse.ewops import float_cast

    in_width = inputs.shape[-1].value

    with tf.variable_scope(scope, reuse=reuse):

        w = tf.get_variable('kernel', shape=[in_width + width, 4 * width])
        b = tf.get_variable('bias',   shape=[4 * width])
        if layernorm:
            g = tf.get_variable('gain', shape=[4 * width])

        c, h = initial_state

        if fp16:
            w = float_cast(w, dtype=tf.float16, dx_dtype=tf.float16)

        if timesteps > 1:
            inputs = [tf.squeeze(x) for x in tf.split(inputs, timesteps, axis=1)]
        else:
            inputs = [tf.reshape(inputs, [-1, inputs.shape[-1].value])]

        outputs = []
        for t, x in enumerate(inputs):

            h = tf.matmul(tf.concat([x, h], 1), w, name="lstm_%02d/step_%04d" % (lstm_id, t))

            if layernorm:
                h = layer_norm(h, g, b, axis=1, segments=4)
                c, h = fused_lstm_gates(c, h, forget_bias=1.0)
            else:
                c, h = fused_lstm_gates(c, h, bias=b, forget_bias=1.0)

            outputs.append(h)

        output = tf.stack(outputs, axis=1)

    return output, [c, h]
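# Hedged usage sketch for grouped_lstm (not part of the original file). Sizes
# and placeholder names are illustrative assumptions; only the grouped_lstm
# signature defined above is taken from this source.
def _example_grouped_lstm():
    batch, timesteps, in_width, width = 32, 16, 512, 1024

    x  = tf.placeholder(tf.float32, [batch, timesteps, in_width])
    c0 = tf.zeros([batch, width])
    h0 = tf.zeros([batch, width])

    # outputs: [batch, timesteps, width]; (c, h) is the final cell/hidden state
    outputs, (c, h) = grouped_lstm(x, width, timesteps, [c0, h0],
                                   scope="lstm_0", lstm_id=0, layernorm=True)
    return outputs, c, h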
def forward(self, inputs, ema=None): hps = self.hps bsmm = hps.bsmm xgroup = hps.x_group_size xgroups = len(inputs) // xgroup sproj = hps.sproj_out self.inputs = inputs if sproj is not None: inputs = [sproj.gather(h) for h in inputs] with tf.variable_scope(self.scope): w = hps.get_variable("w", bsmm["y"].w_shape, normal_initializer()) g = hps.get_variable("g", [hps.nvocab], ones_initializer()) b = hps.get_variable("b", [hps.nvocab], zeros_initializer()) self.params = [w, g, b] if ema is not None: w = ema.average(w) g = ema.average(g) b = ema.average(b) #w = ew.float_cast(w, dtype=hps.dtype) w = bsmm["y"].l2_normalize(w, dtype=hps.dtype) # compute the fc matmul in groups for better memory efficiency. ygroups = [] for i in range(xgroups): x = tf.concat(inputs[i * xgroup:(i + 1) * xgroup], 1 - hps.axis) # (nsteps x nbatch, nvocab) = (nsteps x nbatch, hidden) . (nhidden, nvocab) ygroups.append(bsmm["y"](x, w, dw_dtype=hps.dw_dtype)) y = tf.concat(ygroups, 1 - hps.axis) # cast to float32 before entering cost function y = ew.float_cast(y, dtype=tf.float32, dx_dtype=hps.dx_dtype) if hps.axis == 0: y = tf.transpose(y) if (hps.nvocab % 32) != 0: y = tf.slice(y, [0, 0], [-1, hps.nvocab]) self.outputs = y * g + b outputs = tf.stop_gradient(self.outputs) return outputs
def print_act_stats(x, _str="", flatten=False):
    if False:
        return x
    _x = ew.float_cast(x, dtype=tf.float32)
    if flatten:
        _x = tf.reshape(_x, [-1])
    if len(_x.get_shape()) == 1:
        x_mean, x_var = tf.nn.moments(_x, [0], keep_dims=True)
    if len(_x.get_shape()) == 2:
        x_mean, x_var = tf.nn.moments(_x, [0], keep_dims=True)
    if len(_x.get_shape()) == 4:
        x_mean, x_var = tf.nn.moments(_x, [0, 2, 3], keep_dims=True)
    stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
             tf.reduce_min(tf.sqrt(x_var)), tf.reduce_mean(tf.sqrt(x_var)), tf.reduce_max(tf.sqrt(x_var))]
    __str = "[" + _str + "] " + x.name
    print(__str)
    return tf.Print(x, stats, __str)
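# Hedged usage sketch (not part of the original file): wrap an activation so
# that the min/mean/max of its per-feature mean and std are printed when the
# graph runs. The matmul and tensor names below are illustrative.
def _example_print_act_stats(x, w):
    h = tf.matmul(x, w)
    h = print_act_stats(h, "pre-gate h")
    return tf.nn.relu(h)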
def forward(self, x, ema=None): hps = self.hps assert hps.nsteps % hps.x_group_size == 0 xgroups = hps.nsteps // hps.x_group_size with tf.variable_scope(self.scope): w = hps.get_variable("w", [hps.nvocab, hps.nembd], ortho_initializer()) g = hps.get_variable("g", [hps.nvocab, 1], ones_initializer()) self.params = [w, g] if ema is not None: w = ema.average(w) g = ema.average(g) w = tf.nn.l2_normalize(w, dim=1) * g # x (nsteps, nbatch) # w (nvocab, nembd) # o (nsteps, nbatch, nembd) words = tf.nn.embedding_lookup(w, x) if self.train and hps.dropout > 0 and hps.dropout_input > 0: words = tf.nn.dropout(words, 1. - hps.dropout, [hps.nsteps, hps.batch_size, 1]) # potentially down cast to fp16 to save memory and speed things up #words = ew.float_cast(words, dtype=hps.dtype) # (x_group_size x nbatch, nembd) * xgroups outputs = [ tf.reshape(x, [-1, hps.nembd]) for x in tf.split(words, xgroups, 0) ] if hps.axis == 0: outputs = [tf.transpose(x) for x in outputs] self.outputs = [ew.float_cast(x, dtype=hps.dtype) for x in outputs] outputs = [tf.stop_gradient(x) for x in self.outputs] return outputs
def apply(self, params, qspec=None):

    with tf.device("/gpu:0"), tf.control_dependencies(None):

        for param in params:

            if self.fp16 == 2 or (self.fp16 and is_param_casted(param)):
                # only use fp16 for params that are explicitly cast to fp16 before use
                init  = float_cast(param.initialized_value(), dtype=tf.float16)
                dtype = tf.float16
            else:
                init  = param.initialized_value()
                dtype = tf.float32

            with tf.variable_scope(None, param.op.name + "/" + self.name):
                # use the Identity read op output as the key
                # this lets us lookup ema vars by Cast op outputs
                self.averages[param.value()] = tf.get_variable("ema", dtype=dtype, initializer=init, trainable=False)

            ops.add_to_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES, param)

        ema_ops = []
        for param in params:

            ema  = self.averages[param.value()]
            gate = getattr(param, "gate", None)
            gate = [gate] if self.gated and gate is not None else []

            op = ema_op(ema, param, gate, decay=self.decay)

            if qspec is not None:
                ema_ops.append(ema.assign(quantize(op, qspec, name="ema_" + param.op.name)))
            else:
                ema_ops.append(op)

        return tf.group(*ema_ops)
def forward(self, inputs, states, ema=None): hps = self.hps bsmm = hps.bsmm with tf.variable_scope(self.scope) as scope: self.param_names = ['xi', 'xf', 'xo', 'xu', 'hi', 'hf', 'ho', 'hu'] self.params = dict() for p in self.param_names: if 'x' in p: bsmm_p, size = (bsmm.x, hps.nproj_in) elif 'h' in p: bsmm_p, size = (bsmm.h, hps.nhidden) b_init = ones_initializer( hps.forget_bias) if p == 'hf' else zeros_initializer() w = hps.get_variable("w_" + p, bsmm_p.w_shape, bsmm_p.identity_init()) g = hps.get_variable("g_" + p, [size], ones_initializer()) b = hps.get_variable("b_" + p, [size], b_init) if ema is not None: w = ema.average(w) g = ema.average(g) b = ema.average(b) wc = ew.float_cast(w, dtype=hps.dtype) self.params[p] = (wc, g, b, w) c, h = tf.unstack(states, num=2) c = ew.float_cast(c, dtype=hps.dtype) h = ew.float_cast(h, dtype=hps.dtype) xi_w, xi_g, xi_b = self.params["xi"][0:3] xf_w, xf_g, xf_b = self.params["xf"][0:3] xo_w, xo_g, xo_b = self.params["xo"][0:3] xu_w, xu_g, xu_b = self.params["xu"][0:3] self.inputs = inputs self.outputs = [] self.segments = [] for xgroup in inputs: if hps.recompute and self.train: # We compute gradient one segment at a time, so prevent tf.gradients from going too far. # We also want to add control inputs to the start of the segment so having wrappers # around the segment inputs is handy. seg = [(tf.stop_gradient(c), tf.stop_gradient(h))] self.segments.append(seg) # delay input expansion to just prior to use (saves memory) with tf.control_dependencies([h]): xwi = bsmm.x(xgroup, xi_w, dw_dtype=hps.dw_dtype) xwf = bsmm.x(xgroup, xf_w, dw_dtype=hps.dw_dtype) xwo = bsmm.x(xgroup, xo_w, dw_dtype=hps.dw_dtype) xwu = bsmm.x(xgroup, xu_w, dw_dtype=hps.dw_dtype) xwi = tf.split(xwi, hps.x_group_size, 1 - hps.axis) xwf = tf.split(xwf, hps.x_group_size, 1 - hps.axis) xwo = tf.split(xwo, hps.x_group_size, 1 - hps.axis) xwu = tf.split(xwu, hps.x_group_size, 1 - hps.axis) masks = [] for xi, xf, xo, xu in zip(xwi, xwf, xwo, xwu): xi = layer_norm(xi, xi_g, xi_b, axis=hps.axis) xf = layer_norm(xf, xf_g, xf_b, axis=hps.axis) xo = layer_norm(xo, xo_g, xo_b, axis=hps.axis) xu = layer_norm(xu, xu_g, xu_b, axis=hps.axis) c, h, mask = self.cell(c, h, xi, xf, xo, xu) _masks = [mask] for _ in range(1, hps.lsteps): c, h, mask = self.cell(c, h, None, None, None, None) _masks.append(mask) masks.append(_masks) self.outputs.append(h) if hps.recompute and self.train: with tf.name_scope("f_seg_%04d_%d" % (len(self.segments) - 1, len(seg) - 1)): c_seg, h_seg = seg[0] with tf.control_dependencies([h_seg]): xwi = bsmm.x(xgroup, xi_w, dw_dtype=hps.dw_dtype) xwf = bsmm.x(xgroup, xf_w, dw_dtype=hps.dw_dtype) xwo = bsmm.x(xgroup, xo_w, dw_dtype=hps.dw_dtype) xwu = bsmm.x(xgroup, xu_w, dw_dtype=hps.dw_dtype) xwi = tf.split(xwi, hps.x_group_size, 1 - hps.axis) xwf = tf.split(xwf, hps.x_group_size, 1 - hps.axis) xwo = tf.split(xwo, hps.x_group_size, 1 - hps.axis) xwu = tf.split(xwu, hps.x_group_size, 1 - hps.axis) for xi, xf, xo, xu, mask in zip( xwi, xwf, xwo, xwu, masks): xi = layer_norm(xi, xi_g, xi_b, axis=hps.axis) xf = layer_norm(xf, xf_g, xf_b, axis=hps.axis) xo = layer_norm(xo, xo_g, xo_b, axis=hps.axis) xu = layer_norm(xu, xu_g, xu_b, axis=hps.axis) c_seg, h_seg, _ = self.cell( c_seg, h_seg, xi, xf, xo, xu, mask[0]) for i in range(1, hps.lsteps): c_seg, h_seg, _ = self.cell( c_seg, h_seg, None, None, None, None, mask[i]) seg.append((c_seg, h_seg)) c = ew.float_cast(c, dtype=tf.float32) h = ew.float_cast(h, dtype=tf.float32) states = tf.stack([c, h], 0) # We calculate the gradient 
internally. # Don't let other layers' gradients flow into here. # This is possible because the last cell has free c and h # params that are populated with zeros in the gradients pass. outputs = [tf.stop_gradient(x) for x in self.outputs] return outputs, states
def testTopK(self): config = tf.ConfigProto( intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) with self.test_session(config=config) as sess, tf.device("/gpu:0"): for shape in shapes: topK = shape[-1] // 4 # 25% sparsity np.random.seed(int(time())) cpuX = np.random.uniform(-1.0, 1.0, shape).astype(np.float32) cpuE = np.random.uniform(-1.0, 1.0, shape).astype(np.float32) X = tf.placeholder(tf.float32, cpuX.shape) E = tf.placeholder(tf.float32, cpuE.shape) for mask_dims in (0, 2, 3): if mask_dims == 0: mask = M = m_shape = None feed_dict = { X: cpuX, E: cpuE } else: m_shape = [1 for n in shape] m_shape[-mask_dims:] = shape[-mask_dims:] mask = np.zeros(m_shape, dtype=np.float32) if mask_dims == 2: for y, x in np.ndindex(mask.shape[-2:]): if x <= y: mask[:,:,y,x] = 3.0 elif mask_dims == 3: for z, y, x in np.ndindex(mask.shape[-3:]): if x <= y: mask[:,z,y,x] = (z+1)*3.0 M = tf.placeholder(tf.float32, mask.shape) feed_dict = { X: cpuX, E: cpuE, M: mask } for dtype in (tf.float32, ): #tf.float16, tf.bfloat16 rtol = 1e-4 if dtype is tf.float32 else 1e-1 Y = ew.float_cast(X, dtype=dtype) #Y = trans.masked_top_k_softmax(Y, topK, mask=M, scale=2.0) Y = trans.masked_softmax(Y, mask=M, scale=2.0, bench=bench) Y = ew.float_cast(Y, dtype=tf.float32, dx_dtype=dtype) D = tf.gradients(Y, [X], E) #devY, = sess.run( [Y], feed_dict) devY, (devDX,) = sess.run( [Y, D], feed_dict) #devY, (devDX,), tfY = sess.run( [Y, D, tf.nn.top_k(X, topK)], feed_dict) # gradient_checker tests are insanely slow # if True: # x = tf.constant(cpuX) # m = tf.constant(mask) # y = trans.masked_top_k_softmax(x, topK, mask=m) # error = gradient_checker.compute_gradient_error(x, shape, y, shape) #, extra_feed_dict={ x: cpuX, m: mask } # assert error < 0.01, error if bench == 0: # cpuY = trans.masked_top_k_softmax_test(cpuX, topK, mask=mask, scale=2.0) # cpuDX = trans.masked_softmax_grad_test(cpuE, cpuY, mask=mask, scale=2.0) cpuY = trans.masked_softmax_test(cpuX, mask=mask, scale=2.0) cpuDX = trans.masked_softmax_grad_test(cpuE, cpuY, mask=mask, scale=2.0) difY = np.abs(cpuY - devY) difDX = np.abs(cpuDX - devDX) cntY = (difY > rtol).astype(np.int).sum() / difY.size cntDX = (difDX > rtol).astype(np.int).sum() / difDX.size print("%s, shape:%18s, mask:%18s, errY:%.5f, errDX:%.5f" % (dtype.name, str(shape), str(m_shape), cntY, cntDX)) if out: np.savetxt( "cpuY.txt", cpuY.reshape(-1,shape[-1]), fmt="%6.3f") np.savetxt( "devY.txt", devY.reshape(-1,shape[-1]), fmt="%6.3f") np.savetxt("cpuDX.txt", cpuDX.reshape(-1,shape[-1]), fmt="%6.3f") np.savetxt("devDX.txt", devDX.reshape(-1,shape[-1]), fmt="%6.3f") np.savetxt("difDX.txt", difDX.reshape(-1,shape[-1]), fmt="%6.3f")
def testCWiseLinear(self): config = tf.ConfigProto( intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) with self.test_session(config=config) as sess: for shape in (shapes): bshape = [1] * len(shape) bshape[1] = shape[1] if ones: cpuX = np.ones(shape, dtype=np.float32) cpuE = np.ones(shape, dtype=np.float32) cpuG = np.ones(bshape, dtype=np.float32) cpuB = np.ones(bshape, dtype=np.float32) else: np.random.seed(int(time())) cpuX = np.random.uniform(-1.0, 1.0, shape).astype(np.float32) cpuE = np.random.uniform(-1.0, 1.0, shape).astype(np.float32) cpuG = np.random.uniform(-1.0, 1.0, bshape).astype(np.float32) cpuB = np.random.uniform(-1.0, 1.0, bshape).astype(np.float32) for dtype in (tf.float32, tf.float16, ): # tf.float32, tf.float16, tf.bfloat16 relus = (True, False) if dtype is tf.float32 else (False,) for relu in relus: results = [] for device in ("gpu", "cpu"): cast = device == "gpu" and dtype is not tf.float32 with tf.device("/%s:0" % device), tf.name_scope(device): x = tf.placeholder(tf.float32, cpuX.shape, name="x") e = tf.placeholder(tf.float32, cpuE.shape, name="e") g = tf.placeholder(tf.float32, cpuG.shape, name="g") b = tf.placeholder(tf.float32, cpuB.shape, name="b") feed_dict = { x : cpuX, e : cpuE, g : cpuG, b : cpuB, } xf = float_cast(x, dtype=dtype) if cast else x y0 = cwise_linear(xf, gain=g, bias=b, relu=relu) y1 = cwise_linear(xf, gain=g, relu=relu) y2 = cwise_linear(xf, bias=b, relu=relu) if cast: y0 = float_cast(y0, dtype=tf.float32) y1 = float_cast(y1, dtype=tf.float32) y2 = float_cast(y2, dtype=tf.float32) dx0, dg0, db0 = tf.gradients(y0, [ x, g, b ], e) dx1, dg1 = tf.gradients(y1, [ x, g ], e) dx2, db2 = tf.gradients(y2, [ x, b ], e) results.append( sess.run( [ y0, y1, y2, dx0, dg0, db0, dx1, dg1, dx2, db2 ], feed_dict ) ) labels = ["y0", "y1", "y2", "dx0", "dg0", "db0", "dx1", "dg1", "dx2", "db2"] for op, dev, cpu in zip(labels, results[0], results[1]): dif = np.abs(cpu - dev) avgval = np.average(abs(cpu)) maxdif = dif.max() max_err = maxdif if avgval == 0 else maxdif / avgval l2_err = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum()) print("%s, shape:%16s, op: %3s, relu:%d, err:%17.12f, l2_err:%17.12f" % (dtype.name, str(cpu.shape), op, int(relu), max_err, l2_err))
def fp32(x):
    return float_cast(x, dtype=tf.float32)
def testEdgeBias(self): config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) with self.test_session(config=config) as sess, tf.device("/gpu:0"): test = 0 for N, K, RS, HW, strides in shapes: test += 1 PQ = [ceil_div(x, std) for x, std in zip(HW, strides)] for layout in ( "NCHW", "NHWC", ): # "NCHW","NHWC" if layout == "NHWC": y_shape = [N] + PQ + [K] x_shape = [N] + HW + [K] w_shape = RS + [K, K] else: y_shape = [N] + [K] + PQ x_shape = [N] + [K] + HW w_shape = [K, K] + RS eb = ConvEdgeBias(y_shape, x_shape, w_shape, strides=strides, data_format=layout) if ones: cpuX = np.ones(y_shape).astype(np.float32) cpuE = np.ones(y_shape).astype(np.float32) cpuG = np.ones(eb.shape).astype(np.float32) cpuB = np.ones(eb.shape).astype(np.float32) else: cpuX = np.random.uniform(-1.0, 1.0, y_shape).astype(np.float32) cpuE = np.random.uniform(-1.0, 1.0, y_shape).astype(np.float32) cpuG = np.random.uniform(-1.0, 1.0, eb.shape).astype(np.float32) cpuB = np.random.uniform(-1.0, 1.0, eb.shape).astype(np.float32) x = tf.placeholder(tf.float32, cpuX.shape) e = tf.placeholder(tf.float32, cpuE.shape) g = tf.placeholder(tf.float32, cpuG.shape) b = tf.placeholder(tf.float32, cpuB.shape) feed_dict = {x: cpuX, e: cpuE, g: cpuG, b: cpuB} for dtype in (tf.float32, ): # tf.float32, tf.float16, tf.bfloat16 xf = ew.float_cast(x, dtype=dtype) y = eb(xf, g, b, bench=bench) y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtype) devY, (devDX, devDG, devDB) = sess.run( [y, tf.gradients(y, [x, g, b], e)], feed_dict) if bench == 0: cpuY = eb.edge_bias_test(cpuX, cpuG, cpuB) cpuDX, cpuDG, cpuDB = eb.edge_bias_grad_test( cpuE, cpuX, cpuG) for op, devT, cpuT in ( (" devY", devY, cpuY), ("devDX", devDX, cpuDX), ("devDG", devDG, cpuDG), ("devDB", devDB, cpuDB), ): devT = np.array(devT) difA = cpuT - devT avgval = abs(cpuT).sum() / cpuT.size maxdif = abs(difA).max() ratio = maxdif / avgval print( "%8s, test:%2d layout: %s op:%s err:%17.12f" % (dtype.name, test, layout, op, ratio))
def atestBlocksparseMatMulGated(self): with self.test_session(config=conf) as sess, tf.device("/gpu:0"): N = 128 K = 8 * 56 * 2 * 4 n = K // 8 m = 30 dtype = tf.bfloat16 repeat = 10000 layout = networkx.generators.barabasi_albert_graph(n, m) layout = networkx.adjacency_matrix(layout).toarray().astype( np.int32) + np.eye(n, dtype=np.int32) layout[0:m, 0:m] = 1 blocks = layout.sum() n = layout.shape[0] print(100 * blocks / n**2) print(layout.sum(axis=0).max()) # layout = np.ones((112,32), dtype=np.int32) bsmm = BlocksparseMatMul(layout, block_size=8, feature_axis=0, name="test") if one: X = np.ones(bsmm.i_shape(N), dtype=np.float32) E = np.ones(bsmm.o_shape(N), dtype=np.float32) W = np.ones(bsmm.w_shape, dtype=np.float32) G = np.ones(bsmm.blocks, dtype=np.float32) else: X = np.random.uniform(-1.0, 1.0, bsmm.i_shape(N)).astype(np.float32) E = np.random.uniform(-1.0, 1.0, bsmm.o_shape(N)).astype(np.float32) W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float32) G = np.random.uniform(0.0, 1.0, bsmm.blocks).astype(np.float32) G = np.ones(bsmm.blocks, dtype=np.float32) # for w, (c, k) in enumerate(bsmm.updat_list): # G[w] = (c & 1) ^ (k & 1) ^ 1 #G[::2] = 0.0 # block = dict() # for w, (c, k) in enumerate(bsmm.updat_list): # block[(c,k)] = w # grid = [] # for c in range(bsmm.CB): # row = [] # for k in range(bsmm.KB): # row.append(G[block[(c,k)]]) # grid.append(row) # for row in grid: # print(row) # exit() x = tf.constant(X) e = tf.constant(E) w = tf.constant(W) g = tf.constant(G) w2 = ew.float_cast(w, dtype=dtype) y = ew.float_cast(x, dtype=dtype) y = bsmm(y, w2, gate=g, bench=repeat) y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtype) d = tf.gradients(y, [x, w], e) y, (dx, dw) = sess.run([y, d]) # gpu kernel doesn't touch zero gate blocks # for b in range(bsmm.blocks): # if G[b] == 0.0: # dw[b,:,:] = 0.0 Y = bsmm.fprop_test(X, W, gate=G) DX = bsmm.bprop_test(E, W, gate=G) DW = bsmm.updat_test(X, E, gate=G) #print(Y.shape, dtype) for op, cpuA, devA in ( (" y:", Y, y), ("dx:", DX, dx), ("dw:", DW, dw), ): difA = abs(cpuA - devA) avgval = np.average(abs(cpuA)) maxdif = difA.max() max_err = maxdif if avgval == 0 else maxdif / avgval l2_err = np.sqrt(np.square(difA).sum()) / np.sqrt( np.square(cpuA).sum() + 1e-12) print("%s max_err%%:%11.8f L2_err: %12.10f" % (op, 100 * max_err, l2_err)) if out: dim = K if op == "dw:" else N np.savetxt("out.txt", difA.reshape((-1, dim)), fmt='%5.1f') np.savetxt("outC.txt", cpuA.reshape((-1, dim)), fmt='%5.1f') np.savetxt("outD.txt", devA.reshape((-1, dim)), fmt='%5.1f') exit()
def testBlocksparseMatMul(self): # layout = np.zeros((2,2), dtype=np.int32) # layout[0,0] = 1 n, m = 56 * 8, 8 layout = networkx.generators.barabasi_albert_graph(n, m) #layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m*2, .5) layout = networkx.adjacency_matrix(layout).toarray().astype( np.int32) + np.eye(n, dtype=np.int32) layout[0:m, 0:m] = 1 #layout[0:60,0:60] = 1 #layout = np.zeros((4,4), dtype=np.int32) #layout = np.ones((28*12,28*12), dtype=np.int32) #layout[0,0] = 1 blocks = layout.sum() n = layout.shape[0] print(100 * blocks / n**2) print(layout.sum(axis=0).max()) #exit() with self.test_session(config=conf) as sess, tf.device("/gpu:0"): for bsize, axis in ( (32, 1), (32, 0), (16, 0), (8, 0), ): # (32,1), (32,0), (16,0), (8,0) bsmm = BlocksparseMatMul(layout, block_size=bsize, feature_axis=axis, name="test") if one: W = np.ones(bsmm.w_shape, dtype=np.float32) #W[:] += np.arange(8, dtype=np.float32).reshape(1,8) else: W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float32) # WW = np.zeros((bsmm.C, bsmm.K), dtype=np.float32) # for w, (c, k) in enumerate(bsmm.updat_list): # WW[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize] = W[w,:,:] w = tf.constant(W) # s1 = sess.run( bsmm.identity_init(gpu=True)(bsmm.w_shape) ) # s2 = bsmm.identity_init(gpu=False)(bsmm.w_shape) # print("identity_init: ", (s1 - s2).max()) for N in (64, ): # 128,64,32,16,1, if one: X = np.ones(bsmm.i_shape(N), dtype=np.float32) E = np.ones(bsmm.o_shape(N), dtype=np.float32) #X[:] += np.arange(8, dtype=np.float32).reshape(8,1) else: X = np.random.uniform( -1.0, 1.0, bsmm.i_shape(N)).astype(np.float32) E = np.random.uniform( -1.0, 1.0, bsmm.o_shape(N)).astype(np.float32) x = tf.constant(X) e = tf.constant(E) for dtF, dtB in dtypes: print("Axis:%d Bsize:%2d N:%d F:%s B:%s Params:%d" % (axis, bsize, N, dtF.name, dtB.name, bsize * bsize * blocks)) # compute in tensorflow if l2norm: w2 = bsmm.l2_normalize(w, dtype=dtF) else: w2 = ew.float_cast(w, dtype=dtF) y = ew.float_cast(x, dtype=dtF) for j in range(depth): repeat = bench if bench and j == depth - 1 else 0 y = bsmm( y, w2, dw_dtype=dtF, bench=repeat ) # (bench and j==depth-1) (bench and j==0) y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtB) if bench: sess.run(y) #y = sess.run( y ) d = tf.gradients(y, [x, w], e, aggregation_method=am) if depth > 1: d[1] = group_param_grads(d[1], 8) y, (dx, dw) = sess.run([y, d]) if not bench: # compute in numpy if l2norm: W2 = bsmm.l2_normalize_test(W) else: W2 = W # YY = np.dot(WW.T, X) # ZZ = np.dot(WW , E) # uu = np.dot( X , E.T) # UU = np.zeros(bsmm.w_shape, dtype=np.float32) # for w, (c, k) in enumerate(bsmm.updat_list): # UU[w,:,:] = uu[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize] Ys = [X] for j in range(depth): Ys.append(bsmm.fprop_test(Ys[-1], W2)) Y = Ys.pop() DW = np.zeros(bsmm.w_shape, dtype=np.float32) DX = E for j in range(depth): DW += bsmm.updat_test(Ys.pop(), DX) DX = bsmm.bprop_test(DX, W2) if l2norm: DW = bsmm.l2_normalize_grad_test(W, DW) for op, cpuA, devA in ( # ("YY:", YY, y), # ("ZZ:", ZZ, dx), # ("UU:", UU, dw), (" y:", Y, y), ("dx:", DX, dx), ("dw:", DW, dw), ): difA = abs(cpuA - devA) avgval = np.average(abs(cpuA)) maxdif = difA.max() max_err = maxdif if avgval == 0 else maxdif / avgval l2_err = np.sqrt( np.square(difA).sum()) / np.sqrt( np.square(cpuA).sum()) #print("max_err: %5.3f, max_val: %7.3f, l1_err: %7.5f, l2_err: %7.5f" % (difO.max(), cpuO.max(), l1_err, l2_err)) print("%s max_err%%:%11.8f L2_err: %12.10f" % (op, 100 * max_err, l2_err)) # rtol = 1e-4 if dtF is tf.float32 else 1e-1 
# self.assertAllClose(devA, cpuA, rtol=rtol, atol=rtol) if out: dim = bsmm.K if op == "dw:" else N np.savetxt("out.txt", difA.reshape((-1, dim)), fmt='%5.1f') np.savetxt("outC.txt", cpuA.reshape((-1, dim)), fmt='%5.1f') np.savetxt("outD.txt", devA.reshape((-1, dim)), fmt='%5.1f') exit() print("")
def testLSTMGates(self): config = tf.ConfigProto( intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) with self.test_session(config=config) as sess: for shape1 in shapes: shape4 = [shape1[0], shape1[1]*4] for dtype in (tf.float32, tf.float16): #tf.float16, tf.bfloat16 np.random.seed(int(time())) cpuC = np.random.uniform(-1.0, 1.0, shape1 ).astype(np.float32) cpuH = np.random.uniform(-1.0, 1.0, shape4 ).astype(np.float32) cpuE = np.random.uniform(-1.0, 1.0, shape1 ).astype(np.float32) cpuB = np.random.uniform(-1.0, 1.0, shape4[1:]).astype(np.float32) cpuG = np.random.uniform(-1.0, 1.0, shape4[1:]).astype(np.float32) results = [] for device in ("gpu", "cpu"): with tf.device("/%s:0" % device), tf.name_scope(device): c = tf.placeholder(tf.float32, cpuC.shape, name="c") h = tf.placeholder(tf.float32, cpuH.shape, name="h") e = tf.placeholder(tf.float32, cpuE.shape, name="e") b = tf.placeholder(tf.float32, cpuB.shape, name="b") g = tf.placeholder(tf.float32, cpuB.shape, name="g") feed_dict = { c : cpuC, h : cpuH, e : cpuE, b : cpuB, g : cpuG, } if device == "gpu" and dtype is not tf.float32: cf = ew.float_cast(c, dtype=dtype) hf = ew.float_cast(h, dtype=dtype) else: cf, hf = c, h if layernorm: hf = norms.layer_norm(hf, g, b, axis=1, segments=4) bias = None else: bias = b cf, hf = lstm.fused_lstm_gates(cf, hf, bias=bias, forget_bias=1.0) if device == "gpu" and dtype is not tf.float32: cf = ew.float_cast(cf, dtype=tf.float32, dx_dtype=dtype) hf = ew.float_cast(hf, dtype=tf.float32, dx_dtype=dtype) if layernorm: dc, dh, dg, db = tf.gradients([cf, hf], [c, h, g, b], [None, e]) results.append( sess.run( [ cf, hf, dc, dh, dg, db ], feed_dict ) ) labels = [" c", " h", "dc", "dh", "dg", "db"] else: dc, dh, db = tf.gradients([cf, hf], [c, h, b], [None, e]) results.append( sess.run( [ cf, hf, dc, dh, db ], feed_dict ) ) labels = [" c", " h", "dc", "dh", "db"] for op, dev, cpu in zip(labels, results[0], results[1]): dif = np.abs(cpu - dev) avgval = np.average(abs(cpu)) maxdif = dif.max() max_err = maxdif if avgval == 0 else maxdif / avgval l2_err = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum()) print("%s, shape:%12s, op:%s, err:%17.12f, l2_err:%17.12f" % (dtype.name, str(cpu.shape), op, maxdif, l2_err))
def testEwOps(self): with self.test_session() as sess, tf.device("/gpu:0"): for shape in ((32, 1024), ): # (31,31*4), (11,1023), (33,33), for dtypeF, dtypeB in ( (np.float16, np.float16), (np.float32, np.float32) ): #, (np.float32, np.float32), (np.float16, np.float16), (np.float16, np.float32), dtypeF = np.dtype(dtypeF) # Forward dtypeB = np.dtype(dtypeB) # Backwards rtol = 1e-4 if dtypeF.type is np.float32 else 1e-1 with tf.name_scope("S%dx%dF%dB%d" % (shape[0], shape[1], dtypeF.itemsize, dtypeB.itemsize)): if ones: np_X = np.ones(shape, dtype=np.float32) np_Y = np.ones(shape, dtype=np.float32) np_E = np.ones(shape, dtype=np.float32) np_B = np.ones((1, shape[1]), dtype=np.float32) else: # np_X = np.random.normal(0.0, 10.0, shape).astype(dtypeF).astype(np.float32) # np_E = np.random.normal(0.0, 10.0, shape).astype(dtypeF).astype(np.float32) # np_X.fill(10.0) np_X = np.random.uniform( 0.01, 1.0, shape).astype(dtypeF).astype(np.float32) np_Y = np.random.uniform( 0.01, 1.0, shape).astype(dtypeF).astype(np.float32) np_E = np.random.uniform( 0.01, 1.0, shape).astype(dtypeB).astype(np.float32) np_B = np.random.uniform( 0.01, 1.0, (1, shape[1])).astype(np.float32) x = tf.constant(np_X.astype(dtypeF)) y = tf.constant(np_Y.astype(dtypeF)) e = tf.constant(np_E.astype(dtypeB)) b = tf.constant(np_B) X = tf.constant(np_X) Y = tf.constant(np_Y) E = tf.constant(np_E) B = tf.constant(np_B) tests = list() # xx = tf.ones(shape, dtype=tf.float32) # ee = tf.ones(shape, dtype=tf.float32) # ew_op1 = ew.dropout(xx, keep_prob=0.5, scale=2.0) # ew_op2 = ew.dropout(xx, mask=ew_op1[1], scale=2.0) # dx_op = tf.gradients(ew_op1[0], [xx], ee) # (z1, m), z2, (dx,) = sess.run( [ew_op1, ew_op2, dx_op] ) # #print(dx[0,0:8]) # print(z1.sum()/z1.size, dx.sum()/dx.size, (z1 - z2).sum(), (z1 - dx).sum()) # z = sess.run( ew.sparse_relu(x) ) # Z = ew.sparse_relu_test(np_X) # tests.append(("sps_relu: Z ", Z, z)) # Non-Broadcast Binary Ops for name, tf_op, ew_op in ( (" add", tf.add, ew.add), (" mul", tf.multiply, ew.multiply), (" sub", tf.subtract, ew.subtract), (" div", tf.divide, ew.divide), (" max", tf.maximum, ew.maximum), (" min", tf.minimum, ew.minimum), ): # I think tf doesn't use fmaxf/fminf and hence has different behaviour for equal numbers. 
# In fp32 the chance for equality is very small, but not so in fp16 if name[-3:] in ("max", "min" ) and dtypeF.type is np.float16: continue tf_op = tf_op(X, Y) ew_op = ew_op(x, y) Z, z = sess.run([tf_op, ew_op]) DX, DY = sess.run(tf.gradients(tf_op, [X, Y], E)) dx, dy = sess.run(tf.gradients(ew_op, [x, y], e)) tests.append((name + ": Z ", Z, z)) tests.append((name + ": DX", DX, dx)) tests.append((name + ": DY", DY, dy)) for name, tf_op, ew_op in ((" add_n", tf.add_n, ew.add_n8_op), ): tf_op2 = tf_op([X, Y]) ew_op2 = ew_op([x, y]) tf_op3 = tf_op([X, Y, E]) ew_op3 = ew_op([x, y, e]) Z2, z2 = sess.run([tf_op2, ew_op2]) Z3, z3 = sess.run([tf_op3, ew_op3]) tests.append((name + ": Z2", Z2, z2)) tests.append((name + ": Z3", Z3, z3)) # Unary Ops for name, tf_op, ew_op in ( (" sig", tf.sigmoid, ew.sigmoid), (" tanh", tf.tanh, ew.tanh), ( " neg", tf.negative, ew.negative, ), ( " rcp", tf.reciprocal, ew.reciprocal, ), ( " sqr", tf.square, ew.square, ), ( " sqrt", tf.sqrt, ew.sqrt, ), ( " exp", tf.exp, ew.exp, ), ( " log", tf.log, ew.log, ), ( " relu", tf.nn.relu, ew.relu, ), ( " elu", tf.nn.elu, ew.elu, ), ( " gelu", gelu, ew.gelu, ), ( " swish", swish, ew.swish, ), ( "fast_gelu", fast_gelu, ew.fast_gelu, ), ): tf_op = tf_op(X) ew_op = ew_op(x) Z, z = sess.run([tf_op, ew_op]) DX, = sess.run(tf.gradients(tf_op, [X], E)) dx, = sess.run(tf.gradients(ew_op, [x], e)) tests.append((name + ": Z ", Z, z)) tests.append((name + ": DX", DX, dx)) # Broadcast Binary Ops for name, tf_op, ew_op in ( ( "bias_add", tf.add, ew.add, ), ("bias_mul", tf.multiply, ew.multiply), ): tf_op = tf_op(X, B) ew_op = ew_op(x, b) Z, z = sess.run([tf_op, ew_op]) DX, DB = sess.run(tf.gradients(tf_op, [X, B], E)) dx, db = sess.run(tf.gradients(ew_op, [x, b], e)) tests.append((name + ": Z ", Z, z)) tests.append((name + ": DX", DX, dx)) tests.append((name + ": DB", DB, db)) # Up Cast ew_op = ew.float_cast(x, dtype=tf.float32, dx_dtype=dtypeB.type) z = sess.run(ew_op) dx, = sess.run(tf.gradients(ew_op, [x], e)) tests.append((" upCast: Z ", np_X, z)) tests.append((" upCast: DX", np_E, dx)) #Down Cast if dtypeF.type is np.float32: Z = np_X.astype(np.float16) DX = np_E.astype(np.float16) e16 = tf.constant(DX) ew_op = ew.float_cast(x, dtype=tf.float16) z = sess.run(ew_op) dx, = sess.run(tf.gradients(ew_op, [x], e16)) tests.append(("downCast: Z ", Z, z)) tests.append(("downCast: DX", DX, dx)) for op, tfT, ewT in (tests): dif = tfT - ewT avgval = abs(tfT).sum() / tfT.size maxdif = abs(dif).max() ratio = maxdif / avgval print( "dtypeF:f%d, dtypeB:f%d, shape:%s, op:%s err:%17.12f" % (dtypeF.itemsize, dtypeB.itemsize, str(shape), op, ratio)) # print(ewT[0,0,:,:]) # print(tfT[0,0,:,:]) # exit() if out: # and ratio > 1.0: np.savetxt("out.txt", dif, fmt='%5.2f') np.savetxt("outC.txt", tfT, fmt='%5.2f') np.savetxt("outD.txt", ewT, fmt='%5.2f') exit()
def testBlocksparseTransformerDense(self): with self.test_session(config=config) as sess, tf.device("/gpu:0"): for bsize in (16, 32, 64): layout = np.ones([heads, ctx, ctx], dtype=np.bool) bst = trans.BlocksparseTransformer(layout, block_size=bsize) shape = (batch, ctx * bsize, heads * state) if ones: cpuQ = np.ones(shape, dtype=np.float32) cpuK = np.ones(shape, dtype=np.float32) cpuV = np.ones(shape, dtype=np.float32) cpuE = np.ones(shape, dtype=np.float32) else: cpuQ = np.random.uniform(-1.0, 1.0, shape).astype( np.float16).astype(np.float32) cpuK = np.random.uniform(-1.0, 1.0, shape).astype( np.float16).astype(np.float32) cpuV = np.random.uniform(-1.0, 1.0, shape).astype( np.float16).astype(np.float32) cpuE = np.random.uniform(-1.0, 1.0, shape).astype( np.float16).astype(np.float32) q = tf.placeholder(tf.float32, shape) k = tf.placeholder(tf.float32, shape) v = tf.placeholder(tf.float32, shape) e = tf.placeholder(tf.float32, shape) feed_dict = {q: cpuQ, k: cpuK, v: cpuV, e: cpuE} qf = ew.float_cast(q, dtype=tf.float16) kf = ew.float_cast(k, dtype=tf.float16) vf = ew.float_cast(v, dtype=tf.float16) w = bst.query_key_op(qf, kf) w = bst.softmax(w, scale=scale) y = bst.weight_value_op(w, vf) qf = trans.transpose_0213( tf.reshape(qf, [batch, ctx * bsize, heads, state])) kf = trans.transpose_0213( tf.reshape(kf, [batch, ctx * bsize, heads, state])) vf = trans.transpose_0213( tf.reshape(vf, [batch, ctx * bsize, heads, state])) W = tf.matmul(qf, kf, transpose_b=True) W = trans.softmax(W, scale=scale) Y = tf.matmul(W, vf) Y = tf.reshape(trans.transpose_0213(Y), [batch, ctx * bsize, heads * state]) y = ew.float_cast(y, dtype=tf.float32) Y = ew.float_cast(Y, dtype=tf.float32) y, (dq, dk, dv) = sess.run([y, tf.gradients(y, [q, k, v], e)], feed_dict) Y, (DQ, DK, DV) = sess.run([Y, tf.gradients(Y, [q, k, v], e)], feed_dict) print("testBlocksparseTransformerDense", bsize) if not bench: for op, dev, cpu in [ [" Y", y, Y], ["DV", dv, DV], ["DK", dk, DK], ["DQ", dq, DQ], ]: self.compare_results(op, dev, cpu)
def testBlocksparseTransformerSparse(self): with self.test_session(config=config) as sess, tf.device("/gpu:0"): for bsize in (16, 32, 64): layout = np.ones([heads, ctx, ctx], dtype=np.bool) for q, k in np.ndindex(ctx, ctx): if k > q: layout[:, q, k] = 0 bst = trans.BlocksparseTransformer(layout, block_size=bsize, mask_callback=mask_callback) shape = (batch, ctx * bsize, heads * state) if ones: cpuQ = np.ones(shape, dtype=np.float32) cpuK = np.ones(shape, dtype=np.float32) cpuV = np.ones(shape, dtype=np.float32) cpuE = np.ones(shape, dtype=np.float32) else: cpuQ = np.random.uniform(-1.0, 1.0, shape).astype( np.float16).astype(np.float32) cpuK = np.random.uniform(-1.0, 1.0, shape).astype( np.float16).astype(np.float32) cpuV = np.random.uniform(-1.0, 1.0, shape).astype( np.float16).astype(np.float32) cpuE = np.random.uniform(-1.0, 1.0, shape).astype( np.float16).astype(np.float32) q = tf.placeholder(tf.float32, shape) k = tf.placeholder(tf.float32, shape) v = tf.placeholder(tf.float32, shape) e = tf.placeholder(tf.float32, shape) feed_dict = {q: cpuQ, k: cpuK, v: cpuV, e: cpuE} qf = ew.float_cast(q, dtype=tf.float16) kf = ew.float_cast(k, dtype=tf.float16) vf = ew.float_cast(v, dtype=tf.float16) w = bst.query_key_op(qf, kf) w = bst.masked_softmax(w, scale=scale) y = bst.weight_value_op(w, vf) y = ew.float_cast(y, dtype=tf.float32) dq, dk, dv = tf.gradients(y, [q, k, v], e) y, dq, dk, dv = sess.run([y, dq, dk, dv], feed_dict) W = bst.nt_test(cpuQ, cpuK) W = bst.masked_softmax_test(W, scale=scale) Y = bst.nn_test(W, cpuV) DV = bst.tn_test(W, cpuE) DW = bst.nt_test(cpuE, cpuV) DW = bst.masked_softmax_grad_test(DW, W, scale=scale) DQ = bst.nn_test(DW, cpuK) DK = bst.tn_test(DW, cpuQ) print("testBlocksparseTransformerSparse", bsize) if not bench: for op, dev, cpu in [ [" Y", y, Y], ["DV", dv, DV], ["DK", dk, DK], ["DQ", dq, DQ], ]: self.compare_results(op, dev, cpu)
def forward(self, inputs, states, ema=None): hps = self.hps bsmm = hps.bsmm with tf.variable_scope(self.scope) as scope: self.param_names = list("amifou") for i in range(1 if hps.share_isteps else hps.isteps): self.param_names.append("h%d" % i) self.params = dict() for p in self.param_names: bsmm_p, size = (bsmm["x"], hps.nproj_in) if p in "am" else (bsmm[p], hps.nhidden) b_init = ones_initializer() if p == 'f' else zeros_initializer( ) w = hps.get_variable("w_" + p, bsmm_p.w_shape, bsmm_p.identity_init()) g = hps.get_variable("g_" + p, [size], ones_initializer()) b = hps.get_variable("b_" + p, [size], b_init) if ema is not None: w = ema.average(w) g = ema.average(g) b = ema.average(b) wc = ew.float_cast(w, dtype=hps.dtype) self.params[p] = (wc, g, b, w) c, h = tf.unstack(states, num=2) c = ew.float_cast(c, dtype=hps.dtype) h = ew.float_cast(h, dtype=hps.dtype) wm, gm, bm = self.params["m"][0:3] wa, ga, ba = self.params["a"][0:3] self.inputs = inputs self.outputs = [] self.segments = [] for xgroup in inputs: if hps.recompute and self.train: # We compute gradient one segment at a time, so prevent tf.gradients from going too far. # We also want to add control inputs to the start of the segment so having wrappers # around the segment inputs is handy. seg = [(tf.stop_gradient(c), tf.stop_gradient(h))] self.segments.append(seg) # delay input expansion to just prior to use (saves memory) with tf.control_dependencies([h]): xwm = bsmm["x"](xgroup, wm, dw_dtype=hps.dw_dtype) xwa = bsmm["x"](xgroup, wa, dw_dtype=hps.dw_dtype) xwm = tf.split(xwm, hps.x_group_size, 1 - hps.axis) xwa = tf.split(xwa, hps.x_group_size, 1 - hps.axis) masks = [] for m, a in zip(xwm, xwa): m = layer_norm(m, gm, bm, axis=hps.axis) a = layer_norm(a, ga, ba, axis=hps.axis) c, h, mask = self.cell(c, h, m, a) _masks = [mask] for _ in range(1, hps.lsteps): c, h, mask = self.cell(c, h, None, None) _masks.append(mask) masks.append(_masks) self.outputs.append(h) if hps.recompute and self.train: with tf.name_scope("f_seg_%04d_%d" % (len(self.segments) - 1, len(seg) - 1)): c_seg, h_seg = seg[0] with tf.control_dependencies([h_seg]): xwm = bsmm["x"](xgroup, wm, dw_dtype=hps.dw_dtype) xwa = bsmm["x"](xgroup, wa, dw_dtype=hps.dw_dtype) xwm = tf.split(xwm, hps.x_group_size, 1 - hps.axis) xwa = tf.split(xwa, hps.x_group_size, 1 - hps.axis) for m, a, mask in zip(xwm, xwa, masks): m = layer_norm(m, gm, bm, axis=hps.axis) a = layer_norm(a, ga, ba, axis=hps.axis) c_seg, h_seg, _ = self.cell( c_seg, h_seg, m, a, mask[0]) for i in range(1, hps.lsteps): c_seg, h_seg, _ = self.cell( c_seg, h_seg, None, None, mask[i]) seg.append((c_seg, h_seg)) c = ew.float_cast(c, dtype=tf.float32) h = ew.float_cast(h, dtype=tf.float32) states = tf.stack([c, h], 0) # We calculate the gradient internally. # Don't let other layer's gradients flow into here. # This is possible because the last cell has free c and h # params that are popluated with zeros in the gradients pass. outputs = [tf.stop_gradient(x) for x in self.outputs] return outputs, states
def fp16(x):
    # no need to cast the gradients back to fp32, as the all-reduce and optimizers handle fp16/fp32 mixed precision
    return float_cast(x, dtype=tf.float16, dx_dtype=tf.float16)
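# Hedged sketch (not part of the original file) of the mixed-precision pattern
# these two helpers support: cast activations to fp16 for the heavy compute and
# back to fp32 before the loss/reductions. `x` and `w16` are illustrative.
def _example_mixed_precision(x, w16):
    x16 = fp16(x)             # forward in fp16; gradients also stay fp16 (dx_dtype)
    h16 = tf.matmul(x16, w16)
    return fp32(h16)          # back to fp32 for loss, norms, reductions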
def testAdafactor(self): with self.test_session(config=config) as sess, tf.device("/gpu:0"): for dtype in (tf.float32, tf.float16): # tf.float16 for shape_g in ( (1024, 1024 * 2), (1, 1024 * 2), (1024, 1023 * 1), (1, 1023 * 1), ): shape_c = (1, shape_g[1]) shape_r = (shape_g[0], 1) if ones: G = np.ones(shape_g, dtype=np.float32) P = np.ones(shape_g, dtype=np.float32) C = np.zeros(shape_c, dtype=np.float32) R = np.zeros(shape_r, dtype=np.float32) else: G = np.random.uniform(-1.0, 1.0, shape_g).astype( np.float16).astype(np.float32) P = np.random.uniform(-1.0, 1.0, shape_g).astype( np.float16).astype(np.float32) C = np.random.uniform(0.0, 1.0, shape_c).astype( np.float16).astype(np.float32) R = np.random.uniform(0.0, 1.0, shape_r).astype( np.float16).astype(np.float32) g = tf.placeholder(tf.float32, G.shape) p = tf.Variable(initial_value=P, name="p") c = tf.Variable(initial_value=C, name="c") r = tf.Variable(initial_value=R, name="r") sess.run(tf.global_variables_initializer()) g = ew.float_cast(g, dtype=dtype) # adafactor has it's own fused infinity filtering but quick test of this standalone op here. g = ew.filter_infinity(g) if shape_g[0] > 1: p, c, r, x, _ = sess.run(adafactor2d_op( p, c, r, g, beta2, learn_rate, grad_scale, clip_thresh, epsilon=epsilon, zero_nans=True), feed_dict={g: G}) C = beta2 * C + (1.0 - beta2) * np.mean( np.square(G) + epsilon, axis=0, keepdims=True) R = beta2 * R + (1.0 - beta2) * np.mean( np.square(G) + epsilon, axis=1, keepdims=True) LTM = np.mean(R, keepdims=True) X = G / (np.sqrt(R / LTM) * np.sqrt(C)) RMS_X = np.sqrt(np.mean(np.square(X), keepdims=True)) else: r = R p, c, x, _ = sess.run(adafactor1d_op(p, c, g, beta2, learn_rate, grad_scale, clip_thresh, epsilon=epsilon, zero_nans=True), feed_dict={g: G}) C = beta2 * C + (1.0 - beta2) * (np.square(G) + epsilon) X = G / np.sqrt(C) RMS_X = np.sqrt(np.mean(np.square(X), keepdims=True)) P -= learn_rate * X / np.maximum(1.0, RMS_X / clip_thresh) print("testAdafactor", dtype) for op, dev, cpu in [ ["C", c, C], ["R", r, R], ["X", x, X], ["P", p, P], ]: self.compare_results(op, dev, cpu)
def forward(self, inputs, states, ema=None):

    hps  = self.hps
    bsmm = hps.bsmm

    with tf.variable_scope(self.scope) as scope:

        self.param_names = list("am")
        for i in range(1 if hps.share_isteps else hps.isteps):
            self.param_names.append("h%d" % i)

        self.params = dict()
        for p in self.param_names:

            bsmm_p, size = (bsmm["x"], hps.nproj_in) if p in "am" else (bsmm[p], hps.nhidden)

            w = hps.get_variable("w_" + p, bsmm_p.w_shape, bsmm_p.identity_init())
            g = hps.get_variable("g_" + p, [size], ones_initializer())
            b = hps.get_variable("b_" + p, [size], zeros_initializer())

            if ema is not None:
                w = ema.average(w)
                g = ema.average(g)
                b = ema.average(b)

            wc = ew.float_cast(w, dtype=hps.dtype)

            self.params[p] = (wc, g, b, w)

        c, h = tf.unstack(states, num=2)
        h = ew.float_cast(h, dtype=hps.dtype)

        wm, gm, bm = self.params["m"][0:3]
        wa, ga, ba = self.params["a"][0:3]

        self.inputs   = inputs
        self.outputs  = []
        self.segments = []

        for xgroup in inputs:

            # delay input expansion to just prior to use (saves memory)
            with tf.control_dependencies([h]):
                xwm = bsmm["x"](xgroup, wm, dw_dtype=hps.dw_dtype)
                xwa = bsmm["x"](xgroup, wa, dw_dtype=hps.dw_dtype)

                xwm = tf.split(xwm, hps.x_group_size, 1 - hps.axis)
                xwa = tf.split(xwa, hps.x_group_size, 1 - hps.axis)

            masks = []
            for m, a in zip(xwm, xwa):

                m = layer_norm(m, gm, bm, axis=hps.axis)
                a = layer_norm(a, ga, ba, axis=hps.axis)

                h = self.cell(h, m, a)

                self.outputs.append(h)

        h = ew.float_cast(h, dtype=tf.float32)

        states = tf.stack([c, h], 0)

        # We calculate the gradient internally.
        # Don't let other layers' gradients flow into here.
        # This is possible because the last cell has free c and h
        # params that are populated with zeros in the gradients pass.
        outputs = [tf.stop_gradient(x) for x in self.outputs]

        return outputs, states
def testLayerNorm(self): # multi-threading screws up benchmarking conf = tf.ConfigProto( intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) with self.test_session(config=conf) as sess, tf.device("/gpu:0"): for shape in shapes: # assume bigger axis is feature axis axis = 1 # 0 if shape[0] > shape[1] else 1 K = shape[ axis] N = shape[1-axis] if one: X = np.ones(shape, dtype=np.float32) E = np.ones(shape, dtype=np.float32) G = np.ones( K, dtype=np.float32) B = np.ones( K, dtype=np.float32) # for n in range(N): # X[:,n] = np.arange(K) else: X = np.random.uniform(-1.0, 1.0, shape).astype(np.float32) E = np.random.uniform(-1.0, 1.0, shape).astype(np.float32) G = np.random.uniform(-1.0, 1.0, (K,)).astype(np.float32) B = np.random.uniform(-1.0, 1.0, (K,)).astype(np.float32) x = tf.constant(X) e = tf.constant(E) g = tf.constant(G) b = tf.constant(B) for dtype in dtypes: # just test relu on floats (it's hard to match low precision relu with high precision behavior) relu = dtype is tf.float32 print("K:%d N:%d Axis:%d Relu:%d dtype:%s" % (K, N, axis, relu, dtype.name)) Y = layer_norm_test(X, G, B, axis=axis, segments=segments, relu=relu) DX, DG, DB = layer_norm_grad_test(E, X, G, B, axis=axis, segments=segments, relu=relu) y = ew.float_cast(x, dtype=dtype) y = layer_norm(y, g, b, axis=axis, segments=segments, relu=relu, bench=bench) y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtype) d = tf.gradients(y, [x, g, b], e) #if bench: sess.run(y) #warmup y, (dx, dg, db) = sess.run( [y, d] ) #y, = sess.run( [y,] ) if bench == 0: for op, cpuA, devA in ( (" y:", Y, y), ("dx:", DX, dx), ("dg:", DG, dg), ("db:", DB, db),): difA = abs(cpuA - devA) avgval = np.average(abs(cpuA)) maxdif = difA.max() max_err = maxdif if avgval == 0 else maxdif / avgval l2_err = np.sqrt(np.square(difA).sum()) / np.sqrt(np.square(cpuA).sum()) #print("max_err: %5.3f, max_val: %7.3f, l1_err: %7.5f, l2_err: %7.5f" % (difO.max(), cpuO.max(), l1_err, l2_err)) print("%s max_err%%:%10.8f L2_err: %12.10f" % (op, 100*max_err, l2_err)) # rtol = 1e-4 if dtype is tf.float32 else 1e-1 # self.assertAllClose(devA, cpuA, rtol=rtol, atol=rtol) if out: np.savetxt("out.txt", difA.reshape((-1,N)), fmt='%7.3f') np.savetxt("outC.txt", cpuA.reshape((-1,N)), fmt='%7.3f') np.savetxt("outD.txt", devA.reshape((-1,N)), fmt='%7.3f') exit() print("")
def testBlocksparseTransformerMatmul(self): with self.test_session(config=config) as sess, tf.device("/gpu:0"): for bsize in (16, 32, 64): # 16, 32, 64 layout = np.ones([1, ctx, ctx], dtype=np.bool) for q, k in np.ndindex(ctx, ctx): if k > q: layout[:, q, k] = 0 #layout[:,0,:] = 1 bst = trans.BlocksparseTransformer(layout, heads=heads, block_size=bsize) q_shape = (batch, ctx * bsize, heads * state) w_shape = (batch, heads, bst.blocks, bsize, bsize) if ones: cpuQ = np.ones(q_shape, dtype=np.float32) cpuK = np.ones(q_shape, dtype=np.float32) cpuW = np.ones(w_shape, dtype=np.float32) # cpuQ[0,0,0,:] = 1 # cpuK[0,0,0,:] = range(64) # cpuW[0,0,0,0,:] = 1 else: cpuQ = np.random.uniform(-1.0, 1.0, q_shape).astype( np.float16).astype(np.float32) cpuK = np.random.uniform(-1.0, 1.0, q_shape).astype( np.float16).astype(np.float32) cpuW = np.random.uniform(-1.0, 1.0, w_shape).astype( np.float16).astype(np.float32) q = tf.placeholder(tf.float32, cpuQ.shape) k = tf.placeholder(tf.float32, cpuK.shape) w = tf.placeholder(tf.float32, cpuW.shape) feed_dict = {q: cpuQ, k: cpuK, w: cpuW} qf = ew.float_cast(q, dtype=tf.float16) kf = ew.float_cast(k, dtype=tf.float16) wf = ew.float_cast(w, dtype=tf.float16) nt = bst.nt_op(qf, kf, bench=bench) nn = bst.nn_op(wf, kf, bench=bench) tn = bst.tn_op(wf, qf, bench=bench) nt = ew.float_cast(nt, dtype=tf.float32) nn = ew.float_cast(nn, dtype=tf.float32) tn = ew.float_cast(tn, dtype=tf.float32) #dx, db = tf.gradients(y, [x, b], e) print("testBlocksparseTransformerMatmul", bsize) nt, nn, tn = sess.run([nt, nn, tn], feed_dict) if not bench: NT = bst.nt_test(cpuQ, cpuK) NN = bst.nn_test(cpuW, cpuK) TN = bst.tn_test(cpuW, cpuQ) for op, dev, cpu in [ ["NT", nt, NT], ["NN", nn, NN], ["TN", tn, TN], ]: self.compare_results(op, dev, cpu)
def group_param_grads(param_grad, group_size=8, cast32=False):

    assert group_size <= 8

    # backward walk param grad to find BlocksparseMatmulDW ops
    # this should only hit BlocksparseMatmulDWs or AddNs or FloatCasts
    ops = get_parents(param_grad, "BlocksparseMatmulDW")

    # this sorting is dependent on the op names being correctly ordered.
    ops.sort(key=lambda op: op.name.split('/')[-1], reverse=True)

    # for x in ops:
    #     print(x.name)
    # print("")
    # exit()

    # use the parent scope for the new ops
    scope = ops[-1].name.split('/')
    scope = '/'.join(scope[0:-1])

    # we're going to be using absolute names, so clear name_scope
    with tf.name_scope(None):

        offset = 0
        # graph = tf.get_default_graph()
        while offset < len(ops):

            xs = [op.inputs[0] for op in ops[offset:offset + group_size]]
            gs = [op.inputs[1] for op in ops[offset:offset + group_size]]

            # Get the corresponding activation grad op for the last param grad op in the group
            bprop = None
            for op in gs[-1].consumers():
                if op.type == "BlocksparseMatmulDX":
                    bprop = op
            assert bprop is not None

            # get attributes of first op in group
            up       = ops[offset]
            blocks   = up.get_attr("blocks")
            bshift   = up.get_attr("bshift")
            axis     = up.get_attr("axis")
            dtype_dw = up.get_attr("dtype_dw")
            gated_dw = up.get_attr("gated_dw")
            C        = up.get_attr("C")
            K        = up.get_attr("K")
            bench    = up.get_attr("bench") // len(xs)
            lut      = up.inputs[2]
            name     = "%s/matmul_concat_updat_%03d" % (scope, offset)
            gate     = [up.inputs[3]] if len(op.inputs) > 3 else []

            # The first op needs to allocate a new dw tensor
            if offset == 0:
                grad = blocksparse_matmul_dw(
                    xs, gs, lut, gate,
                    dtype_dw=dtype_dw, gated_dw=gated_dw, blocks=blocks, bshift=bshift,
                    axis=axis, C=C, K=K, bench=bench, name=name)
            # subsequent ops can just accumulate in place
            else:
                grad = blocksparse_matmul_dwa(
                    xs, gs, lut, grad, gate,
                    gated_dw=gated_dw, blocks=blocks, bshift=bshift,
                    axis=axis, C=C, K=K, bench=bench, name=name)

            # print(grad.op.name, grad.op.device)

            # force the dw op before any more time steps are processed
            add_control_input(bprop, grad.op)
            # print(grad.op.name)

            offset += group_size

    # get the grad back to float32 if requested
    # TODO: splice the graph instead of this hack
    if cast32 and dtype_dw != tf.float32:
        grad = ew.float_cast(grad, dtype=tf.float32)

    return grad
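# Hedged usage sketch (not part of the original file): when one BlocksparseMatMul
# weight is reused across many timesteps, tf.gradients emits one
# BlocksparseMatmulDW op per step; group_param_grads concatenates them into
# accumulating updates of up to `group_size` steps (see also its use in
# testBlocksparseMatMul above). `y`, `x`, `w`, `e` are illustrative tensors.
def _example_group_param_grads(y, x, w, e):
    dx, dw = tf.gradients(y, [x, w], e)
    dw = group_param_grads(dw, group_size=8)
    return dx, dw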
def testEmbeddingLookup(self): config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) with self.test_session(config=config) as sess: for shapeW, shapeI in shapes: C = shapeW[0] shapeY = shapeI + shapeW[1:] np.random.seed(int(time())) cpuI = np.random.randint(0, C, size=shapeI, dtype=np.int32) cpuW = np.random.uniform(-1.0, 1.0, shapeW).astype(np.float32) cpuE = np.random.uniform(-1.0, 1.0, shapeY).astype(np.float32) for dtype in ( tf.float32, tf.float16, ): #tf.float16, tf.float32 for sort in (True, False): results = [] for device in ("gpu", "cpu"): if bench and device == "cpu": break castW = device == "gpu" and dtype is not tf.float32 if castW: if C <= 256: castI = tf.uint8 elif C <= 65536: castI = tf.uint16 else: castI = None else: castI = None with tf.device("/%s:0" % device), tf.name_scope(device): i = tf.placeholder(tf.int32, cpuI.shape, name="i") w = tf.placeholder(tf.float32, cpuW.shape, name="w") e = tf.placeholder(tf.float32, cpuE.shape, name="e") feed_dict = {i: cpuI, w: cpuW, e: cpuE} wf = ew.float_cast(w, dtype=dtype) if castW else w i = tf.cast( i, dtype=castI) if castI is not None else i y = embedding_lookup(wf, i, sort_grad=sort, bench=bench) if castW: y = ew.float_cast(y, dtype=tf.float32) dw, = tf.gradients(y, [w], e) results.append(sess.run([y, dw], feed_dict)) if not bench: for op, dev, cpu in zip(["y", "dw"], results[0], results[1]): dif = np.abs(cpu - dev) avgval = np.average(abs(cpu)) maxdif = dif.max() max_err = maxdif if avgval == 0 else maxdif / avgval l2_err = np.sqrt( np.square(dif).sum()) / np.sqrt( np.square(cpu).sum()) print( "%s, shape:%22s, op:%3s, err:%17.12f, l2_err:%17.12f" % (dtype.name, str( cpu.shape), op, max_err, l2_err))
def testFancyGather(self): config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) with self.test_session(config=config) as sess: for shape in shapes: idx_shape = shape[0:2] idx_dim = shape[2] out_shape = idx_shape + shape[3:] for dtype in (tf.float32, ): #tf.float16, tf.bfloat16 #rtol = 1e-4 if dtype is tf.float32 else 1e-1 #tf.reset_default_graph() np.random.seed(int(time())) cpuX = np.random.uniform(-1.0, 1.0, shape).astype(np.float32) cpuA = np.random.randint(0, idx_dim, size=idx_shape, dtype=np.int32) cpuE = np.random.uniform(-1.0, 1.0, out_shape).astype(np.float32) with tf.device("/gpu:0"): x = tf.placeholder(tf.float32, cpuX.shape) a = tf.placeholder(tf.int32, cpuA.shape) e = tf.placeholder(tf.float32, cpuE.shape) feed_dict = {x: cpuX, a: cpuA, e: cpuE} xf = ew.float_cast(x, dtype=dtype) y = ew.float_cast(ew.fancy_gather(xf, a), dtype=tf.float32, dx_dtype=dtype) devY, (devB, ) = sess.run( [y, tf.gradients(y, [x], e)], feed_dict) y = ew.fancy_gather(x, a, use_tf=True) cpuY, (cpuB, ) = sess.run( [y, tf.gradients(y, [x], e)], feed_dict) for op, devT, cpuT in (("devY", devY, cpuY), ("devB", devB, cpuB)): difA = np.abs(cpuT - devT) maxdif = difA.max() sumerr = (difA > .001).sum() poserr = np.argmax(np.abs(difA).reshape(-1)) print( "%s, shape:%22s, op:%s, err:%17.12f, sum_err: %d, pos_err:%d" % (dtype.name, str(shape), op, maxdif, sumerr, poserr))