def testBiasRelu(self):
    """Check ew.bias_relu forward and backward on GPU against the CPU reference.

    For each shape in the module-level ``shapes`` list, builds identical
    graphs on gpu:0 and cpu:0, runs forward output ``y`` and gradients
    ``dx``/``db``, and prints max/L2 error of gpu vs cpu. When the module
    flag ``bench`` is set, only the GPU pass runs (benchmark mode) and the
    comparison is skipped.
    """
    # Single-threaded session so CPU reference results are deterministic.
    config = tf.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1)
    with self.test_session(config=config) as sess:
        for shape in shapes:
            if ones:
                # All-ones inputs for easy manual debugging.
                # BUG FIX: original used `p.float32` (undefined name `p`);
                # the intended dtype module is `np`, as in the else-branch.
                cpuX = np.ones(shape, dtype=np.float32)
                cpuE = np.ones(shape, dtype=np.float32)
                cpuB = np.ones(shape[1:], dtype=np.float32)
            else:
                # Round-trip x and e through fp16 so the values are exactly
                # representable when the GPU kernel casts to half precision.
                cpuX = np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)
                cpuE = np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)
                cpuB = np.random.uniform(-1.0, 1.0, shape[1:]).astype(np.float32)

            for relu in (True, False):
                for dtype in (tf.float32, ):  # tf.float16, tf.bfloat16
                    results = []
                    for device in ("gpu", "cpu"):
                        if bench and device == "cpu":
                            break  # benchmark mode: skip the slow CPU reference
                        # Only the GPU graph exercises the low-precision cast path.
                        cast = device == "gpu" and dtype is not tf.float32
                        with tf.device("/%s:0" % device), tf.name_scope(device):
                            x = tf.placeholder(tf.float32, cpuX.shape)
                            e = tf.placeholder(tf.float32, cpuE.shape)
                            b = tf.placeholder(tf.float32, cpuB.shape)
                            feed_dict = {x: cpuX, e: cpuE, b: cpuB}

                            xc = ew.float_cast(x, dtype=dtype) if cast else x
                            y = ew.bias_relu(xc, b, relu=relu, atomics=atomics, bench=bench)
                            if cast:
                                y = ew.float_cast(y, dtype=tf.float32)

                            # e is fed as the upstream gradient (grad_ys).
                            dx, db = tf.gradients(y, [x, b], e)
                            results.append(sess.run([y, dx, db], feed_dict))

                    if not bench:
                        # results[0] is gpu, results[1] is cpu (the reference).
                        for op, dev, cpu in zip(["y", "dx", "db"], results[0], results[1]):
                            dif = np.abs(cpu - dev)
                            avgval = np.average(abs(cpu))
                            maxdif = dif.max()
                            # NOTE(review): max_err (relative max error) is computed
                            # but the print below reports the absolute maxdif —
                            # possibly intentional; confirm which was meant.
                            max_err = maxdif if avgval == 0 else maxdif / avgval
                            l2_err = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum())
                            print("%s, shape:%14s, op:%3s(%d), err:%17.12f, l2_err:%17.12f" % (dtype.name, str(cpu.shape), op, relu, maxdif, l2_err))
def conv1d(x, scope, nf, relu=False):
    """Pointwise (1x1) convolution: matmul against an fp16-cast weight,
    followed by a fused bias (+ optional relu).

    Args:
        x: input tensor whose last dim is the feature size.
        scope: variable_scope name under which "w" and "b" are created.
        nf: number of output features.
        relu: apply relu after the bias add when True.

    Returns:
        Tensor with the same leading dims as ``x`` and last dim ``nf``.
    """
    with tf.variable_scope(scope):
        n_in = x.shape[-1].value
        rank = x.shape.ndims
        w = tf.get_variable("w", [n_in, nf],
                            initializer=tf.random_normal_initializer(stddev=0.02))
        b = tf.get_variable("b", [nf],
                            initializer=tf.constant_initializer(0.0))

        # Merge context and batch dims into one so the matmul runs on a
        # 2-D operand (more efficient than a batched matmul).
        flatten = rank > 2
        if flatten:
            out_shape = tf.concat([tf.shape(x)[: rank - 1], [nf]], axis=0)
            x = tf.reshape(x, [-1, n_in])

        # atomics=False: avoid atomics in the bias grad, but be careful as tf
        # handles temp memory badly in the presence of async ops like all-reduce.
        out = bias_relu(tf.matmul(x, fp16(w)), b, relu=relu, atomics=False)

        if flatten:
            out = tf.reshape(out, out_shape)
        return out
def conv1d(x, scope, nf, hps, w_init=tf.random_normal_initializer(stddev=0.02), b_init=tf.constant_initializer(0), relu=False):
    """Pointwise (1x1) convolution with quantization hooks around the matmul.

    Weights and activations pass through ``quantize_pre`` before the matmul
    and the product through ``quantize_post`` after it, all tagged with
    ``hps.tag``; bias (+ optional relu) is then fused on top.

    NOTE(review): this shares a name with the fp16 conv1d variant — if both
    live in the same module, this definition shadows the earlier one; confirm
    they come from separate files.

    Args:
        x: input tensor whose last dim is the feature size.
        scope: variable_scope name under which "w" and "b" are created.
        nf: number of output features.
        hps: hyperparameter object; only ``hps.tag`` is read here.
        w_init, b_init: variable initializers for "w" and "b".
        relu: apply relu after the bias add when True.

    Returns:
        Tensor with the same leading dims as ``x`` and last dim ``nf``.
    """
    with tf.variable_scope(scope):
        n_in = x.shape[-1].value
        rank = x.shape.ndims
        w = tf.get_variable("w", [n_in, nf], initializer=w_init)
        b = tf.get_variable("b", [nf], initializer=b_init)

        # Collapse leading dims for an efficient 2-D matmul.
        needs_reshape = rank > 2
        if needs_reshape:
            out_shape = tf.concat([tf.shape(x)[: rank - 1], [nf]], axis=0)
            x = tf.reshape(x, [-1, n_in])

        # Fully-qualified scope name keys the quantization ops.
        vs_name = tf.get_variable_scope().name
        w = quantize_pre(w, name=vs_name + "/pre_w", tag=hps.tag)
        x = quantize_pre(x, name=vs_name + "/pre_x", tag=hps.tag)
        out = quantize_post(tf.matmul(x, w), name=vs_name + "/post_x", tag=hps.tag)
        out = bias_relu(out, b, relu=relu)

        if needs_reshape:
            out = tf.reshape(out, out_shape)
        return out