def embed_input(x, hps):
    """ embed discrete inputs to continuous space and add learned position embeddings """
    x_embed = tf.get_variable('x_embed', [hps.n_bin, hps.n_state],
                              initializer=tf.random_normal_initializer(stddev=0.02))
    pos_embed = tf.get_variable('pos_embed', [hps.n_x, hps.n_state],
                                initializer=tf.random_normal_initializer(stddev=0.01))
    h = tf.add(embedding_lookup(x_embed, x), pos_embed)
    return h
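# ---------------------------------------------------------------------------
# Example usage (sketch) for embed_input. The hps values below and the use of
# tf.nn.embedding_lookup in place of the custom embedding_lookup op are
# assumptions for illustration only.
import tensorflow as tf
from types import SimpleNamespace

embedding_lookup = tf.nn.embedding_lookup       # stand-in for the custom op
hps = SimpleNamespace(n_bin=256, n_x=64, n_state=512)  # assumed sizes

x = tf.placeholder(tf.int32, [None, hps.n_x])   # [batch, n_x] discrete codes
h = embed_input(x, hps)                         # [batch, n_x, n_state]
# ---------------------------------------------------------------------------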
def testEmbeddingLookup(self):
    config = tf.ConfigProto(intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    with self.test_session(config=config) as sess:
        for shapeW, shapeI in shapes:
            C = shapeW[0]
            shapeY = shapeI + shapeW[1:]

            np.random.seed(int(time()))
            cpuI = np.random.randint(0, C, size=shapeI, dtype=np.int32)
            cpuW = np.random.uniform(-1.0, 1.0, shapeW).astype(np.float32)
            cpuE = np.random.uniform(-1.0, 1.0, shapeY).astype(np.float32)

            for dtype in (tf.float32, tf.float16):
                for sort in (True, False):
                    results = []
                    for device in ("gpu", "cpu"):
                        if bench and device == "cpu":
                            break

                        # On the gpu in reduced precision, cast the weights and
                        # use the narrowest integer type that can index C rows.
                        castW = device == "gpu" and dtype is not tf.float32
                        if castW:
                            if C <= 256:
                                castI = tf.uint8
                            elif C <= 65536:
                                castI = tf.uint16
                            else:
                                castI = None
                        else:
                            castI = None

                        with tf.device("/%s:0" % device), tf.name_scope(device):
                            i = tf.placeholder(tf.int32, cpuI.shape, name="i")
                            w = tf.placeholder(tf.float32, cpuW.shape, name="w")
                            e = tf.placeholder(tf.float32, cpuE.shape, name="e")
                            feed_dict = {i: cpuI, w: cpuW, e: cpuE}

                            wf = ew.float_cast(w, dtype=dtype) if castW else w
                            i = tf.cast(i, dtype=castI) if castI is not None else i

                            y = embedding_lookup(wf, i, sort_grad=sort, bench=bench)

                            if castW:
                                y = ew.float_cast(y, dtype=tf.float32)

                            dw, = tf.gradients(y, [w], e)

                        results.append(sess.run([y, dw], feed_dict))

                    if not bench:
                        # compare gpu results against the fp32 cpu reference
                        for op, dev, cpu in zip(["y", "dw"], results[0], results[1]):
                            dif = np.abs(cpu - dev)
                            avgval = np.average(abs(cpu))
                            maxdif = dif.max()
                            max_err = maxdif if avgval == 0 else maxdif / avgval
                            l2_err = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum())
                            print("%s, shape:%22s, op:%3s, err:%17.12f, l2_err:%17.12f"
                                  % (dtype.name, str(cpu.shape), op, max_err, l2_err))
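# ---------------------------------------------------------------------------
# What the test checks, sketched with stock ops: a gather forward pass and its
# scatter-add gradient into the weight rows. The shapes here are assumptions;
# the real test sweeps a `shapes` list over both devices, dtypes, and sort
# modes. This is an illustration, not the custom kernel itself.
import numpy as np
import tensorflow as tf

C, D = 1000, 64                                    # assumed vocab rows, width
idx = tf.placeholder(tf.int32, [8, 16])
w   = tf.placeholder(tf.float32, [C, D])
e   = tf.placeholder(tf.float32, [8, 16, D])       # upstream gradient for y

y = tf.gather(w, idx)                              # y[b, t] = w[idx[b, t]]
dw = tf.unsorted_segment_sum(tf.reshape(e, [-1, D]),
                             tf.reshape(idx, [-1]),
                             C)                    # scatter-add of e into w rows

with tf.Session() as sess:
    feed = {idx: np.random.randint(0, C, (8, 16)),
            w:   np.random.uniform(-1, 1, (C, D)).astype(np.float32),
            e:   np.random.uniform(-1, 1, (8, 16, D)).astype(np.float32)}
    y_val, dw_val = sess.run([y, dw], feed)
# ---------------------------------------------------------------------------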
def model(xs, ys):

    with tf.variable_scope("model"):

        with tf.device("/cpu:0"):
            global_step = tf.Variable(1.0, trainable=False)
            # linear warmup to hps.lr over hps.warmup_iters steps
            learning_rate = tf.minimum(
                global_step * tf.constant(1.0 / hps.warmup_iters),
                1.0) * tf.constant(hps.lr)

        with tf.device("/gpu:0"):

            # embed discrete inputs to continuous space and add learned position embeddings
            with tf.variable_scope('embed'):
                x_embed = fp16(tf.get_variable(
                    "x", [hps.n_vocab, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.02)))
                pos_embed = fp16(tf.get_variable(
                    'pos', [1, hps.n_timesteps, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.01)))
                h = embedding_lookup(x_embed, xs) + pos_embed

            for l in range(hps.n_layer):
                h = transformer_block(h, 'layer_%d' % l, hps.n_head, hps.n_timesteps)

            # project transformer features onto the tied embedding matrix for per-token logits
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            labels = tf.reshape(ys, [-1])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=fp32(logits), labels=tf.cast(labels, tf.int32))
            loss = tf.reduce_mean(loss)

            params = tf.trainable_variables()
            # scale the loss before backprop so small fp16 gradients don't flush to zero
            grads = tf.gradients(loss * hps.cost_scale, params)

            mpi_scale = 1.0 / mpi_size
            if mpi_size > 1:
                loss = allreduce(loss) * mpi_scale
                # allreduce gradients in groups, last layer first, so communication
                # overlaps with the remainder of the backward pass
                group_allreduce(
                    grads, params,
                    search_strings=["layer_%d" % l for l in range(hps.n_layer - 1, -1, -1)] + ["embed"],
                    cast_all=tf.float16)

            # for tuning fp16 cost scaling
            if hps.log_stats and mpi_rank == 0:
                for i, (grad, param) in enumerate(zip(grads, params)):
                    name = param.op.name + "_" + "_".join(str(x) for x in param.shape.as_list())
                    grads[i] = log_stats(grad, tf.cast(global_step, tf.int32),
                                         logfile="scale_stats.txt", name=name)

            # use adafactor for most params and adam for embeddings
            fact_grads = list()
            adam_grads = list()
            for grad, param in zip(grads, params):
                if "embed" in param.op.name:
                    # For the input embedding, a lazy update would touch the param and
                    # running stats only for rows selected by the input, which is more
                    # stable for rarely used embedding entries.
                    # if "x" in param.op.name:
                    #     grad.lazy = True
                    adam_grads.append((grad, param))
                else:
                    fact_grads.append((grad, param))

            fact = AdafactorOptimizer(learning_rate=learning_rate,
                                      grad_scale=mpi_scale / hps.cost_scale,
                                      sat_infs=True)
            adam = AdamOptimizer(learning_rate=learning_rate,
                                 grad_scale=mpi_scale / hps.cost_scale,
                                 sat_infs=True)
            train_op = tf.group(fact.apply_gradients(fact_grads),
                                adam.apply_gradients(adam_grads))

        # update global step after we're done using it for this update
        with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
            update_op = tf.assign_add(global_step, 1.0)

        return loss, tf.group(train_op, update_op)
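# ---------------------------------------------------------------------------
# The cost_scale / grad_scale pairing above is static fp16 loss scaling: the
# loss is multiplied by a large constant before backprop so small fp16
# gradients don't flush to zero, and the optimizers undo the scale at update
# time (folding in the 1/mpi_size average over ranks). The numbers below are
# assumptions chosen to illustrate the arithmetic.
import numpy as np

cost_scale = 2.0 ** 16                             # assumed scale factor
mpi_size   = 8
grad_scale = (1.0 / mpi_size) / cost_scale         # what the optimizers apply

true_grad   = 3e-7                                 # loses precision in raw fp16
scaled_fp16 = np.float16(true_grad * cost_scale)   # ~0.0197, comfortably in range
summed      = float(scaled_fp16) * mpi_size        # allreduce sums over ranks
recovered   = summed * grad_scale                  # back to ~3e-7, in fp32
print(recovered)
# ---------------------------------------------------------------------------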
def model(xs, ys, cost_scale, grad_scale):

    with tf.variable_scope("model"):

        with tf.device("/cpu:0"):
            global_step = tf.Variable(1.0, trainable=False)
            # linear warmup to hps.lr over hps.warmup_iters steps
            learning_rate = tf.minimum(
                global_step * tf.constant(1.0 / hps.warmup_iters),
                tf.constant(1.0)) * tf.constant(hps.lr)

        with tf.device("/gpu:0"):

            # Scope/var_name substrings used to group gradients for allreduce.
            # You'll want groupings that tensorflow schedules uniquely; otherwise the allreduce could hang.
            # The groups should be listed in the order in which the allreduces are called.
            # Any gradients not matching a substring get appended to the last group.
            grad_groups = []

            # embed discrete inputs to continuous space and add learned position embeddings
            with tf.variable_scope('embed'):
                x_embed = fp16(tf.get_variable(
                    "x", [hps.n_vocab, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.02)))
                pos_embed = fp16(tf.get_variable(
                    'pos', [1, hps.n_timesteps, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.01)))
                h = embedding_lookup(x_embed, xs) + pos_embed
            grad_groups.insert(0, 'embed')

            for l in range(hps.n_layer):
                layer_name = 'layer_%d' % l
                h = transformer_block(h, layer_name, hps.n_head, hps.n_timesteps)
                grad_groups.insert(0, layer_name)

            # project transformer features onto the tied embedding matrix for per-token logits
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            # labels = tf.reshape(ys, [-1])
            # loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=fp32(logits), labels=tf.cast(labels, tf.int32))
            loss = softmax_cross_entropy(logits=logits, labels=ys)
            loss = tf.reduce_mean(loss)

            params = tf.trainable_variables()
            # use scale_tensor so we can keep the cost_scale a host-side placeholder
            grads = tf.gradients(scale_tensor(loss, cost_scale), params)

            if mpi_size > 1:
                loss = allreduce(loss) * tf.constant(1.0 / mpi_size)
                group_allreduce(grads, params, search_strings=grad_groups)

            global_norm, norm_scale = ClipGlobalNorm(grads,
                                                     grad_scale=grad_scale,
                                                     clip_norm=hps.clip_norm)

            # for tuning fp16 cost scaling
            if hps.log_stats and mpi_rank == 0:
                for i, (grad, param) in enumerate(zip(grads, params)):
                    name = param.op.name + "_" + "_".join(str(x) for x in param.shape.as_list())
                    grads[i] = log_stats(grad, tf.cast(global_step, tf.int32),
                                         logfile="scale_stats.txt", name=name)

            # use adafactor for most params and adam for embeddings
            fact_grads = list()
            adam_grads = list()
            for grad, param in zip(grads, params):
                if "embed" in param.op.name:
                    # For the input embedding, a lazy update would touch the param and
                    # running stats only for rows selected by the input, which is more
                    # stable for rarely used embedding entries. Note that we reuse
                    # x_embed as the output logits projection, so there's little value
                    # to using lazy here.
                    # if "x" in param.op.name:
                    #     grad.lazy = True
                    adam_grads.append((grad, param))
                else:
                    fact_grads.append((grad, param))

            fact = AdafactorOptimizer(learning_rate=learning_rate,
                                      norm_scale=norm_scale, grad_scale=grad_scale)
            adam = AdamOptimizer(learning_rate=learning_rate,
                                 norm_scale=norm_scale, grad_scale=grad_scale)
            train_op = tf.group(fact.apply_gradients(fact_grads),
                                adam.apply_gradients(adam_grads))

        # update global step after we're done using it for this update
        with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
            update_op = tf.assign_add(global_step, 1.0)

        return loss, tf.group(train_op, update_op), global_norm, norm_scale
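# ---------------------------------------------------------------------------
# ClipGlobalNorm comes from the library; below is a plausible sketch of the
# contract it appears to provide, written with stock ops. This is an
# assumption for illustration, not the library source: the norm is measured on
# the cost-scaled grads corrected by grad_scale, and norm_scale is handed to
# the optimizers to apply at update time rather than rewriting the gradient
# tensors in place.
import tensorflow as tf

def clip_global_norm_sketch(grads, grad_scale, clip_norm):
    # sum of squares across all gradient tensors, accumulated in fp32
    sumsq = tf.add_n([tf.reduce_sum(tf.square(tf.cast(g, tf.float32)))
                      for g in grads])
    global_norm = tf.sqrt(sumsq) * grad_scale   # norm in true-gradient units
    norm_scale = tf.minimum(clip_norm / global_norm, 1.0)
    return global_norm, norm_scale
# ---------------------------------------------------------------------------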