def MatmulJob():
    with flow.scope.placement(device_type, "0:0"):
        a = flow.get_variable(
            "a",
            shape=a_shape,
            dtype=dtype,
            initializer=flow.random_uniform_initializer(minval=0, maxval=1),
            trainable=True,
        )
        b = flow.get_variable(
            "b",
            shape=b_shape,
            dtype=dtype,
            initializer=flow.random_uniform_initializer(minval=0, maxval=1),
            trainable=True,
        )
        if data_type == "float16":
            out = flow.matmul(
                flow.cast(a, dtype=flow.float16),
                flow.cast(b, dtype=flow.float16),
                transpose_a,
                transpose_b,
                alpha,
            )
            c = flow.get_variable(
                "c",
                shape=out.shape,
                dtype=dtype,
                initializer=flow.random_uniform_initializer(minval=-1, maxval=1),
                trainable=True,
            )
            loss = flow.cast(out + flow.cast(c, dtype=flow.float16), dtype=flow.float)
        else:
            out = flow.matmul(a, b, transpose_a, transpose_b, alpha)
            c = flow.get_variable(
                "c",
                shape=out.shape,
                dtype=dtype,
                initializer=flow.random_uniform_initializer(minval=-1, maxval=1),
                trainable=True,
            )
            loss = out + c
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
        ).minimize(loss)

        flow.watch(a, test_global_storage.Setter("a"))
        flow.watch_diff(a, test_global_storage.Setter("a_diff"))
        flow.watch(b, test_global_storage.Setter("b"))
        flow.watch_diff(b, test_global_storage.Setter("b_diff"))
        flow.watch(c, test_global_storage.Setter("c"))
        flow.watch_diff(c, test_global_storage.Setter("c_diff"))
        flow.watch(loss, test_global_storage.Setter("loss"))
        flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))

        return loss
def Matmul(
    x: tp.Numpy.Placeholder((4, 4), dtype=flow.float32),
    y: tp.Numpy.Placeholder((4, 4), dtype=flow.float32),
) -> tp.Numpy:
    s = flow.matmul(x, y)
    flow.watch(s, Watch)
    z = flow.matmul(s, x)
    return z
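# A minimal NumPy cross-check for the job above, assuming it is decorated with
# @flow.global_function() so it can be called directly with NumPy arrays; the
# names x_np / y_np are illustrative and not part of the original snippet.
import numpy as np

x_np = np.random.rand(4, 4).astype(np.float32)
y_np = np.random.rand(4, 4).astype(np.float32)
# The job computes z = (x @ y) @ x; the eager NumPy equivalent is:
expected = np.matmul(np.matmul(x_np, y_np), x_np)
# z = Matmul(x_np, y_np)
# assert np.allclose(z, expected, atol=1e-4)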
def createOfQNet(
    input_image: oft.Numpy.Placeholder((BATCH_SIZE, 4, 64, 64), dtype=flow.float32),
    var_name_prefix: str = "QNet",
    is_train: bool = True,
) -> oft.Numpy:
    (
        conv1_weight,
        conv1_bias,
        conv2_weight,
        conv2_bias,
        fc1_weight,
        fc1_bias,
        fc2_weight,
        fc2_bias,
    ) = getQNetParams(var_name_prefix=var_name_prefix, is_train=is_train)

    conv1 = flow.nn.compat_conv2d(
        input_image, conv1_weight, strides=[1, 1], padding="same", data_format="NCHW"
    )
    conv1 = flow.nn.bias_add(conv1, conv1_bias, "NCHW")
    conv1 = flow.nn.relu(conv1)
    pool1 = flow.nn.max_pool2d(conv1, 2, 2, "VALID", "NCHW", name="pool1")

    conv2 = flow.nn.compat_conv2d(
        pool1, conv2_weight, strides=[1, 1], padding="same", data_format="NCHW"
    )
    conv2 = flow.nn.bias_add(conv2, conv2_bias, "NCHW")
    conv2 = flow.nn.relu(conv2)
    pool2 = flow.nn.max_pool2d(conv2, 2, 2, "VALID", "NCHW", name="pool2")

    pool2_flatten = flow.reshape(pool2, (BATCH_SIZE, -1))
    fc1 = flow.matmul(a=pool2_flatten, b=fc1_weight, transpose_b=True)
    fc1 = flow.nn.bias_add(fc1, fc1_bias)
    fc1 = flow.nn.relu(fc1)

    fc2 = flow.matmul(a=fc1, b=fc2_weight, transpose_b=True)
    fc2 = flow.nn.bias_add(fc2, fc2_bias)
    return fc2
def row_parallel_linear(
    name,
    x,
    output_size,
    weight_initializer,
    bias_initializer=flow.constant_initializer(0.0),
    weight_parallel_dist=distribute.get_row_linear_weight_parallel_dist(),
    bias_parallel_dist=distribute.get_row_linear_bias_parallel_dist(),
    dropout_rate=0.1,
    bias_dropout_fusion=True,
):
    w, b = get_linear_params(
        name,
        x.shape[-1],
        output_size,
        x.dtype,
        weight_initializer=weight_initializer,
        bias_initializer=bias_initializer,
        weight_parallel_dist=weight_parallel_dist,
        bias_parallel_dist=bias_parallel_dist,
    )
    # 2d sbp sig: [S(0), S(1)] x [B, S(0)] -> [S(0), P] -> [S(0), B]
    # data grad 2d sbp sig: [S(0), B] x [B, S(1)](transposed) -> [S(0), S(1)]
    x = flow.matmul(x, w)
    x = distribute.forward_p2b_parallel_cast(x)
    if bias_dropout_fusion:
        x = flow.nn.fused_bias_add_dropout(x, b, data_format="NHC", rate=dropout_rate)
    else:
        x = flow.nn.bias_add(x, b, data_format="NHC")
        x = flow.nn.dropout(x, rate=dropout_rate)
    return x
def logits(self, hidden_states, token_embeddings):
    """
    shape sig: (batch_size * seq_length, hidden_size) x (hidden_size, vocab_size)(transposed)
        -> (batch_size * seq_length, vocab_size)
    dp sbp sig: S(0) x B -> S(0)
    2d sbp sig: [S(0), B] x [B, S(1)](transposed) -> [S(0), S(1)]
    """
    assert len(hidden_states.shape) == 3
    assert np.prod(hidden_states.shape[0:2]) == self.batch_size * self.seq_length
    assert hidden_states.shape[-1] == self.hidden_size
    assert len(token_embeddings.shape) == 2
    assert token_embeddings.shape[0] == self.vocab_size
    assert token_embeddings.shape[1] == self.hidden_size

    with distribute.layer_placement_scope(-1):
        if (
            hidden_states.shape[0] == self.seq_length
            and hidden_states.shape[1] == self.batch_size
        ):
            # [s, b, H] -> [b, s, H]
            h = flow.transpose(hidden_states, [1, 0, 2])
        elif (
            hidden_states.shape[0] == self.batch_size
            and hidden_states.shape[1] == self.seq_length
        ):
            h = hidden_states
        else:
            raise ValueError(f"invalid hidden states shape {hidden_states.shape}")

        # [s, b, H] or [b, s, H] -> [b * s, H]
        h = flow.flatten(h, start_dim=0, end_dim=1)
        # 2d sbp sig: [S(0), B] x [B, S(1)](transposed) -> [S(0), S(1)]
        # grad 2d sbp sig: [S(0), S(1)] x [B, S(0)] -> [S(0), P] -> [S(0), B]
        h = distribute.backward_p2b_parallel_cast(h)
        lgs = flow.matmul(h, token_embeddings, transpose_b=True)

    return lgs
def _AddClassficationLoss(
    input_blob,
    label_blob,
    hidden_size,
    label_num,
    initializer_range,
    scope_name="classification",
):
    with flow.scope.namespace(scope_name):
        output_weight_blob = flow.get_variable(
            name="output_weights",
            shape=[label_num, hidden_size],
            dtype=input_blob.dtype,
            # initializer=bert_util.CreateInitializer(initializer_range),
            initializer=flow.random_normal_initializer(
                mean=0.0, stddev=initializer_range, seed=None, dtype=None
            ),
        )
        output_bias_blob = flow.get_variable(
            name="output_bias",
            shape=[label_num],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(0.0),
        )
        logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_blob
        )
        loss = pre_example_loss
        return loss, pre_example_loss, logit_blob
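# A hedged NumPy shape check for the classification head above: because the
# weight is stored as (label_num, hidden_size), matmul with transpose_b=True
# computes input @ W.T and yields (batch, label_num) logits. The sizes below
# are illustrative only.
import numpy as np

batch, hidden_size_np, label_num_np = 8, 768, 2
input_np = np.random.rand(batch, hidden_size_np).astype(np.float32)
w_np = np.random.rand(label_num_np, hidden_size_np).astype(np.float32)
b_np = np.zeros(label_num_np, dtype=np.float32)
logits_np = np.matmul(input_np, w_np.T) + b_np
assert logits_np.shape == (batch, label_num_np)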
def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
    with flow.scope.namespace("cls-seq_relationship"):
        output_weight_blob = flow.get_variable(
            name="output_weights",
            shape=[2, hidden_size],
            dtype=input_blob.dtype,
            model_name="weight",
            initializer=bert_util.CreateInitializer(initializer_range),
        )
        output_bias_blob = flow.get_variable(
            name="output_bias",
            shape=[2],
            dtype=input_blob.dtype,
            model_name="bias",
            initializer=flow.constant_initializer(0.0),
        )
        logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_blob
        )
        loss = pre_example_loss
        return (loss, pre_example_loss, logit_blob)
def test_fn(
    a: flow.typing.Numpy.Placeholder(a_shape),
    b: flow.typing.Numpy.Placeholder(b_shape),
    c: flow.typing.Numpy.Placeholder(c_shape),
) -> flow.typing.Numpy:
    var_a = flow.get_variable(
        name="var_a",
        shape=a_shape,
        dtype=flow.float32,
        initializer=flow.ones_initializer(),
        distribute=flow.distribute.split(1),
    )
    a = flow.parallel_cast(a, distribute=flow.distribute.split(1))
    a = var_a * a
    out = flow.matmul(a, b)
    out = flow.parallel_cast(
        out,
        distribute=flow.distribute.broadcast(),
        gradient_distribute=flow.distribute.broadcast(),
    )
    c = flow.parallel_cast(c, distribute=flow.distribute.broadcast())
    out = flow.nn.bias_add(out, c)
    lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
    flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(out)
    return out
def col_parallel_linear(
    name,
    x,
    output_size,
    weight_initializer,
    bias_initializer=flow.constant_initializer(0.0),
    weight_parallel_dist=distribute.get_col_linear_weight_parallel_dist(),
    bias_parallel_dist=distribute.get_col_linear_bias_parallel_dist(),
    need_gelu=False,
    bias_gelu_fusion=True,
):
    w, b = get_linear_params(
        name,
        x.shape[-1],
        output_size,
        x.dtype,
        weight_initializer=weight_initializer,
        bias_initializer=bias_initializer,
        weight_parallel_dist=weight_parallel_dist,
        bias_parallel_dist=bias_parallel_dist,
    )
    # 2d sbp sig: [S(0), B] x [B, S(1)] -> [S(0), S(1)]
    # data grad 2d sbp sig: [S(0), S(1)] x [B, S(0)](transposed) -> [S(0), P] -> [S(0), B]
    x = distribute.backward_p2b_parallel_cast(x)
    x = flow.matmul(x, w)
    if need_gelu:
        if bias_gelu_fusion:
            x = flow.nn.fused_bias_add_gelu(x, b, data_format="NHC")
        else:
            x = flow.nn.bias_add(x, b, data_format="NHC")
            x = flow.math.gelu(x)
    else:
        x = flow.nn.bias_add(x, b, data_format="NHC")
    return x
def gram_matrix(input):
    b = input.shape[0]
    ch = input.shape[1]
    h = input.shape[2]
    w = input.shape[3]
    features = flow.reshape(input, [b, ch, h * w])
    features_t = flow.transpose(features, [0, 2, 1])
    gram = flow.matmul(features, features_t) / (ch * h * w)
    return gram
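# A NumPy sketch of the same normalized Gram computation, for reference only;
# the function and argument names below are illustrative, not part of the
# original model code.
import numpy as np

def gram_matrix_np(x):
    # x: (b, ch, h, w) -> features: (b, ch, h * w)
    b, ch, h, w = x.shape
    features = x.reshape(b, ch, h * w)
    # Batched matmul of features with its transpose, normalized by ch * h * w.
    return np.matmul(features, features.transpose(0, 2, 1)) / (ch * h * w)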
def fused_multihead_attn(self, h):
    assert len(h.shape) == 3
    assert h.shape[0] == self.seq_length
    assert h.shape[1] == self.batch_size
    assert h.shape[2] == self.hidden_size * 3
    qmk, v = flow.nn.fused_self_attention_query_mul_key_and_value(
        h, head_size=self.head_size, alpha=(1.0 / self.norm_factor)
    )
    qmk = self.tril_softmax_dropout(qmk)
    return flow.matmul(qmk, v)
def inceptionv3(images, labels, trainable=True):
    conv0 = _conv2d_layer(
        "conv0", images, filters=32, kernel_size=3, strides=2, padding="VALID"
    )
    conv1 = _conv2d_layer(
        "conv1", conv0, filters=32, kernel_size=3, strides=1, padding="VALID"
    )
    conv2 = _conv2d_layer(
        "conv2", conv1, filters=64, kernel_size=3, strides=1, padding="SAME"
    )
    pool1 = flow.nn.max_pool2d(
        conv2, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1"
    )
    conv3 = _conv2d_layer(
        "conv3", pool1, filters=80, kernel_size=1, strides=1, padding="VALID"
    )
    conv4 = _conv2d_layer(
        "conv4", conv3, filters=192, kernel_size=3, strides=1, padding="VALID"
    )
    pool2 = flow.nn.max_pool2d(
        conv4, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool2"
    )
    mixed_0 = InceptionA(pool2, 0)
    mixed_1 = InceptionA(mixed_0, 1)
    mixed_2 = InceptionA(mixed_1, 2)
    mixed_3 = InceptionB(mixed_2, 3)
    mixed_4 = InceptionC(mixed_3, 4, 128)
    mixed_5 = InceptionC(mixed_4, 5, 160)
    mixed_6 = InceptionC(mixed_5, 6, 160)
    mixed_7 = InceptionC(mixed_6, 7, 192)
    mixed_8 = InceptionD(mixed_7, 8)
    mixed_9 = InceptionE(mixed_8, 9)
    mixed_10 = InceptionE(mixed_9, 10)
    pool3 = flow.nn.avg_pool2d(
        mixed_10, ksize=8, strides=1, padding="VALID", data_format="NCHW", name="pool3"
    )
    with flow.scope.namespace("logits"):
        pool3 = flow.reshape(pool3, [pool3.shape[0], -1])
        weight = flow.get_variable(
            "fc1-weight",
            shape=(pool3.shape[1], 1001),
            dtype=flow.float,
            initializer=flow.truncated_normal(0.816496580927726),
            model_name="weight",
        )
        bias = flow.get_variable(
            "fc1-bias",
            shape=(1001,),
            dtype=flow.float,
            initializer=flow.constant_initializer(),
            model_name="bias",
        )
        fc1 = flow.matmul(pool3, weight)
        fc1 = flow.nn.bias_add(fc1, bias)
    return fc1
def self_attn_qk_v_fw_bw(
    h: flow.typing.Numpy.Placeholder(
        shape=(seq_len, batch_size, hidden_size), dtype=flow.float32
    )
) -> typing.Tuple[flow.typing.Numpy, flow.typing.Numpy]:
    var = flow.get_variable(
        "var",
        shape=(1,),
        dtype=flow.float32,
        initializer=flow.constant_initializer(1.0, dtype=flow.float32),
        trainable=True,
    )
    h = h * var
    if fused:
        flow.watch_diff(h, test_global_storage.Setter("h_grad_fused"))
    else:
        flow.watch_diff(h, test_global_storage.Setter("h_grad"))
    if fp16:
        h = flow.amp_white_identity(h)
    alpha = get_alpha(head_size)
    if fused:
        (qmk, v) = flow.nn.fused_self_attention_query_mul_key_and_value(
            h, head_size=head_size, alpha=alpha
        )
    else:
        h = flow.reshape(h, (seq_len, batch_size, -1, 3 * head_size))
        (q, k, v) = (
            flow.transpose(
                flow.slice(
                    h,
                    begin=[None, None, None, head_size * i],
                    size=[None, None, None, head_size],
                ),
                perm=[1, 2, 0, 3],
            )
            for i in range(3)
        )
        qmk = flow.matmul(q, k, transpose_b=True, alpha=alpha)
    h = flow.matmul(qmk, v)
    loss = flow.math.reduce_sum(h)
    flow.optimizer.SGD(get_lr_scheduler(), momentum=0).minimize(loss)
    return (qmk, v)
def _dense_layer(
    inputs,
    units,
    activation=None,
    use_bias=True,
    kernel_initializer=None,
    bias_initializer=None,
    trainable=True,
    name=None,
):
    in_shape = inputs.shape
    in_num_axes = len(in_shape)
    assert in_num_axes >= 2
    name_prefix = name if name is not None else id_util.UniqueStr("Dense_")
    inputs = flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs
    weight = flow.get_variable(
        name="{}-weight".format(name_prefix),
        shape=(units, inputs.shape[1]),
        dtype=inputs.dtype,
        initializer=kernel_initializer
        if kernel_initializer is not None
        else flow.constant_initializer(0),
        trainable=trainable,
        model_name="weight",
    )
    weight = flow.identity(weight)
    weight = flow.repeat(weight, args.num_piece_in_batch)
    out = flow.matmul(
        a=inputs, b=weight, transpose_b=True, name="{}_matmul".format(name_prefix)
    )
    if use_bias:
        bias = flow.get_variable(
            name="{}-bias".format(name_prefix),
            shape=(units,),
            dtype=inputs.dtype,
            initializer=bias_initializer
            if bias_initializer is not None
            else flow.constant_initializer(0),
            trainable=trainable,
            model_name="bias",
        )
        bias = flow.identity(bias)
        bias = flow.repeat(bias, args.num_piece_in_batch)
        out = flow.nn.bias_add(out, bias, name="{}_bias_add".format(name_prefix))
    out = (
        activation(out, name="{}_activation".format(name_prefix))
        if activation is not None
        else out
    )
    out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
    return out
def dense(
    input, units, name, use_bias=False, trainable=True, reuse=False, const_init=False
):
    name_ = name if reuse == False else name + "_reuse"
    in_shape = input.shape
    in_num_axes = len(in_shape)
    assert in_num_axes >= 2
    inputs = flow.reshape(input, (-1, in_shape[-1])) if in_num_axes > 2 else input
    weight = flow.get_variable(
        name="{}-weight".format(name),
        shape=(units, inputs.shape[1]),
        dtype=inputs.dtype,
        initializer=flow.random_normal_initializer(stddev=0.02)
        if not const_init
        else get_const_initializer(),
        trainable=trainable,
        reuse=reuse,
        model_name="weight",
    )
    out = flow.matmul(a=inputs, b=weight, transpose_b=True, name=name_ + "matmul")
    if use_bias:
        bias = flow.get_variable(
            name="{}-bias".format(name),
            shape=(units,),
            dtype=inputs.dtype,
            initializer=flow.random_normal_initializer()
            if not const_init
            else get_const_initializer(),
            trainable=trainable,
            reuse=reuse,
            model_name="bias",
        )
        out = flow.nn.bias_add(out, bias, name=name_ + "_bias_add")
    out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
    return out
def xla_matmul_job(
    a=flow.FixedTensorDef(a_shape, dtype=dtype),
    b=flow.FixedTensorDef(b_shape, dtype=dtype),
):
    out = flow.matmul(a, b, transpose_a=trans_a, transpose_b=trans_b)
    c = flow.get_variable(
        "c",
        shape=out.shape,
        dtype=flow.float,
        initializer=flow.ones_initializer(),
        trainable=True,
    )
    out = flow.math.add_n([out, c])
    return out
def multihead_attn(self, q, k, v):
    """
    q, k, v shape: (batch_size, num_attn_heads, seq_length, head_size)
    """
    assert all(len(x.shape) == 4 for x in (q, k, v))
    assert all(x.shape[0] == self.batch_size for x in (q, k, v))
    assert all(x.shape[1] == self.num_heads for x in (q, k, v))
    assert all(x.shape[2] == self.seq_length for x in (q, k, v))
    assert all(x.shape[3] == self.head_size for x in (q, k, v))

    # q * k: batch_matmul
    # shape sig: (b, n, s, h) x (b, n, h, s)(transposed) -> (b, n, s, s)
    # data parallel sbp sig: S(0) x S(0) -> S(0)
    # 2d sbp sig: [S(0), S(1)] x [S(0), S(1)] -> [S(0), S(1)]
    qmk = flow.matmul(q, k, transpose_b=True, alpha=(1.0 / self.norm_factor))
    qmk = self.tril_softmax_dropout(qmk)

    # w * v: batch_matmul
    # shape sig: (b, n, s, s) x (b, n, s, h) -> (b, n, s, h)
    # data parallel sbp sig: S(0) x S(0) -> S(0)
    # 2d sbp sig: [S(0), S(1)] x [S(0), S(1)] -> [S(0), S(1)]
    return flow.matmul(qmk, v)
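# A hedged NumPy sketch of the same batched attention arithmetic, omitting the
# causal mask and dropout handled by tril_softmax_dropout above. Shapes follow
# the docstring; norm_factor is assumed to be sqrt(head_size). Names are
# illustrative only.
import numpy as np

def multihead_attn_np(q, k, v, norm_factor):
    # (b, n, s, h) x (b, n, h, s) -> (b, n, s, s)
    scores = np.matmul(q, k.transpose(0, 1, 3, 2)) / norm_factor
    # Numerically stable softmax over the last axis.
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    # (b, n, s, s) x (b, n, s, h) -> (b, n, s, h)
    return np.matmul(probs, v)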
def DynamicReshapeJob(x: oft.ListNumpy.Placeholder(data_shape)):
    reshape_out1 = flow.reshape(x, (-1, 20))
    my_model = flow.get_variable(
        "my_model",
        shape=(20, 32),
        dtype=flow.float,
        initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
        trainable=True,
    )
    my_model = flow.cast_to_current_logical_view(my_model)
    mm_out = flow.matmul(reshape_out1, my_model)
    reshape_out2 = flow.reshape(mm_out, (-1, 8, 4))
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
    ).minimize(reshape_out2)
    return reshape_out1
def _FullyConnected(
    input_blob, input_size, units, activation=None, name=None, weight_initializer=None
):
    weight_blob = flow.get_variable(
        name=name + "-weight",
        shape=[input_size, units],
        dtype=input_blob.dtype,
        initializer=weight_initializer,
    )
    bias_blob = flow.get_variable(
        name=name + "-bias",
        shape=[units],
        dtype=input_blob.dtype,
        initializer=flow.constant_initializer(0.0),
    )
    output_blob = flow.matmul(input_blob, weight_blob)
    output_blob = flow.nn.bias_add(output_blob, bias_blob)
    return output_blob
def model() -> tp.Numpy:
    with get_placement():
        x = flow.get_variable(
            name="x",
            shape=(4, 5),
            dtype=flow.float32,
            initializer=flow.random_normal_initializer(mean=10, stddev=1),
        )
        w = flow.get_variable(
            name="w",
            shape=(5, 6),
            dtype=flow.float32,
            initializer=flow.random_normal_initializer(mean=10, stddev=1),
            distribute=flow.distribute.split(0),
        )
        y = flow.matmul(x, w)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.01]), momentum=0.9
        ).minimize(y)
        return y
def matmul_job() -> typing.Tuple[
    flow.typing.Numpy, flow.typing.Numpy, flow.typing.Numpy, flow.typing.Numpy
]:
    a_var = flow.get_variable(
        "a",
        shape=a_shape,
        dtype=flow.float32,
        initializer=flow.random_uniform_initializer(minval=0, maxval=1),
        trainable=True,
    )
    b_var = flow.get_variable(
        "b",
        shape=b_shape,
        dtype=flow.float32,
        initializer=flow.random_uniform_initializer(minval=0, maxval=1),
        trainable=True,
    )
    flow.watch_diff(a_var, test_global_storage.Setter("a_diff"))
    flow.watch_diff(b_var, test_global_storage.Setter("b_diff"))

    if dtype is flow.float16:
        a = flow.amp_white_identity(a_var)
        b = flow.amp_white_identity(b_var)
    else:
        a = a_var
        b = b_var

    c = flow.matmul(a, b, trans_a, trans_b, alpha)

    add_to = flow.get_variable(
        "c",
        shape=c.shape,
        dtype=flow.float32,
        initializer=flow.random_uniform_initializer(minval=-1, maxval=1),
        trainable=True,
    )
    if test_add_to_output:
        flow.watch_diff(add_to, test_global_storage.Setter("add_to_diff"))
        if dtype is flow.float16:
            add_to = flow.amp_white_identity(add_to)
        c = c + add_to

    flow.watch_diff(c, test_global_storage.Setter("c_diff"))
    get_optimizer().minimize(c)
    return (a_var, b_var, add_to, c)
def _AddMaskedLanguageModelLoss(
    input_blob,
    output_weights_blob,
    positions_blob,
    label_id_blob,
    label_weight_blob,
    seq_length,
    hidden_size,
    vocab_size,
    max_predictions_per_seq,
    hidden_act,
    initializer_range,
):
    with flow.scope.namespace("other"):
        sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1])
        ones = sum_label_weight_blob * 0.0 + 1.0
        sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
        batch_size = flow.math.reduce_sum(ones)
        sum_label_weight_blob = sum_label_weight_blob / batch_size
    with flow.scope.namespace("cls-predictions"):
        input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)
        with flow.scope.namespace("transform"):
            if callable(hidden_act):
                act_fn = op_conf_util.kNone
            else:
                act_fn = hidden_act
            input_blob = bert_util._FullyConnected(
                input_blob,
                input_size=hidden_size,
                units=hidden_size,
                activation=act_fn,
                weight_initializer=bert_util.CreateInitializer(initializer_range),
                name="dense",
            )
            if callable(hidden_act):
                input_blob = hidden_act(input_blob)
            input_blob = bert_util._LayerNorm(input_blob, hidden_size)
        output_bias = flow.get_variable(
            name="output_bias",
            shape=[vocab_size],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(1.0),
        )
        logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias)
        label_id_blob = flow.reshape(label_id_blob, [-1])
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_id_blob
        )
        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
        numerator = pre_example_loss * label_weight_blob
        with flow.scope.namespace("loss"):
            numerator = flow.math.reduce_sum(numerator, axis=[-1])
            denominator = sum_label_weight_blob + 1e-5
            loss = numerator / denominator
        return loss, pre_example_loss, logit_blob
def _AttentionLayer(
    from_blob,
    to_blob,
    attention_mask_blob,
    num_attention_heads=1,
    size_per_head=512,
    query_act=op_conf_util.kNone,
    key_act=op_conf_util.kNone,
    value_act=op_conf_util.kNone,
    attention_probs_dropout_prob=0.0,
    initializer_range=0.02,
    do_return_2d_tensor=False,
    batch_size=None,
    from_seq_length=None,
    to_seq_length=None,
):
    def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
        output_blob = flow.reshape(
            input_blob, [-1, seq_length, num_attention_heads, width]
        )
        output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
        return output_blob

    from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
    to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])

    query_blob = _FullyConnected(
        from_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        weight_initializer=CreateInitializer(initializer_range),
    )
    key_blob = _FullyConnected(
        to_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        weight_initializer=CreateInitializer(initializer_range),
    )
    value_blob = _FullyConnected(
        to_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        weight_initializer=CreateInitializer(initializer_range),
    )

    query_blob = TransposeForScores(
        query_blob, num_attention_heads, from_seq_length, size_per_head
    )
    key_blob = TransposeForScores(
        key_blob, num_attention_heads, to_seq_length, size_per_head
    )

    attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
    attention_scores_blob = attention_scores_blob * (
        1.0 / math.sqrt(float(size_per_head))
    )
    attention_mask_blob = flow.reshape(
        attention_mask_blob, [-1, 1, from_seq_length, to_seq_length]
    )
    attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
    addr_blob = (attention_mask_blob - 1.0) * 10000.0
    attention_scores_blob = attention_scores_blob + addr_blob
    attention_probs_blob = flow.nn.softmax(attention_scores_blob)
    attention_probs_blob = _Dropout(attention_probs_blob, attention_probs_dropout_prob)

    value_blob = flow.reshape(
        value_blob, [-1, to_seq_length, num_attention_heads, size_per_head]
    )
    value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
    context_blob = flow.matmul(attention_probs_blob, value_blob)
    context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])

    if do_return_2d_tensor:
        context_blob = flow.reshape(
            context_blob, [-1, num_attention_heads * size_per_head]
        )
    else:
        context_blob = flow.reshape(
            context_blob, [-1, from_seq_length, num_attention_heads * size_per_head]
        )
    return context_blob
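# A small NumPy illustration of the additive mask used above: positions where
# the attention mask is 0 receive a -10000 offset, which drives their softmax
# probability toward zero, while mask values of 1 leave the scores unchanged.
# The toy scores below are illustrative only.
import numpy as np

mask = np.array([[1.0, 1.0, 0.0]], dtype=np.float32)
addr = (mask - 1.0) * 10000.0  # [[0., 0., -10000.]]
scores = np.array([[2.0, 1.0, 3.0]], dtype=np.float32) + addr
probs = np.exp(scores - scores.max()) / np.exp(scores - scores.max()).sum()
# probs[0, 2] is effectively zero after masking.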
def trt_matmul_job(
    a=flow.FixedTensorDef(a_shape, dtype=dtype),
    b=flow.FixedTensorDef(b_shape, dtype=dtype),
):
    return flow.matmul(a, b, transpose_a=trans_a, transpose_b=trans_b)