def _test_gather_random_array(test_case, device):
    input = np.random.randn(3, 4, 3, 5)
    index = np.random.choice(np.arange(3), size=180, replace=True).reshape((3, 4, 3, 5))
    np_out = np.take_along_axis(input, index, 1)
    output = flow.gather(
        flow.tensor(input, dtype=flow.float32, device=flow.device(device)),
        1,
        flow.tensor(index, dtype=flow.int, device=flow.device(device)),
    )
    test_case.assertTrue(np.allclose(output.numpy(), np_out))
    np_out2 = np.take_along_axis(input, index, 2)
    output2 = flow.gather(
        flow.tensor(input, dtype=flow.float32, device=flow.device(device)),
        2,
        flow.tensor(index, dtype=flow.int, device=flow.device(device)),
    )
    test_case.assertTrue(np.allclose(output2.numpy(), np_out2))
    np_out3 = np.take_along_axis(input, index, 3)
    output3 = flow.gather(
        flow.tensor(input, dtype=flow.float32, device=flow.device(device)),
        3,
        flow.tensor(index, dtype=flow.int, device=flow.device(device)),
    )
    test_case.assertTrue(np.allclose(output3.numpy(), np_out3))
def _model(dense_fields, wide_sparse_fields, deep_sparse_fields):
    wide_sparse_fields = flow.parallel_cast(
        wide_sparse_fields, distribute=flow.distribute.broadcast()
    )
    wide_embedding_table = flow.get_variable(
        name='wide_embedding',
        shape=(FLAGS.wide_vocab_size, 1),
        initializer=flow.random_uniform_initializer(minval=-0.05, maxval=0.05),
        distribute=flow.distribute.split(0),
    )
    wide_embedding = flow.gather(params=wide_embedding_table, indices=wide_sparse_fields)
    wide_embedding = flow.reshape(
        wide_embedding, shape=(-1, wide_embedding.shape[-1] * wide_embedding.shape[-2])
    )
    wide_scores = flow.math.reduce_sum(wide_embedding, axis=[1], keepdims=True)
    wide_scores = flow.parallel_cast(
        wide_scores,
        distribute=flow.distribute.split(0),
        gradient_distribute=flow.distribute.broadcast(),
    )

    deep_sparse_fields = flow.parallel_cast(
        deep_sparse_fields, distribute=flow.distribute.broadcast()
    )
    deep_embedding_table = flow.get_variable(
        name='deep_embedding',
        shape=(FLAGS.deep_vocab_size, FLAGS.deep_embedding_vec_size),
        initializer=flow.random_uniform_initializer(minval=-0.05, maxval=0.05),
        distribute=flow.distribute.split(1),
    )
    deep_embedding = flow.gather(params=deep_embedding_table, indices=deep_sparse_fields)
    deep_embedding = flow.parallel_cast(
        deep_embedding,
        distribute=flow.distribute.split(0),
        gradient_distribute=flow.distribute.split(2),
    )
    deep_embedding = flow.reshape(
        deep_embedding, shape=(-1, deep_embedding.shape[-1] * deep_embedding.shape[-2])
    )
    deep_features = flow.concat([deep_embedding, dense_fields], axis=1)
    for idx, units in enumerate(DEEP_HIDDEN_UNITS):
        deep_features = flow.layers.dense(
            deep_features,
            units=units,
            kernel_initializer=flow.glorot_uniform_initializer(),
            bias_initializer=flow.constant_initializer(0.0),
            activation=flow.math.relu,
            name='fc' + str(idx + 1),
        )
        deep_features = flow.nn.dropout(deep_features, rate=FLAGS.deep_dropout_rate)
    deep_scores = flow.layers.dense(
        deep_features,
        units=1,
        kernel_initializer=flow.glorot_uniform_initializer(),
        bias_initializer=flow.constant_initializer(0.0),
        name='fc' + str(len(DEEP_HIDDEN_UNITS) + 1),
    )

    scores = wide_scores + deep_scores
    return scores
def sort_op(input, dim: int = -1, descending: bool = False):
    num_dims = len(input.shape)
    dim = dim if dim >= 0 else dim + num_dims
    direction = "DESCENDING" if descending else "ASCENDING"
    assert 0 <= dim < num_dims, "dim out of range"
    if dim == num_dims - 1:
        indices = flow._C.arg_sort(input, direction)
        return (flow.gather(input, dim, indices), indices)
    else:
        perm = get_perm_when_transpose_axis_to_last_dim(num_dims, dim)
        x = flow._C.transpose(input, perm=perm)
        indices = flow._C.arg_sort(x, direction)
        indices = flow._C.transpose(indices, perm=get_inversed_perm(perm))
        return (flow.gather(input, dim, indices), indices)
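# Usage sketch (added for illustration; not part of the original source). It
# assumes `import oneflow as flow` at module level and that the transpose
# helpers used by sort_op above are importable.
def _sort_op_example():
    x = flow.tensor([[3.0, 1.0, 2.0], [0.0, 5.0, 4.0]])
    # Ascending sort along the last dimension (fast path, no transpose needed).
    values, indices = sort_op(x, dim=1)
    # Descending sort along dim 0 exercises the transpose branch.
    values0, indices0 = sort_op(x, dim=0, descending=True)
    return values, indices, values0, indices0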
def positional_encoding(position, d_model, name="positional_encoding"):
    """
    Do positional encoding
    :param position: The position
    :param d_model: The hidden dimension in the model
    :return: shape like (1, position, d_model)
    """
    with flow.scope.namespace(name):
        # shape = (position, 1)
        input_pos = flow.expand_dims(
            flow.range(position, dtype=flow.float32, name="pos"), axis=1
        )
        # shape = (1, d_model)
        input_d_model = flow.expand_dims(
            flow.range(d_model, dtype=flow.float32, name="d_model"), axis=0
        )
        angle_rads = get_angles(input_pos, input_d_model, d_model)
        # Get an even range like (0, 2, 4, 6, ..., d_model)
        even_range = flow.range(0, d_model, 2, dtype=flow.int32, name="even_range")
        # Apply sin at the even indexes
        even_out = flow.math.sin(flow.gather(angle_rads, even_range, axis=1))
        # Get an odd range like (1, 3, 5, 7, ..., d_model)
        odd_range = flow.range(1, d_model, 2, dtype=flow.int32, name="odd_range")
        # Apply cos at the odd indexes
        odd_out = flow.math.cos(flow.gather(angle_rads, odd_range, axis=1))
        # Initialize the positional-encoding constant
        position_encode = flow.constant(
            0, dtype=flow.float32, shape=(d_model, position), name="pos_ende"
        )
        # Because scatter only supports row indexes, we need to transpose
        even_out = flow.tensor_scatter_nd_update(
            position_encode,
            flow.expand_dims(even_range, axis=1),
            flow.transpose(even_out, perm=[1, 0]),
        )
        odd_out = flow.tensor_scatter_nd_update(
            position_encode,
            flow.expand_dims(odd_range, axis=1),
            flow.transpose(odd_out, perm=[1, 0]),
        )
        # Add the even-index values and the odd-index values
        out = even_out + odd_out
        # We transposed for even_out and odd_out, so transpose back
        out = flow.transpose(out, perm=[1, 0])
        # Expand dims at dim=0 to get shape (1, position, d_model)
        out = flow.expand_dims(out, axis=0)
        return out
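# For reference (added; the standard transformer formulation that get_angles is
# assumed to implement, per "Attention Is All You Need"):
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))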
def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size):
    output = flow.gather(
        params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2
    )
    output = flow.reshape(output, [-1, hidden_size])
    return output
def test_gather_dim_value_runtime_error(test_case):
    with test_case.assertRaises(Exception) as context:
        x1 = flow.ones((2, 2), dtype=flow.float32, requires_grad=True)
        x2 = flow.ones((2, 2), dtype=flow.int64)
        y = flow.gather(x1, 2, x2)
    test_case.assertTrue("Dimension out of range" in str(context.exception))
def test_gather_size_runtime_error(test_case):
    with test_case.assertRaises(Exception) as context:
        x1 = flow.ones((2, 2), dtype=flow.float32, requires_grad=True)
        x2 = flow.ones((4, 2), dtype=flow.int64)
        y = flow.gather(x1, 1, x2)
    test_case.assertTrue("Size does not match at dimension" in str(context.exception))
def get_masked_lm_loss(
    logit_blob,
    masked_lm_positions,
    masked_lm_labels,
    label_weights,
    max_prediction_per_seq,
):
    # gather valid position indices
    logit_blob = flow.gather(
        logit_blob,
        index=masked_lm_positions.unsqueeze(2).repeat(1, 1, args.vocab_size),
        dim=1,
    )
    logit_blob = flow.reshape(logit_blob, [-1, args.vocab_size])
    label_id_blob = flow.reshape(masked_lm_labels, [-1])

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    pre_example_loss = mlm_criterion(logit_blob, label_id_blob)
    pre_example_loss = flow.reshape(pre_example_loss, [-1, max_prediction_per_seq])
    numerator = flow.sum(pre_example_loss * label_weights)
    denominator = flow.sum(label_weights) + 1e-5
    loss = numerator / denominator
    return loss
def get_masked_lm_loss(
    logit_blob,
    masked_lm_positions,
    masked_lm_labels,
    label_weights,
    max_prediction_per_seq=20,
):
    # gather valid position indices
    logit_blob = flow.gather(
        logit_blob,
        index=masked_lm_positions.unsqueeze(2).repeat(1, 1, 30522),
        dim=1,
    )
    logit_blob = flow.reshape(logit_blob, [-1, 30522])
    label_id_blob = flow.reshape(masked_lm_labels, [-1])

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    pre_example_loss = nn.CrossEntropyLoss(reduction="none")(logit_blob, label_id_blob)
    pre_example_loss = flow.reshape(pre_example_loss, [-1, max_prediction_per_seq])
    sum_label_weight = flow.sum(label_weights, dim=-1)
    sum_label_weight = sum_label_weight / label_weights.shape[0]
    numerator = flow.sum(pre_example_loss * label_weights)
    denominator = flow.sum(label_weights) + 1e-5
    loss = numerator / denominator
    return logit_blob, loss
def shuffle(
    value: remote_blob_util.BlobDef,
    seed: Optional[int] = None,
    name: Optional[str] = None,
) -> remote_blob_util.BlobDef:
    return flow.gather(value, generate_random_batch_permutation_indices(value, seed))
def test_gather_index_type_runtime_error(test_case):
    with test_case.assertRaises(Exception) as context:
        x1 = flow.ones((2, 2), dtype=flow.float32, requires_grad=True)
        x2 = flow.ones((2, 2), dtype=flow.float32)
        y = flow.gather(x1, 1, x2)
    test_case.assertTrue(
        "gather(): Expected dtype int32 or int64 for index" in str(context.exception)
    )
def gather_model_parallel_fw_job(
    params: oft.Numpy.Placeholder(params_shape, dtype=flow.float),
    indices: oft.Numpy.Placeholder(indices_shape, dtype=flow.int32),
):
    with flow.scope.placement(device_type, "0:0-3"):
        params = params.with_distribute(flow.distribute.split(split_axis))
        indices = indices.with_distribute(flow.distribute.broadcast())
        return flow.gather(params=params, indices=indices, axis=axis)
def _test_gather_input_0dim_tensor(test_case, device):
    input = flow.tensor(1.0).to(device)
    input.requires_grad = True
    index = flow.tensor([0]).to(device)
    output = flow.gather(input, 0, index)
    test_case.assertTrue(np.array_equal(output.numpy(), [1.0]))
    output.sum().backward()
    test_case.assertTrue(np.array_equal(input.grad.numpy(), 1.0))
def test_gather_dim_equal_runtime_error(test_case):
    with test_case.assertRaises(Exception) as context:
        x1 = flow.ones((2, 2), dtype=flow.float32, requires_grad=True)
        x2 = flow.ones((2, 2, 2), dtype=flow.int64)
        y = flow.gather(x1, 1, x2)
    test_case.assertTrue(
        "Index tensor must have the same number of dimensions as input tensor"
        in str(context.exception)
    )
def _prob_in_top_k(
    self, clean_values, noisy_values, noise_stddev, noisy_top_values
):
    """Helper function to NoisyTopKGating.

    Computes the probability that a value is in the top k, given different random
    noise. This gives us a way of backpropagating from a loss that balances the
    number of times each expert is in the top k experts per example. In the case
    of no noise, pass in None for noise_stddev, and the result will not be
    differentiable.

    Args:
        clean_values: a `Tensor` of shape [batch, n].
        noisy_values: a `Tensor` of shape [batch, n]. Equal to clean_values plus
            normally distributed noise with standard deviation noise_stddev.
        noise_stddev: a `Tensor` of shape [batch, n], or None.
        noisy_top_values: a `Tensor` of shape [batch, m]. The "values" output of
            tf.top_k(noisy_top_values, m), with m >= k + 1.
    Returns:
        a `Tensor` of shape [batch, n].
    """
    batch = clean_values.size(0)
    m = noisy_top_values.size(1)
    top_values_flat = noisy_top_values.flatten()
    threshold_positions_if_in = (
        flow.arange(batch, device=noisy_values.device) * m + self.k
    )
    threshold_if_in = flow.unsqueeze(
        flow.gather(top_values_flat, 0, threshold_positions_if_in), 1
    )
    is_in = flow.gt(noisy_values, threshold_if_in)
    threshold_positions_if_out = threshold_positions_if_in - 1
    threshold_if_out = flow.unsqueeze(
        flow.gather(top_values_flat, 0, threshold_positions_if_out), 1
    )
    # Is each value currently in the top k?
    prob_if_in = cdf((clean_values - threshold_if_in) / noise_stddev)
    prob_if_out = cdf((clean_values - threshold_if_out) / noise_stddev)
    prob = flow.where(is_in, prob_if_in, prob_if_out)
    return prob
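# Hedged sketch (added; `cdf` is not defined in this snippet). A standard normal
# CDF consistent with its use above can be written via the error function,
#   Phi(x) = 0.5 * (1 + erf(x / sqrt(2))),
# assuming an eager oneflow build that exposes flow.erf.
def _normal_cdf(x):
    import math

    return 0.5 * (1.0 + flow.erf(x / math.sqrt(2.0)))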
def _EmbeddingLookup(
    input_ids_blob,
    vocab_size,
    embedding_size=128,
    initializer_range=0.02,
    word_embedding_name="word_embeddings",
):
    embedding_table = flow.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        dtype=flow.float,
        initializer=CreateInitializer(initializer_range),
    )
    output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
    return output, embedding_table
def _test_gather(test_case, device):
    input = np.array([[1, 2], [3, 4]])
    index = np.array([[0, 0], [1, 0]])
    np_out = np.take_along_axis(input, index, 0)
    output = flow.gather(
        flow.tensor(input, dtype=flow.float32, device=flow.device(device)),
        0,
        flow.tensor(index, dtype=flow.int, device=flow.device(device)),
    )
    test_case.assertTrue(np.array_equal(output.numpy(), np_out))
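# Worked example (added for clarity): with input = [[1, 2], [3, 4]] and
# index = [[0, 0], [1, 0]], gathering along dim 0 selects
# out[i][j] = input[index[i][j]][j], so both np.take_along_axis and
# flow.gather produce [[1, 2], [3, 2]].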
def flip(x, dim):
    xsize = x.size()
    dim = x.dim() + dim if dim < 0 else dim
    x = flow.reshape(x, shape=(-1, *xsize[dim:]))
    x = flow.reshape(x, shape=(x.size(0), x.size(1), -1))
    index = []
    index1 = []
    for i in range(x.size(1) - 1, -1, -1):
        index1.append([i] * x.size(2))
    for i in range(x.size(0)):
        index.append(index1)
    index = flow.Tensor(index).long()
    if x.is_cuda:
        x = flow.gather(x, 1, index.to("cuda"))
    else:
        x = flow.gather(x, 1, index)
    return flow.reshape(x, shape=xsize)
def forward(self, sequence_output, masked_lm_positions):
    # Gather masked outputs
    masked_sequence_output = flow.gather(
        sequence_output,
        index=masked_lm_positions.unsqueeze(2).expand(-1, -1, self.hidden_size),
        dim=1,
    )
    masked_sequence_output = masked_sequence_output.reshape([-1, self.hidden_size])
    masked_sequence_output = self.transform(masked_sequence_output)
    masked_sequence_output = self.decoder(masked_sequence_output)
    return masked_sequence_output
def do_gather(x_blob, i_blob):
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "params",
            shape=params.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )
        x = x + x_blob
        y = flow.gather(x, i_blob, axis=axis, batch_dims=batch_dims)
        flow.losses.add_loss(y)
        flow.watch_diff(x, compare_fn)
        return y
def variable_scope_test_job_2(a=of.FixedTensorDef((2, 5))):
    with of.scope.namespace("job2_scope1"):
        indices = of.get_variable(
            "gather_inds",
            shape=(2,),
            dtype=of.int32,
            initializer=of.constant_initializer(1),
            trainable=False,
        )
        output = of.gather(a, indices, axis=1)
        print("indices op name: ", indices.op_name)
        print("gather op name: ", output.op_name)
    return output
def forward(self, input):
    if self.dim is None:
        self.dim = -1

    num_axes = len(input.shape)
    axis = self.dim if self.dim >= 0 else self.dim + num_axes
    assert 0 <= axis < num_axes, "axis out of range"
    if axis == num_axes - 1:
        if self.largest:
            indices = flow._C.top_k(input, self.k)
        else:
            neg_input = flow.mul(input, -1)
            indices = flow._C.top_k(neg_input, self.k)
        return (flow.gather(input, axis, indices), indices)
    else:
        perm = get_perm_when_transpose_axis_to_last_dim(num_axes, axis)
        x = flow._C.transpose(input, perm=perm)
        if self.largest:
            indices = flow._C.top_k(x, self.k)
        else:
            neg_input = flow.mul(x, -1)
            indices = flow._C.top_k(neg_input, self.k)
        indices = flow._C.transpose(indices, perm=get_inversed_perm(perm))
        return (flow.gather(input, axis, indices), indices)
def _test_gather_backward(test_case, device):
    input = np.array([[1, 2], [3, 4]])
    index = np.array([[0, 0], [1, 0]])
    np_out = np.take_along_axis(input, index, 0)
    np_grad = _scatter_add_numpy(np.ones_like(np_out), 0, index, input.shape)
    of_input = flow.tensor(
        input, dtype=flow.float32, requires_grad=True, device=flow.device(device)
    )
    output = flow.gather(
        of_input,
        0,
        flow.tensor(index, dtype=flow.int64, device=flow.device(device)),
    )
    out_sum = output.sum()
    out_sum.backward()
    test_case.assertTrue(np.array_equal(output.numpy(), np_out))
    test_case.assertTrue(np.array_equal(of_input.grad.numpy(), np_grad))
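# Hedged sketch (added; `_scatter_add_numpy` is not defined in this snippet).
# A minimal NumPy reference that matches its use above -- scatter-adding `src`
# into a zero array of `out_shape` along `dim` at the positions given by
# `index` -- could look like this:
def _scatter_add_numpy(src, dim, index, out_shape):
    out = np.zeros(out_shape)
    for idx in np.ndindex(*index.shape):
        out_idx = list(idx)
        out_idx[dim] = index[idx]
        out[tuple(out_idx)] += src[idx]
    return out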
def do_gather(x_blob, i_blob):
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "params",
            shape=params.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )
        x = x + x_blob
        y = flow.gather(x, i_blob, axis=axis, batch_dims=batch_dims)
        lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-3])
        flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(y)
        flow.watch_diff(x, compare_fn)
        return y
def _EmbeddingPostprocessor(
    input_blob,
    seq_length,
    embedding_size,
    use_token_type=False,
    token_type_ids_blob=None,
    token_type_vocab_size=16,
    token_type_embedding_name="token_type_embeddings",
    use_position_embeddings=True,
    position_embedding_name="position_embeddings",
    initializer_range=0.02,
    max_position_embeddings=512,
    dropout_prob=0.1,
):
    output = input_blob
    if use_token_type:
        assert token_type_ids_blob is not None
        token_type_table = flow.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range),
        )
        token_type_embeddings = flow.gather(
            params=token_type_table, indices=token_type_ids_blob, axis=0
        )
        output = output + token_type_embeddings
    if use_position_embeddings:
        position_table = flow.get_variable(
            name=position_embedding_name,
            shape=[1, max_position_embeddings, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range),
        )
        assert seq_length <= max_position_embeddings
        if seq_length != max_position_embeddings:
            position_table = flow.slice(
                position_table, begin=[None, 0, 0], size=[None, seq_length, -1]
            )
        output = output + position_table
    output = _LayerNorm(output, embedding_size)
    output = _Dropout(output, dropout_prob)
    return output
def _test_batch_gather(test_case, shape, device):
    # for example: shape = (3, 2, 2)
    x = np.random.randn(*shape)
    x_tensor = flow.Tensor(x).to(device)
    x_tensor.requires_grad = True
    batchsize = x.shape[0]
    init_index = np.array(
        [np.random.randint(batchsize) for i in range(batchsize)]
    ).astype(np.int64)

    batch_gather_index = flow.tensor(init_index).to(device)
    batch_gather_out = flow.batch_gather(x_tensor, batch_gather_index)

    x_tensor_gather = flow.Tensor(x).to(device)
    x_tensor_gather.requires_grad = True

    reshaped_shape = [batchsize]  # reshaped_shape = [3]
    for i in range(len(x.shape) - 1):
        reshaped_shape.append(1)  # reshaped_shape = [3] -> [3, 1, 1]

    gather_index = np.reshape(init_index, reshaped_shape)
    gather_index = np.broadcast_to(gather_index, shape).astype(np.int64)  # [3, 1, 1] -> [3, 2, 2]
    gather_index = flow.tensor(gather_index).to(device)
    gather_out = flow.gather(x_tensor_gather, 0, gather_index)

    total_out = batch_gather_out.sum() + gather_out.sum()
    total_out.backward()

    test_case.assertTrue(
        np.allclose(batch_gather_out.numpy(), gather_out.numpy(), atol=1e-4, rtol=1e-4)
    )
    test_case.assertTrue(
        np.allclose(
            x_tensor.grad.numpy(),
            x_tensor_gather.grad.numpy(),
            atol=1e-4,
            rtol=1e-4,
        )
    )
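# Note (added for clarity): the test above checks that flow.batch_gather with a
# per-batch index of shape [batch] is equivalent to flow.gather along dim 0 with
# that index reshaped to [batch, 1, ..., 1] and broadcast to the full input
# shape, both in the forward values and in the gradients.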
def GPT(idx, config, target=None):
    b, t = idx.shape
    assert t <= config.block_size, "Cannot forward, model block size is exhausted."

    # forward the GPT model
    # token_embeddings = flow.layers.dense
    word_embedding = flow.get_variable(
        'word_emb',
        initializer=flow.random_normal_initializer(),
        shape=(config.vocab_size, config.n_embd),
    )
    token_embeddings = flow.gather(word_embedding, idx)

    # positional embedding
    pos_emb = flow.get_variable(
        name='pos_emb',
        shape=(1, config.block_size, config.n_embd),
        dtype=flow.float32,
        initializer=flow.zeros_initializer(),
    )
    # position_embeddings = pos_emb[:, :t, :]
    # each position maps to a (learnable) vector
    position_embeddings = flow.slice(pos_emb, [None, 0, None], [None, t, None])

    x = flow.nn.dropout((token_embeddings + position_embeddings), config.embd_pdrop)

    # Blocks
    for block_id in range(config.n_layer):
        with flow.scope.namespace('Block' + str(block_id)):
            x = Block(x, config)

    x = flow.layers.layer_norm(x, name='output_layernorm')
    logits = flow.layers.dense(
        x,
        config.vocab_size,
        use_bias=False,
        activation=flow.zeros_initializer(),
        name='output_logits',
    )

    loss = None
    if target is not None:
        # TODO
        logits = flow.reshape(logits, [-1, config.vocab_size])
        target = flow.reshape(target, [-1])
        target = flow.one_hot(target, depth=config.vocab_size, dtype=flow.float32)
        loss = flow.nn.softmax_cross_entropy_with_logits(logits, target)

    return logits, loss
def testIndexedSlicesSGD(
    sparse_ids: flow.typing.Numpy.Placeholder(ids_shape, dtype=flow.int32),
) -> flow.typing.Numpy:
    with flow.scope.placement(device_type, "0:0"):
        embedding_table = flow.get_variable(
            name="embeddings",
            shape=model_shape,
            initializer=flow.random_uniform_initializer(minval=0, maxval=100),
        )
        embedding = flow.gather(params=embedding_table * mul_scalar, indices=sparse_ids)
        loss = flow.math.reduce_mean(embedding)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
            momentum=momentum_beta,
        ).minimize(loss)
        return embedding_table
def EmbeddingLayer(
    input_ids_blob,
    vocab_size,
    embedding_size=128,
    initializer_range=0.02,
    word_embedding_name="Embedding_Layer",
):
    """
    Embedding Layer
    :param input_ids_blob: The input ID Blob
    :param vocab_size: The input vocab size
    :param embedding_size: The embedding size
    :param initializer_range: The range of the initializer; uses flow.truncated_normal
    :param word_embedding_name: The name of the embedding variable
    :return: The embedding lookup output.
    """
    embedding_table = flow.get_variable(
        name=word_embedding_name + "_Embed",
        shape=[vocab_size, embedding_size],
        dtype=flow.float32,
        initializer=flow.truncated_normal(initializer_range),
    )
    output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
    return output
def __call__(self, x):
    """
    Get embeddings of x.

    :param x: A flow.int64 Tensor with shape [batch_size, length]
    :return: embeddings: float32 tensor with shape [batch_size, length, embedding_size],
        with the embedding values at padding locations of x (as reported by
        model_utils.get_padding) set to 0.
    """
    with flow.scope.namespace("embedding"):
        embeddings = flow.gather(self.embedding_table, x, axis=0)

        # Scale embedding by the sqrt of the hidden size
        embeddings *= self.hidden_size ** 0.5

        # Create a binary array of size [batch_size, length]
        # where 1 = padding, 0 = not padding
        padding = model_utils.get_padding(x)

        # Set all padding embedding values to 0
        embeddings *= flow.expand_dims(1 - padding, -1)
        return embeddings