def positional_encoding(position, d_model, name="positional_encoding"):
    """
    Build the positional encoding: sin on even feature indexes, cos on odd ones

    :param position: The position
    :param d_model: The hidden dimension in model
    :param name: The namespace the ops are created under
    :return: shape like (1, position, d_model)
    """
    with flow.scope.namespace(name):
        # Positions laid out as a column, shape = (position, 1)
        pos_column = flow.expand_dims(
            flow.range(position, dtype=flow.float32, name="pos"), axis=1
        )
        # Feature indexes laid out as a row, shape = (1, d_model)
        dim_row = flow.expand_dims(
            flow.range(d_model, dtype=flow.float32, name="d_model"), axis=0
        )
        angles = get_angles(pos_column, dim_row, d_model)

        # Even feature indexes (0, 2, 4, ...) are passed through sin
        even_idx = flow.range(0, d_model, 2, dtype=flow.int32, name="even_range")
        even_cols = flow.gather(angles, even_idx, axis=1)
        sin_part = flow.math.sin(even_cols)

        # Odd feature indexes (1, 3, 5, ...) are passed through cos
        odd_idx = flow.range(1, d_model, 2, dtype=flow.int32, name="odd_range")
        odd_cols = flow.gather(angles, odd_idx, axis=1)
        cos_part = flow.math.cos(odd_cols)

        # Zero-filled target, kept transposed as (d_model, position) because
        # the scatter only supports row indexes
        zeros_t = flow.constant(
            0, dtype=flow.float32, shape=(d_model, position), name="pos_ende"
        )

        # Write the sin values into the even rows of the transposed target
        even_rows = flow.expand_dims(even_idx, axis=1)
        sin_t = flow.transpose(sin_part, perm=[1, 0])
        sin_scattered = flow.tensor_scatter_nd_update(zeros_t, even_rows, sin_t)

        # Write the cos values into the odd rows of the transposed target
        odd_rows = flow.expand_dims(odd_idx, axis=1)
        cos_t = flow.transpose(cos_part, perm=[1, 0])
        cos_scattered = flow.tensor_scatter_nd_update(zeros_t, odd_rows, cos_t)

        # Each scattered tensor is zero where the other one has values,
        # so the sum merges both halves
        merged_t = sin_scattered + cos_scattered
        # Undo the transpose that was needed for the scatter
        merged = flow.transpose(merged_t, perm=[1, 0])
        # Add a leading dim, we get shape like (1, position, d_model)
        return flow.expand_dims(merged, axis=0)
def scatter_nd_update_grad_fn(
    x_def: oft.Numpy.Placeholder(params.shape, dtype=flow.float),
    indices_def: oft.Numpy.Placeholder(indices.shape, dtype=flow.int32),
    y_def: oft.Numpy.Placeholder(updates.shape, dtype=flow.float),
):
    """Job body that scatters updates into params and watches both gradients.

    Two zero-initialized variables ("params" and "updates") are added to the
    fed placeholders, the sums go through ``flow.tensor_scatter_nd_update``,
    and the result is registered as the loss. ``compare_dz_dx`` and
    ``compare_dz_dy`` are attached to the gradients of the two sums.

    :param x_def: Placeholder fed with the params values
    :param indices_def: Placeholder fed with the int32 scatter indices
    :param y_def: Placeholder fed with the update values
    :return: The scattered tensor
    """
    with flow.scope.placement(device_type, "0:0"):
        params_var = flow.get_variable(
            "params",
            shape=params.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )
        updates_var = flow.get_variable(
            "updates",
            shape=updates.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )
        # The variables start at zero, so each sum carries the fed values
        x = params_var + x_def
        y = updates_var + y_def

        z = flow.tensor_scatter_nd_update(x, indices_def, y)
        flow.losses.add_loss(z)

        flow.watch_diff(x, compare_dz_dx)
        flow.watch_diff(y, compare_dz_dy)
        return z
def tensor_scatter_nd_update_fn(
    params_def: oft.ListNumpy.Placeholder(params.shape, dtype=flow.float),
    indices_def: oft.ListNumpy.Placeholder(indices_static_shape, dtype=flow.int32),
    updates_def: oft.ListNumpy.Placeholder(updates_static_shape, dtype=flow.float),
):
    """Job body that runs ``flow.tensor_scatter_nd_update`` on GPU "0:0".

    :param params_def: Placeholder fed with the tensor to update
    :param indices_def: Placeholder fed with the int32 scatter indices
    :param updates_def: Placeholder fed with the update values
    :return: The scattered tensor
    """
    with flow.scope.placement("gpu", "0:0"):
        scattered = flow.tensor_scatter_nd_update(
            params_def, indices_def, updates_def
        )
    return scattered
def scatter_nd_update_grad_fn(
    x_def: oft.Numpy.Placeholder(params.shape, dtype=flow.float),
    indices_def: oft.Numpy.Placeholder(indices.shape, dtype=flow.int32),
    y_def: oft.Numpy.Placeholder(updates.shape, dtype=flow.float),
):
    """Job body that scatters updates into params and minimizes the result with SGD.

    Two zero-initialized variables ("params" and "updates") are added to the
    fed placeholders and the sums go through ``flow.tensor_scatter_nd_update``.
    The result is minimized by a momentum-free SGD optimizer, and
    ``compare_dz_dx`` / ``compare_dz_dy`` are attached to the gradients of
    the two sums.

    :param x_def: Placeholder fed with the params values
    :param indices_def: Placeholder fed with the int32 scatter indices
    :param y_def: Placeholder fed with the update values
    :return: The scattered tensor
    """
    with flow.scope.placement(device_type, "0:0"):
        params_var = flow.get_variable(
            "params",
            shape=params.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )
        updates_var = flow.get_variable(
            "updates",
            shape=updates.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )
        # The variables start at zero, so each sum carries the fed values
        x = params_var + x_def
        y = updates_var + y_def

        z = flow.tensor_scatter_nd_update(x, indices_def, y)

        # No boundaries and a single value of 1e-3 for the learning-rate schedule
        lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-3])
        sgd = flow.optimizer.SGD(lr_scheduler, momentum=0)
        sgd.minimize(z)

        flow.watch_diff(x, compare_dz_dx)
        flow.watch_diff(y, compare_dz_dy)
        return z
def _test_eager_global_tensor_scatter_nd_update_backward(test_case, placement, sbp):
    """Check forward output and gradients of the eager global scatter update.

    The origin tensor is distributed with ``sbp``; indices and updates are
    broadcast. After summing the output and calling backward, the update
    gradient must be all ones and the origin gradient must be one everywhere
    except at the scattered positions, where it must be zero.

    :param test_case: Test case object providing ``assertTrue``
    :param placement: Placement the global tensors are created on
    :param sbp: Sbp signature used for the origin tensor
    """
    # One broadcast entry per dimension of the placement's rank array
    broadcast_sbp = [flow.sbp.broadcast] * len(placement.ranks.shape)

    origin = random_tensor(1, 16).to_global(placement, sbp)
    origin.retain_grad()
    # NOTE(review): presumably 8 distinct indices drawn from range(16) - confirm
    # against choice_tensor
    indices = choice_tensor(16, (8, 1), replace=False).to_global(
        placement, broadcast_sbp
    )
    update = random_tensor(1, 8).to_global(placement, broadcast_sbp)
    update.retain_grad()

    # Snapshot the inputs as numpy arrays before running the op
    expected_out = origin.oneflow.numpy()
    flat_indices = indices.oneflow.numpy().reshape(8)
    update_values = update.oneflow.numpy()

    # Expected gradients of sum(output) w.r.t. the update and the origin
    expected_update_grad = np.ones(8)
    expected_origin_grad = np.ones(16)
    expected_origin_grad[flat_indices] = np.zeros(8)

    output = flow.tensor_scatter_nd_update(
        origin.oneflow, indices.oneflow, update.oneflow
    )
    output.sum().backward()

    # Reference result: the same scatter done with numpy indexing
    expected_out[flat_indices] = update_values

    test_case.assertTrue(
        np.allclose(output.numpy(), expected_out, rtol=0.0001, atol=0.0001)
    )
    test_case.assertTrue(
        np.allclose(update.oneflow.grad.numpy(), expected_update_grad)
    )
    test_case.assertTrue(
        np.allclose(origin.oneflow.grad.numpy(), expected_origin_grad)
    )
def _test_global_tensor_scatter_nd_update_t(
    test_case, placement, sbp, check_graph=False
):
    """Check the global scatter update on 2-D tensors against numpy indexing.

    :param test_case: Test case object providing ``assertTrue``
    :param placement: Placement the global tensors are created on
    :param sbp: Sbp signature used for the origin tensor
    :param check_graph: When true, run the op through a ``TensorScatterNdUpdate``
        instance instead of calling ``flow.tensor_scatter_nd_update`` directly
    """
    # One broadcast entry per dimension of the placement's rank array
    broadcast_sbp = [flow.sbp.broadcast] * len(placement.ranks.shape)

    origin = random_tensor(2, 16, 4, requires_grad=False).to_global(placement, sbp)
    # NOTE(review): presumably 8 distinct row indices drawn from range(16) -
    # confirm against choice_tensor
    indices = choice_tensor(16, (8, 1), replace=False).to_global(
        placement, broadcast_sbp
    )
    update = random_tensor(2, 8, 4, requires_grad=False).to_global(
        placement, broadcast_sbp
    )

    # Snapshot the inputs as numpy arrays before running the op
    expected_out = origin.oneflow.numpy()
    row_indices = indices.oneflow.numpy().reshape(8)
    update_rows = update.oneflow.numpy()

    # Pick the implementation under test, then call it the same way either way
    scatter = (
        TensorScatterNdUpdate() if check_graph else flow.tensor_scatter_nd_update
    )
    output = scatter(origin.oneflow, indices.oneflow, update.oneflow)

    # Reference result: the same scatter done with numpy indexing
    expected_out[row_indices] = update_rows
    test_case.assertTrue(
        np.allclose(output.numpy(), expected_out, rtol=0.0001, atol=0.0001)
    )
def _test_tensor_scatter_nd_update_backward(test_case, device):
    """Check forward output and gradients of a 1-D scatter update.

    Positions 1, 6 and 4 of ``arange(8)`` are overwritten. After summing the
    output and calling backward, the update gradient must be all ones and the
    origin gradient must be zero exactly at the overwritten positions.

    :param test_case: Test case object providing ``assertTrue``
    :param device: Device name passed to ``flow.device``
    """
    target_device = flow.device(device)

    origin = flow.tensor(
        np.arange(8), dtype=flow.float, device=target_device, requires_grad=True
    )
    indices = flow.tensor(
        np.array([[1], [6], [4]]), dtype=flow.int, device=target_device
    )
    of_update = flow.tensor(
        np.array([10.2, 5.1, 12.7]),
        requires_grad=True,
        dtype=flow.float,
        device=target_device,
    )

    expected_out = np.array([0.0, 10.2, 2.0, 3.0, 12.7, 5.0, 5.1, 7.0])
    expected_update_grad = np.array([1.0, 1.0, 1.0])
    expected_origin_grad = np.array([1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0])

    output = flow.tensor_scatter_nd_update(origin, indices, of_update)
    output.sum().backward()

    test_case.assertTrue(
        np.allclose(output.numpy(), expected_out, rtol=0.0001, atol=0.0001)
    )
    test_case.assertTrue(np.allclose(of_update.grad.numpy(), expected_update_grad))
    test_case.assertTrue(np.allclose(origin.grad.numpy(), expected_origin_grad))
def test_tensor_scatter_nd_update_runtime_error(test_case):
    """Check that mismatched dtypes between tensor and updates are rejected.

    A float32 tensor is combined with float64 updates; the call must raise and
    the exception text must contain the dtype-mismatch message.

    :param test_case: Test case object providing ``assertRaises``/``assertTrue``
    """
    expected_message = "The dtype of tensor and updates must be same."

    with test_case.assertRaises(Exception) as context:
        x = flow.arange(8, dtype=flow.float32, requires_grad=True)
        indices = flow.tensor([[1], [3], [5]])
        # float64 on purpose: it does not match the float32 dtype of x
        updates = flow.tensor([-1, -2, -3], dtype=flow.float64, requires_grad=True)
        flow.tensor_scatter_nd_update(x, indices, updates)

    raised_text = str(context.exception)
    test_case.assertTrue(expected_message in raised_text)
def _test_tensor_scatter_nd_update(test_case, device):
    """Check the forward result of a 1-D scatter update.

    Positions 1, 6 and 4 of ``arange(8)`` are overwritten with 10.2, 5.1 and
    12.7 respectively.

    :param test_case: Test case object providing ``assertTrue``
    :param device: Device name passed to ``flow.device``
    """
    target_device = flow.device(device)

    origin = flow.tensor(np.arange(8), dtype=flow.float, device=target_device)
    indices = flow.tensor(
        np.array([[1], [6], [4]]), dtype=flow.int, device=target_device
    )
    update = flow.tensor(
        np.array([10.2, 5.1, 12.7]), dtype=flow.float, device=target_device
    )

    expected_out = np.array([0.0, 10.2, 2.0, 3.0, 12.7, 5.0, 5.1, 7.0])
    output = flow.tensor_scatter_nd_update(origin, indices, update)

    test_case.assertTrue(
        np.allclose(output.numpy(), expected_out, rtol=0.0001, atol=0.0001)
    )
def _test_tensor_scatter_nd_update_t(test_case, device):
    """Check the forward result of a row-wise scatter update on a 2-D tensor.

    Rows 0, 4 and 2 of ``arange(15).reshape(5, 3)`` are overwritten with rows
    of ones, twos and threes respectively; rows 1 and 3 stay untouched.

    :param test_case: Test case object providing ``assertTrue``
    :param device: Device name passed to ``flow.device``
    """
    target_device = flow.device(device)

    origin = flow.tensor(
        np.arange(15).reshape(5, 3), dtype=flow.float, device=target_device
    )
    indices = flow.tensor(
        np.array([[0], [4], [2]]), dtype=flow.int, device=target_device
    )
    update = flow.tensor(
        np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]]),
        dtype=flow.float,
        device=target_device,
    )

    expected_out = np.array(
        [
            [1.0, 1.0, 1.0],
            [3.0, 4.0, 5.0],
            [3.0, 3.0, 3.0],
            [9.0, 10.0, 11.0],
            [2.0, 2.0, 2.0],
        ]
    )
    output = flow.tensor_scatter_nd_update(origin, indices, update)

    test_case.assertTrue(
        np.allclose(output.numpy(), expected_out, rtol=0.0001, atol=0.0001)
    )
def build(self, origin, indices, update):
    """Apply ``flow.tensor_scatter_nd_update`` to the given tensors.

    :param origin: Tensor to be updated
    :param indices: Scatter indices
    :param update: Values written at ``indices``
    :return: The result of ``flow.tensor_scatter_nd_update``
    """
    scattered = flow.tensor_scatter_nd_update(origin, indices, update)
    return scattered