def test_contraction(self):
    desc_a = cutensor.create_tensor_descriptor(self.a)
    desc_b = cutensor.create_tensor_descriptor(self.b)
    desc_c = cutensor.create_tensor_descriptor(self.c)
    mode_a = cutensor.create_mode('m', 'k')
    mode_b = cutensor.create_mode('k', 'n')
    mode_c = cutensor.create_mode('m', 'n')
    cutensor.contraction(self.alpha, self.a, desc_a, mode_a,
                         self.b, desc_b, mode_b,
                         self.beta, self.c, desc_c, mode_c)
    cupy.testing.assert_allclose(self.c, self.c_ref,
                                 rtol=self.tol, atol=self.tol)
def reduced_binary_einsum(arr0, sub0, arr1, sub1, sub_others):
    set0 = set(sub0)
    set1 = set(sub1)
    assert len(set0) == len(sub0), 'operand 0 should be reduced: diagonal'
    assert len(set1) == len(sub1), 'operand 1 should be reduced: diagonal'

    if len(sub0) == 0 or len(sub1) == 0:
        return arr0 * arr1, sub0 + sub1

    set_others = set(sub_others)
    shared = set0 & set1
    batch_dims = shared & set_others
    contract_dims = shared - batch_dims

    bs0, cs0, ts0 = _make_transpose_axes(sub0, batch_dims, contract_dims)
    bs1, cs1, ts1 = _make_transpose_axes(sub1, batch_dims, contract_dims)

    sub_b = [sub0[axis] for axis in bs0]
    assert sub_b == [sub1[axis] for axis in bs1]
    sub_l = [sub0[axis] for axis in ts0]
    sub_r = [sub1[axis] for axis in ts1]

    sub_out = sub_b + sub_l + sub_r
    assert set(sub_out) <= set_others, 'operands should be reduced: unary sum'

    if len(contract_dims) == 0:
        # Use element-wise multiply when no contraction is needed
        if len(sub_out) == len(sub_others):
            # to assure final output of einsum is C-contiguous
            sub_out = sub_others
        arr0 = _expand_dims_transpose(arr0, sub0, sub_out)
        arr1 = _expand_dims_transpose(arr1, sub1, sub_out)
        return arr0 * arr1, sub_out

    for accelerator in _accelerator.get_routine_accelerators():
        if accelerator == _accelerator.ACCELERATOR_CUTENSOR:
            if _use_cutensor(arr0.dtype, sub0, arr1.dtype, sub1,
                             batch_dims, contract_dims):
                if len(sub_out) == len(sub_others):
                    # to assure final output of einsum is C-contiguous
                    sub_out = sub_others
                out_shape = _get_out_shape(arr0.shape, sub0,
                                           arr1.shape, sub1, sub_out)
                arr_out = cupy.empty(out_shape, arr0.dtype)
                arr0 = cupy.ascontiguousarray(arr0)
                arr1 = cupy.ascontiguousarray(arr1)
                desc_0 = cutensor.create_tensor_descriptor(arr0)
                desc_1 = cutensor.create_tensor_descriptor(arr1)
                desc_out = cutensor.create_tensor_descriptor(arr_out)
                arr_out = cutensor.contraction(1.0, arr0, desc_0, sub0,
                                               arr1, desc_1, sub1,
                                               0.0, arr_out, desc_out, sub_out)
                return arr_out, sub_out

    tmp0, shapes0 = _flatten_transpose(arr0, [bs0, ts0, cs0])
    tmp1, shapes1 = _flatten_transpose(arr1, [bs1, cs1, ts1])
    shapes_out = shapes0[0] + shapes0[1] + shapes1[2]
    assert shapes0[0] == shapes1[0]
    arr_out = cupy.matmul(tmp0, tmp1).reshape(shapes_out)
    return arr_out, sub_out
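# A hedged illustration (not from the original source) of the batch/contract
# split computed above: for einsum('bij,bjk->bik'), 'b' is shared by both
# operands and still appears in the remaining subscripts, so it is a batch
# dim, while 'j' is shared but absent from the output, so it is a contract
# dim. The shapes below are arbitrary illustrative choices.
import cupy

x = cupy.random.random((4, 2, 3))
y = cupy.random.random((4, 3, 5))
z = cupy.einsum('bij,bjk->bik', x, y)   # batched matrix multiply
assert z.shape == (4, 2, 5)
assert cupy.allclose(z, cupy.matmul(x, y))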
def __imul__(self, rhs: Any) -> "Tensor":
    if isinstance(rhs, (Number, xp.ndarray)):
        self._data *= rhs
    elif isinstance(rhs, Tensor):
        axes = getEinsumRule(self._indices, rhs._indices)
        res_indices = (
            [idx for i, idx in enumerate(self._indices) if i not in axes[0]]
            + [idx for j, idx in enumerate(rhs._indices) if j not in axes[1]])
        if not self.use_cutensor:
            self._data = xp.tensordot(self._data, rhs._data, axes=axes)
        else:
            a = xp.ascontiguousarray(self._data)
            b = xp.ascontiguousarray(rhs._data)
            c = xp.zeros([idx.size for idx in res_indices])
            desc_a = cutensor.create_tensor_descriptor(a)
            desc_b = cutensor.create_tensor_descriptor(b)
            desc_c = cutensor.create_tensor_descriptor(c)
            # Assign a distinct letter to every axis of both operands, then
            # unify the letters of each contracted axis pair so cuTENSOR
            # knows which axes to sum over.
            mode_a = [chr(97 + i) for i in range(self._rank)]
            mode_b = [chr(97 + i)
                      for i in range(self._rank, self._rank + rhs._rank)]
            for i, j in zip(axes[0], axes[1]):
                mode_b[j] = mode_a[i]
            mode_c = (
                [mode_a[i] for i in range(self._rank) if i not in axes[0]]
                + [mode_b[j] for j in range(rhs._rank) if j not in axes[1]])
            mode_a = cutensor.create_mode(*mode_a)
            mode_b = cutensor.create_mode(*mode_b)
            mode_c = cutensor.create_mode(*mode_c)
            cutensor.contraction(1.0, a, desc_a, mode_a,
                                 b, desc_b, mode_b,
                                 0.0, c, desc_c, mode_c)
            self._data = c
        self._indices = res_indices
        self._rank = len(self._indices)
    else:
        msg = f"Unsupported __imul__ with rhs of type {type(rhs)}"
        logger.error(msg)
        raise RuntimeError(msg)
    return self
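# A hedged standalone sketch (not from the original class) of the
# mode-labeling scheme used in __imul__ above, applied directly through
# cupyx.cutensor. The shapes, dtype, and import path are assumptions made
# for illustration only.
import cupy
from cupyx import cutensor

a = cupy.random.random((2, 3, 4))
b = cupy.random.random((4, 5))
axes = ([2], [0])             # contract a's axis 2 with b's axis 0
c = cupy.zeros((2, 3, 5))

mode_a = ['a', 'b', 'c']      # distinct labels for a's axes
mode_b = ['d', 'e']           # distinct labels for b's axes
for i, j in zip(axes[0], axes[1]):
    mode_b[j] = mode_a[i]     # unify contracted labels: 'd' becomes 'c'
mode_c = ['a', 'b', 'e']      # uncontracted labels, output order

cutensor.contraction(
    1.0,
    a, cutensor.create_tensor_descriptor(a), cutensor.create_mode(*mode_a),
    b, cutensor.create_tensor_descriptor(b), cutensor.create_mode(*mode_b),
    0.0,
    c, cutensor.create_tensor_descriptor(c), cutensor.create_mode(*mode_c))

assert cupy.allclose(c, cupy.tensordot(a, b, axes=axes))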
def reduced_binary_einsum(arr0, sub0, arr1, sub1, sub_others):
    set0 = set(sub0)
    set1 = set(sub1)
    assert len(set0) == len(sub0), 'operand 0 should be reduced: diagonal'
    assert len(set1) == len(sub1), 'operand 1 should be reduced: diagonal'

    if len(sub0) == 0 or len(sub1) == 0:
        return arr0 * arr1, sub0 + sub1

    set_others = set(sub_others)
    shared = set0 & set1
    batch_dims = shared & set_others
    contract_dims = shared - batch_dims

    bs0, cs0, ts0 = _make_transpose_axes(sub0, batch_dims, contract_dims)
    bs1, cs1, ts1 = _make_transpose_axes(sub1, batch_dims, contract_dims)

    sub_b = [sub0[axis] for axis in bs0]
    assert sub_b == [sub1[axis] for axis in bs1]
    sub_l = [sub0[axis] for axis in ts0]
    sub_r = [sub1[axis] for axis in ts1]

    sub_out = sub_b + sub_l + sub_r
    assert set(sub_out) <= set_others, 'operands should be reduced: unary sum'

    if _use_cutensor(arr0.dtype, sub0, arr1.dtype, sub1,
                     batch_dims, contract_dims):
        if len(sub_out) == len(sub_others):
            sub_out = sub_others
        out_shape = _get_out_shape(arr0.shape, sub0, arr1.shape, sub1, sub_out)
        arr_out = cupy.empty(out_shape, arr0.dtype)
        arr0 = cupy.ascontiguousarray(arr0)
        arr1 = cupy.ascontiguousarray(arr1)
        desc_0 = cutensor.create_tensor_descriptor(arr0)
        desc_1 = cutensor.create_tensor_descriptor(arr1)
        desc_out = cutensor.create_tensor_descriptor(arr_out)
        arr_out = cutensor.contraction(1.0, arr0, desc_0, sub0,
                                       arr1, desc_1, sub1,
                                       0.0, arr_out, desc_out, sub_out)
        return arr_out, sub_out

    tmp0, shapes0 = _flatten_transpose(arr0, [bs0, ts0, cs0])
    tmp1, shapes1 = _flatten_transpose(arr1, [bs1, cs1, ts1])
    shapes_out = shapes0[0] + shapes0[1] + shapes1[2]
    assert shapes0[0] == shapes1[0]
    arr_out = cupy.matmul(tmp0, tmp1).reshape(shapes_out)
    return arr_out, sub_out
def test_contraction(self):
    desc_a = cutensor.create_tensor_descriptor(self.a)
    desc_b = cutensor.create_tensor_descriptor(self.b)
    desc_c = cutensor.create_tensor_descriptor(self.c)
    d = cutensor.contraction(self.alpha,
                             self.a, desc_a, self.mode_a,
                             self.b, desc_b, self.mode_b,
                             self.beta,
                             self.c, desc_c, self.mode_c)
    # contraction writes into and returns its output operand,
    # so d is the same array object as self.c.
    assert self.c is d
    testing.assert_allclose(
        self.alpha * self.a_transposed * self.b_transposed
        + self.beta * self.c_transposed,
        d,
        rtol=self.tol, atol=self.tol)
def test_contraction(self):
    compute_capability = int(device.get_compute_capability())
    if compute_capability < 70 and self.dtype == numpy.float16:
        self.skipTest('Not supported.')
    desc_a = cutensor.create_tensor_descriptor(self.a)
    desc_b = cutensor.create_tensor_descriptor(self.b)
    desc_c = cutensor.create_tensor_descriptor(self.c)
    d = cutensor.contraction(self.alpha,
                             self.a, desc_a, self.mode_a,
                             self.b, desc_b, self.mode_b,
                             self.beta,
                             self.c, desc_c, self.mode_c)
    assert self.c is d
    testing.assert_allclose(
        self.alpha * self.a_transposed * self.b_transposed
        + self.beta * self.c_transposed,
        d,
        rtol=self.tol, atol=self.tol)
def my_matmul(a, b):
    # desc_a/desc_b/desc_c, mode_a/mode_b/mode_c, and the output array c
    # are captured from the enclosing scope (see the setup sketch below).
    return cutensor.contraction(1.0, a, desc_a, mode_a,
                                b, desc_b, mode_b,
                                0.0, c, desc_c, mode_c)
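# A hedged sketch (not from the original source) of the enclosing-scope
# setup that my_matmul above depends on. The shapes and the cupyx.cutensor
# import path are assumptions made for illustration only.
import cupy
from cupyx import cutensor

m, k, n = 128, 128, 128
a = cupy.random.random((m, k))
b = cupy.random.random((k, n))
c = cupy.zeros((m, n))

desc_a = cutensor.create_tensor_descriptor(a)
desc_b = cutensor.create_tensor_descriptor(b)
desc_c = cutensor.create_tensor_descriptor(c)
mode_a = cutensor.create_mode('m', 'k')
mode_b = cutensor.create_mode('k', 'n')
mode_c = cutensor.create_mode('m', 'n')

out = my_matmul(a, b)  # computes a @ b into c (beta = 0 discards old c)
assert cupy.allclose(out, a @ b)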
a = cupy.random.random([extent[i] for i in mode_a])
b = cupy.random.random([extent[i] for i in mode_b])
c = cupy.random.random([extent[i] for i in mode_c])
a = a.astype(dtype)
b = b.astype(dtype)
c = c.astype(dtype)

desc_a = cutensor.create_tensor_descriptor(a)
desc_b = cutensor.create_tensor_descriptor(b)
desc_c = cutensor.create_tensor_descriptor(c)

alpha = 1.1
beta = 1.0

# rehearsal
c = cutensor.contraction(alpha, a, desc_a, mode_a,
                         b, desc_b, mode_b,
                         beta, c, desc_c, mode_c)

ev_start = stream.Event()
ev_end = stream.Event()
st = stream.Stream()
with st:
    # measurement
    ev_start.record()
    c = cutensor.contraction(alpha, a, desc_a, mode_a,
                             b, desc_b, mode_b,
                             beta, c, desc_c, mode_c)
    ev_end.record()
st.synchronize()

elapsed_ms = stream.get_elapsed_time(ev_start, ev_end)
total_flops = 2 * numpy.prod(numpy.array(list(extent.values())))
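# Follow-up sketch (not in the original): convert the measured time into a
# throughput figure. total_flops counts one multiply and one add per element
# of the full index space defined by `extent`.
elapsed_sec = elapsed_ms * 1e-3
print('dtype: {}'.format(numpy.dtype(dtype).name))
print('time (ms): {:.3f}'.format(elapsed_ms))
print('GFLOPS: {:.2f}'.format(total_flops / elapsed_sec / 1e9))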