def test_constant_pad_nd_memory_format(self, device, dtype):
    # Test memory format is preserved in unambiguous cases
    for mf, ndim in (
            (torch.channels_last, 4),
            (torch.contiguous_format, 4),
            (torch.channels_last_3d, 5),
            (torch.contiguous_format, 5),
    ):
        a = torch.zeros([2] * ndim).to(memory_format=mf)
        res = refs.constant_pad_nd(a, pad=[1] * (2 * ndim))
        self.assertTrue(res.is_contiguous(memory_format=mf))

    # Ambiguous cases

    # is_channels_last_ and is_contiguous_, results in channels_last output
    a = torch.empty_strided((2, 1, 2, 2), stride=(4, 1, 2, 1))
    self.assertTrue(a.is_contiguous(memory_format=torch.channels_last))
    self.assertTrue(a.is_contiguous())
    actual = refs.constant_pad_nd(a, pad=[1] * 8)
    expect = torch.constant_pad_nd(a, pad=[1] * 8)
    self.assertEqual(actual.stride(), expect.stride())
    self.assertTrue(actual.is_contiguous(memory_format=torch.channels_last))

    # is_channels_last_contiguous_ but not is_channels_last_, results in
    # contiguous output
    a = torch.empty_strided((2, 1, 2, 2), stride=(4, 4, 2, 1))
    self.assertTrue(a.is_contiguous(memory_format=torch.channels_last))
    self.assertTrue(a.is_contiguous())
    actual = refs.constant_pad_nd(a, pad=[1] * 8)
    expect = torch.constant_pad_nd(a, pad=[1] * 8)
    self.assertEqual(actual.stride(), expect.stride())
    self.assertTrue(actual.is_contiguous())
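# A minimal standalone sketch (not part of the original tests) illustrating
# the pad-list convention every snippet below relies on: torch.constant_pad_nd
# consumes (left, right) pairs starting from the LAST dimension, so a 4-D
# NCHW tensor is padded as [w_left, w_right, h_top, h_bottom, c_front,
# c_back, n_front, n_back], and trailing dimensions may be omitted.
import torch

x = torch.zeros(2, 3, 4, 5)  # NCHW
y = torch.constant_pad_nd(x, [1, 2], value=7.0)        # pads W only
assert y.shape == (2, 3, 4, 8)
z = torch.constant_pad_nd(x, [0, 0, 1, 1], value=7.0)  # pads H only
assert z.shape == (2, 3, 6, 5)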
def my_paste_mask(mask, bbox, height, width, threshold=0.5, padding=1,
                  contour=True, rectangle=False):
    # type: (Tensor, Tensor, int, int, float, int, bool, bool) -> Tensor
    padded_mask = torch.constant_pad_nd(mask,
                                        (padding, padding, padding, padding))
    scale = 1.0 + 2.0 * float(padding) / float(mask.size(-1))
    center_x = (bbox[2] + bbox[0]) * 0.5
    center_y = (bbox[3] + bbox[1]) * 0.5
    w_2 = (bbox[2] - bbox[0]) * 0.5 * scale
    h_2 = (bbox[3] - bbox[1]) * 0.5 * scale  # should have two scales?
    bbox_scaled = torch.stack(
        [center_x - w_2, center_y - h_2, center_x + w_2, center_y + h_2], 0)

    TO_REMOVE = 1
    w = (bbox_scaled[2] - bbox_scaled[0] + TO_REMOVE).clamp(min=1).long()
    h = (bbox_scaled[3] - bbox_scaled[1] + TO_REMOVE).clamp(min=1).long()

    scaled_mask = torch.ops.maskrcnn_benchmark.upsample_bilinear(
        padded_mask.float(), h, w)

    x0 = bbox_scaled[0].long()
    y0 = bbox_scaled[1].long()
    x = x0.clamp(min=0)
    y = y0.clamp(min=0)
    leftcrop = x - x0
    topcrop = y - y0
    w = torch.min(w - leftcrop, width - x)
    h = torch.min(h - topcrop, height - y)

    # mask = torch.zeros((height, width), dtype=torch.uint8)
    # mask[y:y + h, x:x + w] = (
    #     scaled_mask[topcrop:topcrop + h, leftcrop:leftcrop + w] > threshold)
    mask = torch.constant_pad_nd(
        (scaled_mask[topcrop:topcrop + h, leftcrop:leftcrop + w] > threshold),
        (int(x), int(width - x - w), int(y),
         int(height - y - h)))  # int for the script compiler

    if contour:
        mask = mask.float()
        # poor person's contour finding by comparing to smoothed
        mask = (mask - torch.nn.functional.conv2d(
            mask.unsqueeze(0).unsqueeze(0),
            torch.full((1, 1, 3, 3), 1.0 / 9.0),
            padding=1)[0, 0]).abs() > 0.001
    if rectangle:
        x = torch.arange(width, dtype=torch.long).unsqueeze(0)
        y = torch.arange(height, dtype=torch.long).unsqueeze(1)
        r = bbox.long()
        # work around script not liking bitwise ops
        rectangle_mask = ((((x == r[0]) + (x == r[2])) * (y >= r[1]) *
                           (y <= r[3])) + (((y == r[1]) + (y == r[3])) *
                                           (x >= r[0]) * (x <= r[2])))
        mask = (mask + rectangle_mask).clamp(max=1)
    return mask
def __getitem__(self, idx):
    fn = self.audioFileNames[idx]
    try:
        audio, samplerate = torchaudio.load(
            fn, normalization=False)  # don't normalize audio
    except Exception:
        print("CAUGHT EXCEPTION: couldn't open file " + fn +
              '. Going to return empty tensor')
        samplerate = 22050
        audio = torch.zeros(40000).view(1, -1)
    if samplerate != 22050:
        raise Exception(
            "Input file sample rate is {}, expected 22050".format(samplerate))

    num_elem_wanted = 40000
    if audio.numel() <= num_elem_wanted:
        # pad the input equally on both sides; integer division may leave the
        # total one element short, so pad one extra element on the right
        pad_size = (num_elem_wanted - audio.numel()) // 2
        if 2 * pad_size + audio.numel() == num_elem_wanted:
            audio = torch.constant_pad_nd(audio, pad=(pad_size, pad_size),
                                          value=0)
        else:
            audio = torch.constant_pad_nd(audio, pad=(pad_size, pad_size + 1),
                                          value=0)
    else:
        # slice input in the middle
        start = audio.shape[1] // 2 - num_elem_wanted // 2
        end = start + num_elem_wanted
        audio = audio[:, start:end]
    # pdb.set_trace()
    # assert((audio.shape[0], audio.shape[1]) == (1, num_elem_wanted))
    if self.transform is not None:
        audio = self.transform(audio[0])
        SMALL_CONSTANT = 1e-5
        audio = audio + SMALL_CONSTANT  # to avoid numerical errors
        audio = audio.log2()
    speaker = self.audioFileLabels[idx]
    audio = audio.unsqueeze(0)
    return audio, speaker
def forward(self, x):
    h, w = x.shape[-2:]
    # extra_h = (math.ceil(w / self.stride[1]) - 1) * self.stride[1] \
    #     - w + self.kernel_size[1]
    # extra_v = (math.ceil(h / self.stride[0]) - 1) * self.stride[0] \
    #     - h + self.kernel_size[0]
    old_extra_h = (math.ceil(w / self.stride[1]) -
                   1) * self.stride[1] - w + self.kernel_size[1]
    old_extra_v = (math.ceil(h / self.stride[0]) -
                   1) * self.stride[0] - h + self.kernel_size[0]
    if self.kernel_size[0] == 3:
        if self.stride[0] == 2:
            extra_h, extra_v = 1, 1
        else:
            extra_h, extra_v = 2, 2
    elif self.kernel_size[0] == 1:
        extra_h, extra_v = 0, 0
    elif self.kernel_size[0] == 5:
        if self.stride[0] == 2:
            extra_h, extra_v = 3, 3
        else:
            extra_h, extra_v = 4, 4
    if extra_h != old_extra_h or extra_v != old_extra_v:
        print(w, h, self.stride, self.kernel_size, extra_h, extra_v,
              old_extra_h, old_extra_v)
        exit()

    left = extra_h // 2
    right = extra_h - left
    top = extra_v // 2
    bottom = extra_v - top
    # x = F.pad(x, [left, right, top, bottom])
    x = torch.constant_pad_nd(x, (left, right, top, bottom))
    x = self.pool(x)
    return x
def transform_points_homogeneous(points: InputTensor, matrix: InputTensor,
                                 w: float) -> t.Tensor:
    """Transforms a batch of 3D points with a batch of matrices.

    Args:
        points: The points to transform, float32[d1, ..., dn, num_points, 3]
        matrix: The transformation matrix, float32[d1, ..., dn, 4, 4]
        w: The W value to use. Should be 1 for affine points, 0 for vectors

    Returns:
        The transformed points in homogeneous space,
        float32[d1, ..., dn, num_points, 4]
    """
    points = util.to_tensor(points, dtype=t.float32)
    matrix = util.to_tensor(matrix, dtype=t.float32)
    assert points.shape[-1] == 3
    assert matrix.shape[-2:] == (4, 4)
    assert points.shape[:-2] == matrix.shape[:-2]
    batch_dims = points.shape[:-2]

    # Fold all batch dimensions into a single one
    points = points.reshape([-1] + list(points.shape[-2:]))
    matrix = matrix.reshape([-1] + list(matrix.shape[-2:]))

    points = t.constant_pad_nd(points, [0, 1], value=w)
    result = t.einsum("bnm,bvm->bvn", matrix, points)
    result = result.reshape(batch_dims + result.shape[-2:])
    return result
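# A minimal standalone sketch (not from the original module) of the
# homogeneous-coordinate trick above: appending w as a fourth coordinate via
# constant_pad_nd turns a batch of 3D points into homogeneous points in a
# single call.
import torch as t

points = t.randn(5, 3)                     # 5 points, xyz
matrix = t.eye(4)
matrix[:3, 3] = t.tensor([1.0, 2.0, 3.0])  # pure translation
homo = t.constant_pad_nd(points, [0, 1], value=1.0)  # [5, 4], w = 1
moved = t.einsum("nm,vm->vn", matrix, homo)
assert t.allclose(moved[:, :3], points + t.tensor([1.0, 2.0, 3.0]))
# With value=0.0 the same code transforms direction vectors instead:
# the translation column then has no effect.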
def __getitem__(self, idx):
    fn = self.audio_files[idx]
    # pdb.set_trace()
    audio, samplerate = torchaudio.load(
        fn, normalization=False)  # don't normalize audio
    if samplerate != 16e3:
        raise Exception(
            "Input file sample rate is {}, expected 16000".format(samplerate))

    # keep ~79787 elems so the spectrogram will have length 500,
    # which covers most examples
    num_elem_wanted = 79900
    if audio.numel() <= num_elem_wanted:
        # pad the input equally on both sides; integer division may leave the
        # total one element short, so pad one extra element on the right
        pad_size = (num_elem_wanted - audio.numel()) // 2
        if 2 * pad_size + audio.numel() == num_elem_wanted:
            audio = torch.constant_pad_nd(audio, pad=(pad_size, pad_size),
                                          value=0)
        else:
            audio = torch.constant_pad_nd(audio, pad=(pad_size, pad_size + 1),
                                          value=0)
    else:
        # slice input in the middle
        start = audio.shape[1] // 2 - num_elem_wanted // 2
        end = start + num_elem_wanted
        audio = audio[:, start:end]
    assert (audio.shape[0], audio.shape[1]) == (1, num_elem_wanted)
    if self.transform is not None:
        audio = self.transform(audio[0])
        SMALL_CONSTANT = 1e-5
        audio = audio + SMALL_CONSTANT  # to avoid numerical errors
        audio = audio.log2()
    speaker = self.audio_labels[idx]
    audio = audio.unsqueeze(0)
    return audio, speaker
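# A minimal standalone sketch (hypothetical helper, not from the original
# dataset classes) of the center pad-or-crop pattern shared by both
# __getitem__ implementations above.
import torch

def center_pad_or_crop(audio: torch.Tensor, length: int) -> torch.Tensor:
    """Zero-pads or center-crops a [1, N] waveform to exactly [1, length]."""
    n = audio.shape[1]
    if n <= length:
        left = (length - n) // 2
        right = length - n - left  # absorbs the odd leftover element
        return torch.constant_pad_nd(audio, pad=(left, right), value=0)
    start = n // 2 - length // 2
    return audio[:, start:start + length]

assert center_pad_or_crop(torch.ones(1, 7), 10).shape == (1, 10)
assert center_pad_or_crop(torch.ones(1, 15), 10).shape == (1, 10)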
def state_to_tensor(state):
    rover, rocks, qualities = state
    # 0 for rover position, 1 for rock position, 2 for quality
    tensor = t.zeros((3, RockSample.MAP_SIZE, RockSample.MAP_SIZE + 1))
    tensor[0, rover.y, rover.x] = 1
    for rock, quality in zip(rocks, qualities):
        tensor[1, rock.y, rock.x] = 1
        tensor[2, rock.y, rock.x] = (1 if quality == 1 else -1)
    return t.constant_pad_nd(tensor, (1, 1, 1, 1), value=-1)
def step(self, batch, batch_idx):
    src, tgt = batch
    x, num_usages, _ = src  # BxUxL, B
    tgt, tgt_length = tgt  # TxB, B
    memory = self.encoder(x, num_usages)
    out = self.decoder(memory, tgt, num_usages, tgt_length)
    # [[<s>], [<token>], [</s>], [<pad>]] ->
    # [[<token>], [</s>], [<pad>], [<pad>]]
    tgt_to_loss = torch.constant_pad_nd(
        tgt, (0, 0, 0, 1), self.dm.target_pad_idx)[1:, ...]  # TxB
    out_to_loss = out.transpose(1, 2)  # TxBxV -> TxVxB
    return self.loss(out_to_loss, tgt_to_loss), out, tgt  # 1, TxBxV, TxB
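# A minimal standalone sketch (hypothetical token ids) of the shift-by-one
# above: padding one <pad> row at the end of a TxB target and dropping the
# first row aligns each decoder output position with the NEXT token.
import torch

PAD = 0
tgt = torch.tensor([[1], [5], [2]])  # TxB with <s>=1, token=5, </s>=2
shifted = torch.constant_pad_nd(tgt, (0, 0, 0, 1), PAD)[1:, ...]
assert shifted.tolist() == [[5], [2], [0]]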
def make_bce_and_rank_targets(input_graph: Batch, target_graph: Batch,
                              filename: str, *, num_classes):
    """Binary and rank encoding of unique predicates"""
    unique_predicates = torch.unique(target_graph.predicate_classes,
                                     sorted=False)
    target_graph.predicate_bce = (torch.zeros(
        num_classes, dtype=torch.float).scatter_(dim=0,
                                                 index=unique_predicates,
                                                 value=1.0).view(1, -1))
    target_graph.predicate_rank = torch.constant_pad_nd(
        unique_predicates,
        pad=(0, num_classes - len(unique_predicates)),
        value=-1).view(1, -1)
    return input_graph, target_graph
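# A minimal standalone sketch (hypothetical values) of the rank encoding
# above: right-padding the variable-length list of unique class ids with -1
# produces fixed-width [1, num_classes] targets that batch cleanly.
import torch

num_classes = 6
unique_predicates = torch.tensor([4, 1, 3])
rank = torch.constant_pad_nd(unique_predicates,
                             pad=(0, num_classes - len(unique_predicates)),
                             value=-1).view(1, -1)
assert rank.tolist() == [[4, 1, 3, -1, -1, -1]]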
def render(self, camera_matrix: t.Tensor,
           output_shape: Tuple[int, int]) -> t.Tensor:
    # Resize to output shape, preserving aspect ratio
    image = vF.to_pil_image(self.image.cpu())
    _, sh, sw = self.image.shape
    scale = min(output_shape[0] / sh, output_shape[1] / sw)
    th, tw = round(sh * scale), round(sw * scale)
    th, tw = min(th, output_shape[0]), min(tw, output_shape[1])
    result = vF.to_tensor(vF.resize(image, (th, tw)))  # type: t.Tensor
    result = (result * 255).clamp(0, 255).to(t.uint8).permute([1, 2, 0])
    pad_top = (output_shape[0] - th) // 2
    pad_left = (output_shape[1] - tw) // 2
    # constant_pad_nd pads from the last dimension, and result is [H, W, C]
    # here: channels first, then width, then height.
    result = t.constant_pad_nd(result, [
        0, 0, pad_left, output_shape[1] - pad_left - tw, pad_top,
        output_shape[0] - pad_top - th
    ])
    result = result.contiguous()
    return result
def translate(v: InputTensor) -> t.Tensor:
    """Computes a translation matrix.

    Args:
        v: The translation vector, float32[B1, ..., BK, N].

    Returns:
        The translation matrix, float32[B1, ..., BK, N + 1, N + 1]
    """
    result = util.to_tensor(v, dtype=t.float32)
    assert len(result.shape) >= 1
    dimensions = result.shape[-1]
    result = result[..., None, :].transpose(-1, -2)
    result = t.constant_pad_nd(result, [dimensions, 0, 0, 1])
    id_matrix = t.diag(result.new_ones([dimensions + 1]))
    id_matrix = id_matrix.expand_as(result)
    result = result + id_matrix
    return result
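# A minimal standalone sketch (not from the original module) verifying the
# padding trick above: the column vector is padded into the last column of an
# (N+1)x(N+1) zero matrix, and adding the identity yields a translation
# matrix.
import torch as t

v = t.tensor([2.0, 3.0, 4.0])
n = v.shape[-1]
col = v[..., None, :].transpose(-1, -2)          # [3, 1] column vector
mat = t.constant_pad_nd(col, [n, 0, 0, 1])       # [4, 4], v in last column
mat = mat + t.eye(n + 1)
expected = t.eye(4)
expected[:3, 3] = v
assert t.equal(mat, expected)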
def __init__(self,
             grid: t.Tensor,
             voxel_to_world: t.Tensor,
             palette: t.Tensor = None,
             filter_kernel: int = 1):
    """Initializes the artifact.

    Args:
        grid: float32[num_objects, depth, height, width].
        voxel_to_world: Matrix that converts from voxel to view space,
            float32[4, 4]
        palette: The colors to use for the different meshes,
            float32[max_num_meshes, 3]
        filter_kernel: The size of the smoothing filter kernel to apply
    """
    grid = util.to_tensor(grid, dtype=t.float32)
    assert len(grid.shape) == 4
    voxel_to_world = util.to_tensor(voxel_to_world, t.float32, grid.device)
    assert voxel_to_world.shape == (4, 4)

    if filter_kernel > 1:
        k = filter_kernel
        grid = t.constant_pad_nd(grid,
                                 [(k - 1) // 2, k - 1 - (k - 1) // 2] * 3)
        kernel = grid.new_ones([1, 1, k, k, k], dtype=t.float32) / k**3
        # Smooth each object grid independently by folding num_objects into
        # the batch dimension (a [1, num_objects, ...] view would make conv3d
        # expect num_objects input channels).
        grid = F.conv3d(grid[:, np.newaxis], kernel).squeeze(1)

    (vertices, normals,
     mesh_num_tri) = MarchingCubesArtifact.to_marching_cubes(grid[1:])
    vertices = transformations.transform_mesh(vertices, voxel_to_world, True)
    normals = transformations.transform_mesh(normals, voxel_to_world, False)
    if palette is not None:
        palette = palette[1:]
    self.mesh_artifact = MultiMeshArtifact(vertices=vertices,
                                           normals=normals,
                                           mesh_num_tri=mesh_num_tri,
                                           mesh_colors=palette)
def __init__(self, dataset: t.utils.data.Dataset, global_rank: int,
             global_world_size: int, pad_data: bool):
    super().__init__(dataset)
    if pad_data:
        # Round the dataset size up to a multiple of the world size so every
        # rank receives the same number of samples.
        total_size = (len(dataset) + global_world_size -
                      1) // global_world_size
        total_size *= global_world_size
    else:
        total_size = len(dataset)

    g = t.Generator()
    # Shuffle data among workers in a stable way.
    g.manual_seed(0x1234)
    indices = t.randperm(len(dataset), generator=g)
    indices = t.constant_pad_nd(indices, [0, total_size - indices.shape[0]])
    start = global_rank * total_size // global_world_size
    end = (global_rank + 1) * total_size // global_world_size
    self.indices = indices[start:end]
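# A minimal standalone sketch (hypothetical sizes) of the padding scheme
# above: zero-padding the shuffled index list to a multiple of the world size
# means the padded ranks simply see sample 0 again rather than an uneven
# shard.
import torch as t

dataset_len, world_size = 10, 4
total = ((dataset_len + world_size - 1) // world_size) * world_size  # 12
indices = t.randperm(dataset_len)
indices = t.constant_pad_nd(indices, [0, total - dataset_len])
shards = [indices[r * total // world_size:(r + 1) * total // world_size]
          for r in range(world_size)]
assert all(len(s) == total // world_size for s in shards)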
def to_marching_cubes(
        cls, voxel_grid: t.Tensor) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
    """Converts a voxel grid to a marching cubes mesh.

    Args:
        voxel_grid: The voxel grid, float32[num_objects, depth, height, width]

    Returns:
        vertices: The scene vertex positions, float32[num_triangles, 3, 3]
        normals: The scene vertex normals, float32[num_triangles, 3, 3]
        mesh_num_tri: The number of triangles in each mesh, int32[num_meshes]
    """
    voxel_grid = util.to_tensor(voxel_grid, dtype=t.float32)
    assert len(voxel_grid.shape) == 4

    triangles = []
    normals = []
    mesh_num_tri = []
    for grid in voxel_grid:
        # Pad by one empty voxel on all six sides so surfaces at the volume
        # border are closed.
        grid = t.constant_pad_nd(grid, [1] * 6)
        if (grid > 0.5).sum() == 0:
            triangles.append(t.ones([1, 3, 3]))
            normals.append(t.ones([1, 3, 3]))
            mesh_num_tri.append(1)
            continue
        mc_result = skimage.measure.marching_cubes(grid.cpu().numpy(),
                                                   level=0.5)
        mc_result = [t.as_tensor(v.copy()) for v in mc_result[:3]]
        vbuf, ibuf, nbuf = mc_result
        ibuf = ibuf.to(t.int64)
        assert ibuf.shape[0] > 0
        normals.append(nbuf[ibuf])
        triangles.append(vbuf[ibuf])
        mesh_num_tri.append(ibuf.shape[0])
    device = voxel_grid.device
    triangles = t.cat(triangles, dim=0).flip(-1).to(device)
    normals = t.cat(normals, dim=0).flip(-1).to(device)
    mesh_num_tri = util.to_tensor(mesh_num_tri, t.int32).to(device)
    return triangles, normals, mesh_num_tri
def _resize_fft_input(x: TensorLikeType, dims: Tuple[int, ...],
                      sizes: Tuple[int, ...]) -> TensorLikeType:
    """
    Fixes the shape of x such that x.size(dims[i]) == sizes[i],
    either by zero-padding, or by slicing x starting from 0.
    """
    assert len(dims) == len(sizes)

    must_copy = False
    x_sizes = x.shape
    pad_amount = [0] * len(x_sizes) * 2
    for i in range(len(dims)):
        if sizes[i] == -1:
            continue

        if x_sizes[dims[i]] < sizes[i]:
            must_copy = True
            pad_idx = len(pad_amount) - 2 * dims[i] - 1
            pad_amount[pad_idx] = sizes[i] - x_sizes[dims[i]]

        if x_sizes[dims[i]] > sizes[i]:
            x = x.narrow(dims[i], 0, sizes[i])

    return torch.constant_pad_nd(x, pad_amount) if must_copy else x
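# A minimal standalone sketch (not part of the PyTorch refs) of the index
# arithmetic above: because constant_pad_nd's pad list runs from the last
# dimension backwards, the right-hand pad for dimension d of an n-D tensor
# lives at pad_amount[2 * (n - 1 - d) + 1] == len(pad_amount) - 2 * d - 1.
import torch

x = torch.arange(6.0).reshape(2, 3)
pad_amount = [0] * x.dim() * 2
d, target = 0, 4  # grow dim 0 from 2 to 4
pad_amount[len(pad_amount) - 2 * d - 1] = target - x.shape[d]
y = torch.constant_pad_nd(x, pad_amount)
assert y.shape == (4, 3)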
def forward(self,
            grid2d: t.Tensor,
            voxel_projection_matrix: t.Tensor,
            voxel_sample_location: t.Tensor,
            outside_value: float = 0,
            flip_x=False,
            flip_y=False):
    """The forward pass.

    Args:
        grid2d: The 2D grid, float32[batch_size, num_channels, height, width].
        voxel_projection_matrix: Matrix that projects voxel centers onto the
            screen, float32[batch_size, 4, 4].
        voxel_sample_location: 3D sample location within the voxels,
            float32[batch_size, 3].
        outside_value: Value used to fill the channels for voxels whose
            projected position is outside the 2D grid, float32[]
        flip_x: Whether to flip the 2D grid along the X dimension. This can
            be used to correct for right/left handed 3D coordinate system
            issues.
        flip_y: Whether to flip the 2D grid along the Y dimension. This can
            be used to correct for right/left handed 3D coordinate system
            issues.

    Returns:
        The resulting 3D grid,
        float32[batch_size, num_channels, depth, height, width].
        The content of cell [b, c, z, y, x] in the result will be equal to
        grid2d[b, c, py, px], where (px, py, _) = affine_transform(
        voxel_projection_matrix, (x, y, z, 1)) * (height, width, 1).
        If (b, py, px) lies outside the 2D image, the content of the cell in
        all channels will be equal to outside_value.
    """
    grid2d = util.to_tensor(grid2d, t.float32)
    assert len(grid2d.shape) == 4

    voxel_sample_location = util.to_tensor(voxel_sample_location, t.float32)
    assert voxel_sample_location.shape == (grid2d.shape[0], 3)

    compressed_grid2d = self.compress_channels(grid2d)
    batch_size, channels, height, width = compressed_grid2d.shape

    voxel_projection_matrix = util.to_tensor(voxel_projection_matrix,
                                             t.float32)
    assert voxel_projection_matrix.shape == (batch_size, 4, 4)

    voxel_centers = self.voxel_centers
    grid_depth, grid_height, grid_width, _ = voxel_centers.shape
    # shape: [batch, depth, height, width, 3]
    voxel_centers = (voxel_centers[None].expand(batch_size, grid_depth,
                                                grid_height, grid_width,
                                                3).contiguous())
    voxel_centers = (voxel_centers +
                     voxel_sample_location[:, None, None, None, :])
    # shape: [batch, depth * height * width, 3]
    voxel_centers = voxel_centers.reshape([batch_size, -1, 3])

    # Project the voxel centers onto the screen
    projected_centers = transformations.transform_points_homogeneous(
        voxel_centers, voxel_projection_matrix, w=1)
    projected_centers = projected_centers.reshape(
        [batch_size, grid_depth, grid_height, grid_width, 4])
    camera_depth = projected_centers[..., 2]
    projected_centers = (projected_centers[..., :3] /
                         projected_centers[..., 3:4])

    # XY range in OpenGL camera space is [-1:1, -1:1]. Transform to
    # [0:1, 0:1].
    projected_centers = projected_centers[..., :2] / 2 + 0.5
    if flip_y:
        projected_centers = projected_centers * (1, -1) + (0, 1)
    if flip_x:
        projected_centers = projected_centers * (-1, 1) + (1, 0)

    # projected_centers contains (x, y) coordinates in [0, 1]^2 at this
    # point. Convert to indices into the 2D grid.
    wh = projected_centers.new_tensor([[[[[width, height]]]]],
                                      dtype=t.float32)
    pixel_indices = (projected_centers * wh).to(t.int64)
    xx, yy = pixel_indices.unbind(-1)  # type: t.Tensor
    bb = t.arange(batch_size, dtype=t.int64, device=grid2d.device)
    bb = bb[:, None, None, None]
    bb = bb.expand(batch_size, grid_depth, grid_height, grid_width)

    # Pad the grid to detect voxels which project outside the image plane
    padded_grid2d = t.constant_pad_nd(compressed_grid2d, [1, 1, 1, 1],
                                      value=outside_value)
    xx = (xx + 1).clamp(0, padded_grid2d.shape[-1] - 1)
    yy = (yy + 1).clamp(0, padded_grid2d.shape[-2] - 1)

    # Sample the 2D grid
    result = padded_grid2d[bb, :, yy, xx].permute([0, 4, 1, 2, 3])
    assert result.shape == (batch_size, channels, grid_depth, grid_height,
                            grid_width)

    # Discard voxels behind the camera
    camera_depth = camera_depth[:, None, :, :, :].expand(result.shape)
    result = t.where(camera_depth >= 0, result,
                     t.ones_like(result) * outside_value)
    return result
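# A minimal standalone 2D sketch (hypothetical shapes) of the out-of-bounds
# trick above: pad the image with the sentinel value, shift indices by one,
# and clamp, so every out-of-range index lands in the sentinel border instead
# of needing an explicit validity mask.
import torch as t

image = t.arange(12.0).reshape(1, 1, 3, 4)  # [B, C, H, W]
outside_value = -1.0
padded = t.constant_pad_nd(image, [1, 1, 1, 1], value=outside_value)
xx = t.tensor([-5, 0, 3, 99])  # some indices fall outside [0, 4)
yy = t.tensor([1, 1, 1, 1])
xx = (xx + 1).clamp(0, padded.shape[-1] - 1)
yy = (yy + 1).clamp(0, padded.shape[-2] - 1)
samples = padded[0, 0, yy, xx]
assert samples.tolist() == [-1.0, 4.0, 7.0, -1.0]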
def render_scene(vertex_positions: InputTensor,
                 view_projection_matrix: InputTensor = None,
                 image_size: Tuple[int, int] = (256, 256),
                 normals: InputTensor = None,
                 tex_coords: InputTensor = None,
                 material_ids: InputTensor = None,
                 diffuse_coefficients: InputTensor = None,
                 diffuse_textures: InputTensor = None,
                 diffuse_texture_indices: InputTensor = None,
                 specular_coefficient: InputTensor = None,
                 ambient_coefficients: InputTensor = None,
                 cull_back_facing=True,
                 light_position: InputTensor = None,
                 light_color: InputTensor = (1.0, 1.0, 1.0),
                 ambient_light_color: InputTensor = (0.2, 0.2, 0.2),
                 clear_color: InputTensor = (0, 0, 0, 1),
                 output_type=t.uint8,
                 vertex_shader=None,
                 geometry_shader=None,
                 fragment_shader=None,
                 debug_io_buffer=None,
                 return_rgb=True,
                 cuda_device=None):
    """Renders the given scene.

    Args:
        vertex_positions: The triangle geometry, specified through the
            triangle vertex positions, float32[num_triangles, 3, 3]
        view_projection_matrix: The view projection matrix, float32[4, 4]
        image_size: Desired output image size, (height, width)
        normals: Per-vertex shading normals, float32[num_triangles, 3, 3].
            If set to None, normals will be computed from the vertex
            positions.
        tex_coords: Texture coordinates, float32[num_triangles, 3, 2]. If
            set to None, all texture coordinates will be 0.
        material_ids: Per-triangle material indices used to index into the
            various coefficient tensors below, int32[num_triangles]. If set
            to None, all triangles will have the same default material.
        diffuse_coefficients: The diffuse coefficients, one per material,
            float32[num_materials, 3]. Cannot be None if material_ids is not
            None. Must be None if material_ids is None.
        diffuse_textures: uint8[num_textures, height, width, 3]. Can be None
            if there are no textures used in the mesh.
        diffuse_texture_indices: Diffuse texture indices, one per material,
            int32[num_materials]. If set to None, the texture indices for
            all materials will be -1.
        specular_coefficient: Specular coefficients, one per material,
            float32[num_materials, 4]. The first 3 channels are the R, G,
            and B specular coefficients, the last channel is the specular
            power. If set to None, R, G, and B will be 0 for all materials
            and power will be 2048.
        ambient_coefficients: The ambient coefficients,
            float32[num_materials, 3]. If None, all ambient coefficients
            will be 0.05.
        cull_back_facing: Whether to cull backfacing triangles.
        light_position: float32[3], the light position. If set to None, the
            light will be placed at the camera origin.
        light_color: The light diffuse RGB color, float32[3]
        ambient_light_color: The light ambient RGB color, float32[3]
        clear_color: The RGB color to use when clearing the image, float32[3]
        output_type: The desired output type. Either t.uint8 or t.float32.
        vertex_shader: The vertex shader to use. If empty, uses a default
            shader.
        geometry_shader: The geometry shader. If empty, uses a default
            shader.
        fragment_shader: The fragment shader. If empty, uses a default
            shader.
        debug_io_buffer: Aids debugging of shaders. Shaders can communicate
            with host programs through OpenGL input/output buffers. Any
            tensor passed in this argument will be forwarded to the shaders
            as a buffer with name "debug_io".
        return_rgb: If true, returns a 3 channel image, otherwise returns a
            4 channel image.
        cuda_device: The index of the GPU to use, given as CUDA device

    Returns:
        The rendered image, dt[height, width, c] where dt is either float32
        or uint8 depending on the value of output_type, and c is either 3 or
        4, depending on return_rgb.

        If the debug_io_buffer argument was not None, returns a tuple
        containing the rendered image, and the shader output from the
        "debug_io" buffer. The second element of the tuple has the same
        shape and type as debug_io_buffer.
    """
    height, width = image_size
    vertex_positions = util.to_tensor(vertex_positions, t.float32, "cpu")
    assert (len(vertex_positions.shape) == 3 and
            vertex_positions.shape[1:] == (3, 3))
    num_triangles = vertex_positions.shape[0]

    if view_projection_matrix is None:
        view_projection_matrix = camera_util.get_default_camera_for_mesh(
            vertex_positions)
    view_projection_matrix = util.to_tensor(view_projection_matrix,
                                            t.float32, "cpu")
    assert view_projection_matrix.shape == (4, 4)

    has_normals = True
    if normals is None:
        normals = t.zeros_like(vertex_positions)
        has_normals = False
    normals = util.to_tensor(normals, t.float32, "cpu")
    assert normals.shape == (num_triangles, 3, 3)

    if tex_coords is None:
        tex_coords = t.zeros([num_triangles, 3, 2], dtype=t.float32)
    tex_coords = util.to_tensor(tex_coords, t.float32, "cpu")
    assert tex_coords.shape == (num_triangles, 3, 2)

    if material_ids is None:
        material_ids = t.zeros([num_triangles], dtype=t.int32)
    material_ids = util.to_tensor(material_ids, t.int32, "cpu")
    assert material_ids.shape == (num_triangles, )
    num_used_materials = material_ids.max().cpu().numpy() + 1  # type: int

    def create_coefficient_array(cur_tensor: InputTensor, num_channels,
                                 default_value):
        arr = cur_tensor
        if arr is None:
            arr = (t.ones([num_used_materials, num_channels],
                          dtype=t.float32) * t.tensor(default_value))
        arr = util.to_tensor(arr, t.float32, "cpu")
        assert len(arr.shape) == 2
        arr = arr[:num_used_materials]
        assert arr.shape == (num_used_materials, num_channels)
        return arr

    diffuse_coefficients = create_coefficient_array(diffuse_coefficients, 3,
                                                    0.8)
    ambient_coefficients = create_coefficient_array(ambient_coefficients, 3,
                                                    0.05)
    specular_coefficient = create_coefficient_array(specular_coefficient, 4,
                                                    (0, 0, 0, 2048.0))

    if diffuse_texture_indices is None:
        diffuse_texture_indices = t.ones([num_used_materials],
                                         dtype=t.int32) * -1
    diffuse_texture_indices = util.to_tensor(diffuse_texture_indices,
                                             t.int32, "cpu")
    assert len(diffuse_texture_indices.shape) == 1
    diffuse_texture_indices = diffuse_texture_indices[:num_used_materials]
    assert diffuse_texture_indices.shape == (num_used_materials, )
    num_used_textures = diffuse_texture_indices.max().cpu().numpy() + 1
    num_used_textures = max(num_used_textures, 1)

    if diffuse_textures is None:
        diffuse_textures = t.ones([num_used_textures, 1, 1, 3],
                                  dtype=t.uint8)
    diffuse_textures = util.to_tensor(diffuse_textures, t.uint8, "cpu")
    assert len(diffuse_textures.shape) == 4
    diffuse_textures = diffuse_textures[:num_used_textures]
    assert (diffuse_textures.shape[0] == num_used_textures and
            diffuse_textures.shape[3] == 3)

    camera_position = t.mv(t.inverse(view_projection_matrix),
                           t.tensor([0, 0, -1, 1], dtype=t.float32))
    camera_position = camera_position[:3] / camera_position[3]

    if light_position is None:
        light_position = camera_position
    light_position = util.to_tensor(light_position, t.float32, "cpu")
    assert light_position.shape == (3, )

    light_color = util.to_tensor(light_color, t.float32, "cpu")
    assert light_color.shape == (3, )

    ambient_light_color = util.to_tensor(ambient_light_color, t.float32,
                                         "cpu")
    assert ambient_light_color.shape == (3, )

    ambient_coefficients = t.constant_pad_nd(ambient_coefficients, [0, 1])
    diffuse_coefficients = t.cat([
        diffuse_coefficients,
        diffuse_texture_indices.to(t.float32)[:, np.newaxis]
    ], -1)
    materials = t.cat(
        [ambient_coefficients, diffuse_coefficients, specular_coefficient],
        dim=-1)

    render_args = [
        rasterizer.Uniform("view_projection_matrix", view_projection_matrix),
        rasterizer.Uniform("light_position", light_position),
        rasterizer.Uniform("has_normals", has_normals),
        rasterizer.Uniform("has_texcoords", True),
        rasterizer.Buffer(0, vertex_positions.reshape([-1])),
        rasterizer.Buffer(1, normals.reshape([-1])),
        rasterizer.Buffer(2, tex_coords.reshape([-1])),
        rasterizer.Buffer(3, material_ids.reshape([-1])),
        rasterizer.Buffer(4, materials.reshape([-1])),
        rasterizer.Texture("textures", diffuse_textures, bind_as_array=True),
        rasterizer.Uniform("light_color", light_color),
        rasterizer.Uniform("camera_position", camera_position),
        rasterizer.Uniform("ambient_light_color", ambient_light_color),
        rasterizer.Uniform("cull_backfacing", cull_back_facing),
    ]
    if debug_io_buffer is not None:
        render_args.append(rasterizer.Buffer(5, debug_io_buffer, is_io=True))

    if not geometry_shader:
        geometry_shader = resources.read_text(shaders,
                                              "triangle_renderer.geom")
    if not vertex_shader:
        vertex_shader = resources.read_text(shaders, "noop.vert")
    if not fragment_shader:
        fragment_shader = resources.read_text(
            shaders, "point_light_illumination.frag")

    result = rasterizer.gl_simple_render(rasterizer.RenderInput(
        num_points=num_triangles,
        arguments=render_args,
        output_resolution=(height, width),
        clear_color=clear_color,
        output_type=output_type,
        vertex_shader=vertex_shader,
        geometry_shader=geometry_shader,
        fragment_shader=fragment_shader),
                                         cuda_device=cuda_device)

    c = 3 if return_rgb else 4
    if debug_io_buffer is None:
        return result[..., :c]
    else:
        return result[..., :c], render_args[-1].value
def beam_search(self, encoder_output, max_output_length, init_idx, eos_idx,
                pad_idx=None, topk=10, beam_width=40, batch_size=10):
    """
    :param encoder_output: Mx1xE,
    :param max_output_length: length of generated sequence,
    :param init_idx: index of INIT token,
    :param eos_idx: index of EOS token,
    :param pad_idx: index of PAD token,
    :param topk: number of finished hypotheses to collect,
    :param beam_width: number of expansions per node,
    :param batch_size: number of nodes to expand per decoder call.
    """
    with self.no_tgt_memory_limit():
        # Start with the start of the sentence token
        init_token = torch.full((1, 1), init_idx, dtype=torch.long)

        # Number of sentences to generate
        endnodes = []

        # starting node - previous node, word id, logp, length
        node = BeamSearchNode(init_token, 0)
        nodes = PriorityQueue()

        # start the queue
        nodes.put((-node.eval(), node))
        qsize = 1

        # start beam search
        # give up when decoding takes too long
        while qsize < 10000:
            if batch_size is None:
                # fetch the best node
                score, n = nodes.get()
                qsize -= 1
                decoder_input = n.token_ids

                if (decoder_input.shape[0] >= max_output_length
                        or decoder_input[-1, 0] == eos_idx):
                    endnodes.append((score, n))
                    # if we reached maximum # of sentences required
                    if len(endnodes) >= topk:
                        break
                    else:
                        continue

                # decode for one step using decoder
                decoder_output = self(
                    encoder_output,
                    decoder_input.to(encoder_output.device))[-1:, :, :]
                decoder_output = log_softmax(decoder_output, dim=-1).cpu()

                # PUT HERE REAL BEAM SEARCH OF TOP
                log_prob, indexes = torch.topk(decoder_output, beam_width)
                for i in range(beam_width):
                    new_idx = indexes[:, :, i]
                    log_p = log_prob[0, 0, i].item()
                    node = BeamSearchNode(
                        torch.cat((decoder_input, new_idx)),
                        n.log_p + log_p)
                    score = -node.eval()
                    nodes.put((score, node))
                    qsize += 1
            else:
                assert pad_idx is not None, "Specify pad_idx, please."
                # fetch batch of the best nodes
                decoder_inputs, log_ps = [], []
                while qsize > 0 and len(decoder_inputs) < batch_size:
                    score, n = nodes.get()
                    qsize -= 1
                    token_ids = n.token_ids
                    if (token_ids.shape[0] >= max_output_length
                            or token_ids[-1, 0] == eos_idx):
                        endnodes.append((score, n))
                        if len(endnodes) >= topk:
                            break
                        else:
                            continue
                    decoder_inputs.append(token_ids)
                    log_ps.append(n.log_p)
                if len(endnodes) >= topk:
                    break

                # pad decoder_inputs to a common length (the legacy
                # torch.LongTensor constructor does not accept non-CPU
                # devices, so build the tensor with torch.tensor instead)
                tgt_lengths = torch.tensor(
                    [inp.shape[0] for inp in decoder_inputs],
                    dtype=torch.long, device=encoder_output.device)
                max_length = tgt_lengths.max()
                decoder_input = torch.cat([
                    torch.constant_pad_nd(
                        inp, (0, 0, 0, max_length - inp.shape[0]), pad_idx)
                    for inp in decoder_inputs
                ], dim=1).to(encoder_output.device)

                # decode batch
                decoder_outputs = self(
                    encoder_output.expand(-1, decoder_input.shape[1], -1),
                    decoder_input,
                    tgt_length=tgt_lengths).cpu()
                for i, (decoder_input, log_p, tgt_length) in enumerate(
                        zip(decoder_inputs, log_ps, tgt_lengths)):
                    decoder_output = log_softmax(
                        decoder_outputs[tgt_length - 1, i], dim=-1)
                    new_log_ps, new_idxs = torch.topk(decoder_output,
                                                      beam_width)
                    for new_log_p, new_idx in zip(new_log_ps, new_idxs):
                        node = BeamSearchNode(
                            torch.cat((decoder_input, new_idx.view(1, 1))),
                            log_p + new_log_p.item())
                        score = -node.eval()
                        nodes.put((score, node))
                        qsize += 1
        return sorted(endnodes)
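# A minimal standalone sketch (hypothetical token ids) of the batching step
# above: right-padding each Tx1 hypothesis to the longest length with the PAD
# index, then concatenating along the batch dimension.
import torch

PAD = 0
hyps = [torch.tensor([[1], [7]]), torch.tensor([[1], [4], [9]])]  # Tx1 each
max_len = max(h.shape[0] for h in hyps)
batch = torch.cat([
    torch.constant_pad_nd(h, (0, 0, 0, max_len - h.shape[0]), PAD)
    for h in hyps
], dim=1)  # shape: max_len x num_hyps
assert batch.tolist() == [[1, 1], [7, 4], [0, 9]]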
def inference_nopad(self, inputs):
    """Speech synthesis inference without padding.

    Padded batch-mode synthesis is prone to synthesis errors and unstable
    output quality. Batch-mode synthesis without padding does not work yet,
    so for now this runs the non-batch synthesis path.

    Args:
        inputs:

    Returns:
    """
    text_, style_input_, speaker_ids_, f0s_ = inputs
    f0s_ = [f0s_ for w in range(len(text_))]
    mel_outputs_lst, mel_outputs_postnet_lst = [], []
    gate_outputs_lst, alignments_lst = [], []
    for text, style_input, speaker_ids, f0s in zip(text_, style_input_,
                                                   speaker_ids_, f0s_):
        text = text[:torch.argmin(text) + 1]  # strip the padded zeros
        embedded_inputs = self.embedding(text.unsqueeze(0)).transpose(1, 2)
        embedded_text = self.encoder.inference(embedded_inputs)
        embedded_speakers = self.speaker_embedding(
            speaker_ids.unsqueeze(0))[:, None]
        if hasattr(self, 'gst'):
            if isinstance(style_input, int):
                query = torch.zeros(
                    1, 1,
                    self.gst.encoder.ref_enc_gru_size).to(_device)  # cuda()
                GST = torch.tanh(self.gst.stl.embed)
                key = GST[style_input].unsqueeze(0).expand(1, -1, -1)
                embedded_gst = self.gst.stl.attention(query, key)
            else:
                embedded_gst = self.gst(style_input.unsqueeze(0))

        embedded_speakers = embedded_speakers.repeat(1, embedded_text.size(1),
                                                     1)
        if hasattr(self, 'gst'):
            embedded_gst = embedded_gst.repeat(1, embedded_text.size(1), 1)
            encoder_outputs = torch.cat(
                (embedded_text, embedded_gst, embedded_speakers), dim=2)
        else:
            encoder_outputs = torch.cat((embedded_text, embedded_speakers),
                                        dim=2)

        mel_outputs, gate_outputs, alignments = self.decoder.inference(
            encoder_outputs, f0s)
        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet
        mel_outputs_lst.append(mel_outputs)
        mel_outputs_postnet_lst.append(mel_outputs_postnet)
        gate_outputs_lst.append(gate_outputs)
        alignments_lst.append(alignments)

    maxlen = max([w.shape[2] for w in mel_outputs_postnet_lst])
    # padding with a very negative number corresponds to silence
    mel_outputs_postnet_lst = [
        torch.constant_pad_nd(w, (0, maxlen - w.shape[2]), -16)
        for w in mel_outputs_postnet_lst
    ]
    mel_outputs_postnet = torch.cat(mel_outputs_postnet_lst, dim=0)
    # padding with a very negative number corresponds to silence
    mel_outputs_lst = [
        torch.constant_pad_nd(w, (0, maxlen - w.shape[2]), -16)
        for w in mel_outputs_lst
    ]
    mel_outputs = torch.cat(mel_outputs_lst, dim=0)
    # padding with 1 marks the stop-gate as triggered
    gate_outputs_lst = [
        torch.constant_pad_nd(w, (0, 0, 0, maxlen - w.shape[1]), 1)
        for w in gate_outputs_lst
    ]
    gate_outputs = torch.cat(gate_outputs_lst, dim=0)
    maxlen_text = max([w.shape[0] for w in text_])
    alignments_lst = [
        torch.constant_pad_nd(
            w, (0, maxlen_text - w.shape[2], 0, maxlen - w.shape[1]), 0)
        for w in alignments_lst
    ]
    alignments = torch.cat(alignments_lst, dim=0)
    return self.parse_output(
        [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])