Example #1
    def test_constant_pad_nd_memory_format(self, device, dtype):
        # Test memory format is preserved in unambiguous cases
        for mf, ndim in (
            (torch.channels_last, 4),
            (torch.contiguous_format, 4),
            (torch.channels_last_3d, 5),
            (torch.contiguous_format, 5),
        ):
            a = torch.zeros([2] * ndim).to(memory_format=mf)
            res = refs.constant_pad_nd(a, pad=[1] * (2 * ndim))
            self.assertTrue(res.is_contiguous(memory_format=mf))

        # Ambiguous cases

        # is_channels_last_ and is_contiguous_, results in channels_last output
        a = torch.empty_strided((2, 1, 2, 2), stride=(4, 1, 2, 1))
        self.assertTrue(a.is_contiguous(memory_format=torch.channels_last))
        self.assertTrue(a.is_contiguous())
        actual = refs.constant_pad_nd(a, pad=[1] * 8)
        expect = torch.constant_pad_nd(a, pad=[1] * 8)
        self.assertEqual(actual.stride(), expect.stride())
        self.assertTrue(
            actual.is_contiguous(memory_format=torch.channels_last))

        # is_channels_last_contiguous_ but not is_channels_last_, results in
        # contiguous output
        a = torch.empty_strided((2, 1, 2, 2), stride=(4, 4, 2, 1))
        self.assertTrue(a.is_contiguous(memory_format=torch.channels_last))
        self.assertTrue(a.is_contiguous())
        actual = refs.constant_pad_nd(a, pad=[1] * 8)
        expect = torch.constant_pad_nd(a, pad=[1] * 8)
        self.assertEqual(actual.stride(), expect.stride())
        self.assertTrue(actual.is_contiguous())
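For reference, a minimal standalone sketch (not part of the test above) of the pad convention the rest of these examples rely on: constant_pad_nd consumes the pad list in pairs, starting from the last dimension.

import torch

x = torch.zeros(2, 3)
y = torch.constant_pad_nd(x, pad=[1, 2], value=7.0)       # pads only the last dim
assert y.shape == (2, 6)                                   # 3 + 1 + 2
z = torch.constant_pad_nd(x, pad=[1, 2, 3, 4], value=7.0)  # last dim, then dim 0
assert z.shape == (9, 6)                                   # rows: 2 + 3 + 4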
Example #2
def my_paste_mask(mask,
                  bbox,
                  height,
                  width,
                  threshold=0.5,
                  padding=1,
                  contour=True,
                  rectangle=False):
    # type: (Tensor, Tensor, int, int, float, int, bool, bool) -> Tensor
    padded_mask = torch.constant_pad_nd(mask,
                                        (padding, padding, padding, padding))
    scale = 1.0 + 2.0 * float(padding) / float(mask.size(-1))
    center_x = (bbox[2] + bbox[0]) * 0.5
    center_y = (bbox[3] + bbox[1]) * 0.5
    w_2 = (bbox[2] - bbox[0]) * 0.5 * scale
    h_2 = (bbox[3] - bbox[1]) * 0.5 * scale  # should have two scales?
    bbox_scaled = torch.stack(
        [center_x - w_2, center_y - h_2, center_x + w_2, center_y + h_2], 0)

    TO_REMOVE = 1
    w = (bbox_scaled[2] - bbox_scaled[0] + TO_REMOVE).clamp(min=1).long()
    h = (bbox_scaled[3] - bbox_scaled[1] + TO_REMOVE).clamp(min=1).long()

    scaled_mask = torch.ops.maskrcnn_benchmark.upsample_bilinear(
        padded_mask.float(), h, w)

    x0 = bbox_scaled[0].long()
    y0 = bbox_scaled[1].long()
    x = x0.clamp(min=0)
    y = y0.clamp(min=0)
    leftcrop = x - x0
    topcrop = y - y0
    w = torch.min(w - leftcrop, width - x)
    h = torch.min(h - topcrop, height - y)

    # mask = torch.zeros((height, width), dtype=torch.uint8)
    # mask[y:y + h, x:x + w] = (scaled_mask[topcrop:topcrop + h,  leftcrop:leftcrop + w] > threshold)
    mask = torch.constant_pad_nd(
        (scaled_mask[topcrop:topcrop + h, leftcrop:leftcrop + w] > threshold),
        (int(x), int(width - x - w), int(y),
         int(height - y - h)))  # int for the script compiler

    if contour:
        mask = mask.float()
        # poor person's contour finding by comparing to smoothed
        mask = (mask -
                torch.nn.functional.conv2d(mask.unsqueeze(0).unsqueeze(0),
                                           torch.full((1, 1, 3, 3), 1.0 / 9.0),
                                           padding=1)[0, 0]).abs() > 0.001
    if rectangle:
        x = torch.arange(width, dtype=torch.long).unsqueeze(0)
        y = torch.arange(height, dtype=torch.long).unsqueeze(1)
        r = bbox.long()
        # work around script not liking bitwise ops
        rectangle_mask = ((((x == r[0]) + (x == r[2])) * (y >= r[1]) *
                           (y <= r[3])) + (((y == r[1]) + (y == r[3])) *
                                           (x >= r[0]) * (x <= r[2])))
        mask = (mask + rectangle_mask).clamp(max=1)
    return mask
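The constant_pad_nd call above replaces the commented-out slice assignment; a minimal standalone check of that equivalence for an in-bounds box (hypothetical sizes, plain torch, no maskrcnn_benchmark ops):

import torch

height, width = 8, 10
x, y, w, h = 2, 1, 4, 3
crop = torch.rand(h, w)

expected = torch.zeros(height, width)
expected[y:y + h, x:x + w] = crop

# pad pairs: (left, right) for the last dim, then (top, bottom) for dim 0
padded = torch.constant_pad_nd(crop, (x, width - x - w, y, height - y - h))
assert torch.equal(expected, padded)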
Example #3
    def __getitem__(self, idx):
        fn = self.audioFileNames[idx]

        try:
            audio, samplerate = torchaudio.load(
                fn, normalization=False)  # don't normalize audio
        except Exception:
            print("CAUGHT EXCEPTION: couldn't open file " + fn +
                  '. Going to return an empty tensor')
            samplerate = 22050
            audio = torch.zeros(40000).view(1, -1)
        if (samplerate != 22050):
            raise Exception(
                "Input file sample rate is {}, expected 22050".format(
                    samplerate))

        num_elem_wanted = 40000  # fixed number of audio samples to keep per example

        if audio.numel() <= num_elem_wanted:
            # pad the input on both sides
            pad_size = (num_elem_wanted - audio.numel()) // 2
            # integer division can drop one element, so pad one extra sample on the right when needed
            if 2 * pad_size + audio.numel() == num_elem_wanted:
                audio = torch.constant_pad_nd(audio,
                                              pad=(pad_size, pad_size),
                                              value=0)
            else:
                audio = torch.constant_pad_nd(audio,
                                              pad=(pad_size, pad_size + 1),
                                              value=0)
        else:
            # slice input in the middle
            start = audio.shape[1] // 2 - num_elem_wanted // 2
            end = start + num_elem_wanted
            audio = audio[:, start:end]

        # pdb.set_trace()
        # assert((audio.shape[0],audio.shape[1]) == (1, num_elem_wanted))
        if self.transform is not None:
            audio = self.transform(audio[0])

        SMALL_CONSTANT = 1e-5
        audio = audio + SMALL_CONSTANT  # to avoid numerical errors
        audio = audio.log2()

        speaker = self.audioFileLabels[idx]
        audio = audio.unsqueeze(0)
        return audio, speaker
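The pad-or-crop logic above in isolation, as a standalone sketch (pad_or_crop is a hypothetical helper, not part of the dataset class):

import torch

def pad_or_crop(audio: torch.Tensor, target: int) -> torch.Tensor:
    # audio: (1, num_samples). Pad symmetrically on the last dim (one extra
    # sample on the right when the difference is odd), otherwise crop the middle.
    n = audio.numel()
    if n <= target:
        left = (target - n) // 2
        right = target - n - left
        return torch.constant_pad_nd(audio, pad=(left, right), value=0)
    start = n // 2 - target // 2
    return audio[:, start:start + target]

assert pad_or_crop(torch.randn(1, 37), 40000).shape == (1, 40000)
assert pad_or_crop(torch.randn(1, 50000), 40000).shape == (1, 40000)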
Example #4
    def forward(self, x):
        h, w = x.shape[-2:]
        
        # extra_h = (math.ceil(w / self.stride[1]) - 1) * self.stride[1] - w + self.kernel_size[1]
        # extra_v = (math.ceil(h / self.stride[0]) - 1) * self.stride[0] - h + self.kernel_size[0]

        old_extra_h = (math.ceil(w / self.stride[1]) - 1) * self.stride[1] - w + self.kernel_size[1]
        old_extra_v = (math.ceil(h / self.stride[0]) - 1) * self.stride[0] - h + self.kernel_size[0]
        # The hardcoded values below reproduce the formula above for the
        # kernel/stride combinations this model uses; the comparison against
        # old_extra_* guards any mismatch.
        if self.kernel_size[0] == 3:
            if self.stride[0] == 2:
                extra_h, extra_v = 1, 1
            else:
                extra_h, extra_v = 2, 2
        elif self.kernel_size[0] == 1:
            extra_h, extra_v = 0, 0
        elif self.kernel_size[0] == 5:
            if self.stride[0] == 2:
                extra_h, extra_v = 3, 3
            else:
                extra_h, extra_v = 4, 4
        if extra_h != old_extra_h or extra_v != old_extra_v:
            print(w, h, self.stride, self.kernel_size, extra_h, extra_v, old_extra_h, old_extra_v)
            exit()

        left = extra_h // 2
        right = extra_h - left
        top = extra_v // 2
        bottom = extra_v - top

        # x = F.pad(x, [left, right, top, bottom])
        x = torch.constant_pad_nd(x,(left, right, top, bottom))

        x = self.pool(x)
        return x
Example #5
def transform_points_homogeneous(points: InputTensor, matrix: InputTensor,
                                 w: float) -> t.Tensor:
    """Transforms a batch of 3D points with a batch of matrices.

  Args:
    points: The points to transform, float32[d1, ..., dn, num_points, 3]
    matrix: The transformation matrix, float32[d1, ..., dn, 4, 4]
    w: The W value to use. Should be 1 for affine points, 0 for vectors

  Returns:
    The transformed points in homogeneous space,
    float32[d1, ..., dn, num_points, 4]
  """
    points = util.to_tensor(points, dtype=t.float32)
    matrix = util.to_tensor(matrix, dtype=t.float32)
    assert points.shape[-1] == 3
    assert matrix.shape[-2:] == (4, 4)
    assert points.shape[:-2] == matrix.shape[:-2]

    batch_dims = points.shape[:-2]
    # Fold all batch dimensions into a single one
    points = points.reshape([-1] + list(points.shape[-2:]))
    matrix = matrix.reshape([-1] + list(matrix.shape[-2:]))

    points = t.constant_pad_nd(points, [0, 1], value=w)
    result = t.einsum("bnm,bvm->bvn", matrix, points)
    result = result.reshape(batch_dims + result.shape[-2:])

    return result
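A standalone check of the homogeneous-coordinate trick used above (plain torch, without util.to_tensor):

import torch

points = torch.randn(2, 5, 3)                         # [batch, num_points, 3]
hom = torch.constant_pad_nd(points, [0, 1], value=1.0)
assert hom.shape == (2, 5, 4)
assert torch.equal(hom[..., 3], torch.ones(2, 5))      # appended W column
assert torch.equal(hom[..., :3], points)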
Example #6
    def __getitem__(self, idx):
        fn = self.audio_files[idx]

        #         pdb.set_trace()

        audio, samplerate = torchaudio.load(
            fn, normalization=False)  # don't normalize audio
        if (samplerate != 16e3):
            raise Exception(
                "Input file sample rate is {}, expected 16000".format(
                    samplerate))

        num_elem_wanted = 79900  # keep ~79900 samples so the spectrogram has length ~500, which covers most examples

        if audio.numel() <= num_elem_wanted:
            # pad the input on both sides
            pad_size = (num_elem_wanted - audio.numel()) // 2
            # integer division can drop one element, so pad one extra sample on the right when needed
            if 2 * pad_size + audio.numel() == num_elem_wanted:
                audio = torch.constant_pad_nd(audio,
                                              pad=(pad_size, pad_size),
                                              value=0)
            else:
                audio = torch.constant_pad_nd(audio,
                                              pad=(pad_size, pad_size + 1),
                                              value=0)
        else:
            # slice input in the middle
            start = audio.shape[1] // 2 - num_elem_wanted // 2
            end = start + num_elem_wanted
            audio = audio[:, start:end]

        assert ((audio.shape[0], audio.shape[1]) == (1, num_elem_wanted))
        if self.transform is not None:
            audio = self.transform(audio[0])

        SMALL_CONSTANT = 1e-5
        audio = audio + SMALL_CONSTANT  # to avoid numerical errors
        audio = audio.log2()

        speaker = self.audio_labels[idx]
        audio = audio.unsqueeze(0)
        return audio, speaker
Example #7
    def state_to_tensor(state):
        rover, rocks, qualities = state

        # 0 for rover position, 1 for rock position, 2 for quality
        tensor = t.zeros((3, RockSample.MAP_SIZE, RockSample.MAP_SIZE + 1))

        tensor[0, rover.y, rover.x] = 1
        for rock, quality in zip(rocks, qualities):
            tensor[1, rock.y, rock.x] = 1
            tensor[2, rock.y, rock.x] = (1 if quality == 1 else -1)

        return t.constant_pad_nd(tensor, (1, 1, 1, 1), value=-1)
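A minimal illustration of the -1 border added by the constant_pad_nd call above (a 2x3 toy map instead of RockSample.MAP_SIZE):

import torch as t

grid = t.zeros(1, 2, 3)
framed = t.constant_pad_nd(grid, (1, 1, 1, 1), value=-1)
assert framed.shape == (1, 4, 5)                      # only the last two dims grow
assert (framed[0, 0] == -1).all() and (framed[0, :, 0] == -1).all()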
Example #8
 def step(self, batch, batch_idx):
     src, tgt = batch
     x, num_usages, _ = src  # BxUxL, B
     tgt, tgt_length = tgt  # TxB, B
     memory = self.encoder(x, num_usages)
     out = self.decoder(memory, tgt, num_usages, tgt_length)
     tgt_to_loss = torch.constant_pad_nd(tgt, (0, 0, 0, 1),
                                         self.dm.target_pad_idx)[1:,
                                                                 ...]  # TxB
     # [[<s>], [<token>], [</s>], [<pad>]]] -> [[<token>], [</s>], [<pad>], [<pad>]]
     out_to_loss = out.transpose(1, 2)  # TxBxV -> TxVxB
     return self.loss(out_to_loss, tgt_to_loss), out, tgt  # 1, TxBxV, TxB
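The target-shift trick above in isolation, as a standalone sketch (pad index and token ids are made up):

import torch

pad_idx = 0
tgt = torch.tensor([[2], [5], [3]])                               # TxB with T=3, B=1
shifted = torch.constant_pad_nd(tgt, (0, 0, 0, 1), pad_idx)[1:, ...]
assert torch.equal(shifted, torch.tensor([[5], [3], [0]]))        # dropped first token, appended pad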
Example #9
 def make_bce_and_rank_targets(input_graph: Batch, target_graph: Batch,
                               filename: str, *, num_classes):
     """Binary and rank encoding of unique predicates"""
     unique_predicates = torch.unique(target_graph.predicate_classes,
                                      sorted=False)
     target_graph.predicate_bce = (torch.zeros(num_classes,
                                               dtype=torch.float).scatter_(
                                                   dim=0,
                                                   index=unique_predicates,
                                                   value=1.0).view(1, -1))
     target_graph.predicate_rank = torch.constant_pad_nd(
         unique_predicates,
         pad=(0, num_classes - len(unique_predicates)),
         value=-1).view(1, -1)
     return input_graph, target_graph
Example #10
 def render(self, camera_matrix: t.Tensor,
            output_shape: Tuple[int, int]) -> t.Tensor:
     # Resize to output shape, preserving aspect ratio
     image = vF.to_pil_image(self.image.cpu())
     _, sh, sw = self.image.shape
     scale = min(output_shape[0] / sh, output_shape[1] / sw)
     th, tw = round(sh * scale), round(sw * scale)
     th, tw = min(th, output_shape[0]), min(tw, output_shape[1])
     result = vF.to_tensor(vF.resize(image, (th, tw)))  # type: t.Tensor
     result = (result * 255).clamp(0, 255).to(t.uint8).permute([1, 2, 0])
     pad_top, pad_left = (output_shape[0] - th) // 2, (output_shape[1] -
                                                       tw) // 2
     # constant_pad_nd consumes pad pairs starting from the last dimension, so
     # for the (H, W, C) layout the channel pair comes first.
     result = t.constant_pad_nd(result, [
         0, 0, pad_left, output_shape[1] - pad_left - tw, pad_top,
         output_shape[0] - pad_top - th
     ])
     result = result.contiguous()
     return result
Example #11
def translate(v: InputTensor) -> t.Tensor:
    """Computes a translation matrix.

  Args:
    v: The translation vector, float32[B1, ..., BK, N].

  Returns:
    The translation matrix, float32[B1, ..., BK, N + 1, N + 1]

  """
    result = util.to_tensor(v, dtype=t.float32)
    assert len(result.shape) >= 1
    dimensions = result.shape[-1]
    result = result[..., None, :].transpose(-1, -2)
    result = t.constant_pad_nd(result, [dimensions, 0, 0, 1])
    id_matrix = t.diag(result.new_ones([dimensions + 1]))
    id_matrix = id_matrix.expand_as(result)
    result = result + id_matrix
    return result
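A quick standalone check of the construction above for a concrete 3D translation (no batch dimensions, plain torch):

import torch as t

v = t.tensor([1.0, 2.0, 3.0])
col = v[..., None, :].transpose(-1, -2)          # shape [3, 1]
padded = t.constant_pad_nd(col, [3, 0, 0, 1])    # shape [4, 4], v in the last column
expected = t.tensor([[1.0, 0.0, 0.0, 1.0],
                     [0.0, 1.0, 0.0, 2.0],
                     [0.0, 0.0, 1.0, 3.0],
                     [0.0, 0.0, 0.0, 1.0]])
assert t.equal(padded + t.eye(4), expected)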
Example #12
    def __init__(self,
                 grid: t.Tensor,
                 voxel_to_world: t.Tensor,
                 palette: t.Tensor = None,
                 filter_kernel: int = 1):
        """Initializes the artifact.

    Accepts both tensors with and without a batch dimension.

    Args:
      grid: float32[num_objects, depth, height, width].
      voxel_to_world: Matrix that converts from voxel to view space,
        float32[batch_size, 4, 4]
      palette: The colors to use for the different meshes. float32[batch_size,
        max_num_meshes, 3]
      filter_kernel: The size of the smoothing filter kernel to apply
    """
        grid = util.to_tensor(grid, dtype=t.float32)
        assert len(grid.shape) == 4

        voxel_to_world = util.to_tensor(voxel_to_world, t.float32, grid.device)
        assert voxel_to_world.shape == (4, 4)

        if filter_kernel > 1:
            k = filter_kernel
            # Pad all three spatial dims so the k x k x k box filter below
            # preserves the grid shape.
            grid = t.constant_pad_nd(grid,
                                     [(k - 1) // 2, k - 1 - (k - 1) // 2] * 3)
            kernel = grid.new_ones([1, 1, k, k, k], dtype=t.float32) / k**3
            grid = F.conv3d(grid[np.newaxis], kernel).squeeze(0)

        (vertices, normals,
         mesh_num_tri) = MarchingCubesArtifact.to_marching_cubes(grid[1:])

        vertices = transformations.transform_mesh(vertices, voxel_to_world,
                                                  True)
        normals = transformations.transform_mesh(normals, voxel_to_world,
                                                 False)
        if palette is not None:
            palette = palette[1:]
        self.mesh_artifact = MultiMeshArtifact(vertices=vertices,
                                               normals=normals,
                                               mesh_num_tri=mesh_num_tri,
                                               mesh_colors=palette)
Example #13
    def __init__(self, dataset: t.utils.data.Dataset, global_rank: int,
                 global_world_size: int, pad_data: bool):
        super().__init__(dataset)
        if pad_data:
            total_size = (len(dataset) + global_world_size -
                          1) // global_world_size
            total_size *= global_world_size
        else:
            total_size = len(dataset)

        g = t.Generator()
        # Shuffle data among workers in a stable way.
        g.manual_seed(0x1234)
        indices = t.randperm(len(dataset), generator=g)
        # constant_pad_nd pads with zeros by default, so the extra slots repeat
        # dataset index 0, giving every rank the same number of samples.
        indices = t.constant_pad_nd(indices,
                                    [0, total_size - indices.shape[0]])

        start = global_rank * total_size // global_world_size
        end = (global_rank + 1) * total_size // global_world_size
        self.indices = indices[start:end]
Example #14
    def to_marching_cubes(
            cls, voxel_grid: t.Tensor) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
        """Converts a voxel grid to a marching cubes mesh.

    Args:
      voxel_grid: The voxel grid, float32[num_objects, depth, height, width]

    Returns:
      vertices: The scene vertex positions, float32[num_triangles, 3, 3]
      normals: The scene vertex normals, float32[num_triangles, 3, 3]
      mesh_num_tri: The number of triangles in each mesh, int32[num_meshes]
    """

        voxel_grid = util.to_tensor(voxel_grid, dtype=t.float32)
        assert len(voxel_grid.shape) == 4

        triangles = []
        normals = []
        mesh_num_tri = []
        for grid in voxel_grid:
            # One-voxel zero border on all six sides so marching cubes produces
            # closed surfaces at the grid boundary.
            grid = t.constant_pad_nd(grid, [1] * 6)
            if (grid > 0.5).sum() == 0:
                triangles.append(t.ones([1, 3, 3]))
                normals.append(t.ones([1, 3, 3]))
                mesh_num_tri.append(1)
                continue
            mc_result = skimage.measure.marching_cubes(grid.cpu().numpy(),
                                                       level=0.5)
            mc_result = [t.as_tensor(v.copy()) for v in mc_result[:3]]
            vbuf, ibuf, nbuf = mc_result
            ibuf = ibuf.to(t.int64)
            assert ibuf.shape[0] > 0
            normals.append(nbuf[ibuf])
            triangles.append(vbuf[ibuf])
            mesh_num_tri.append(ibuf.shape[0])
        device = voxel_grid.device
        triangles = t.cat(triangles, dim=0).flip(-1).to(device)
        normals = t.cat(normals, dim=0).flip(-1).to(device)
        mesh_num_tri = util.to_tensor(mesh_num_tri, t.int32).to(device)
        return triangles, normals, mesh_num_tri
Example #15
def _resize_fft_input(x: TensorLikeType, dims: Tuple[int, ...],
                      sizes: Tuple[int, ...]) -> TensorLikeType:
    """
    Fixes the shape of x such that x.size(dims[i]) == sizes[i],
    either by zero-padding, or by slicing x starting from 0.
    """
    assert len(dims) == len(sizes)
    must_copy = False
    x_sizes = x.shape
    pad_amount = [0] * len(x_sizes) * 2
    for i in range(len(dims)):
        if sizes[i] == -1:
            continue

        if x_sizes[dims[i]] < sizes[i]:
            must_copy = True
            pad_idx = len(pad_amount) - 2 * dims[i] - 1
            pad_amount[pad_idx] = sizes[i] - x_sizes[dims[i]]

        if x_sizes[dims[i]] > sizes[i]:
            x = x.narrow(dims[i], 0, sizes[i])

    return torch.constant_pad_nd(x, pad_amount) if must_copy else x
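Assuming the function above is in scope, a standalone sketch of the pad bookkeeping: because constant_pad_nd lists pads last-dimension-first, index len(pad_amount) - 2 * dim - 1 is the trailing pad slot for dim.

import torch

x = torch.randn(2, 3, 4)
resized = _resize_fft_input(x, dims=(1, 2), sizes=(5, 2))
assert resized.shape == (2, 5, 2)
assert torch.equal(resized[:, 3:, :], torch.zeros(2, 2, 2))  # dim 1 zero-padded at the end
assert torch.equal(resized[:, :3, :], x[:, :3, :2])          # dim 2 sliced from index 0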
Example #16
    def forward(self,
                grid2d: t.Tensor,
                voxel_projection_matrix: t.Tensor,
                voxel_sample_location: t.Tensor,
                outside_value: float = 0,
                flip_x=False,
                flip_y=False):
        """The forward pass.

    Args:
      grid2d: The 2D grid, float32[batch_size, num_channels, height, width].
      voxel_projection_matrix: Matrix that projects voxel centers onto the screen,
        float32[batch_size, 4, 4].
      voxel_sample_location: 3D sample location within the voxels,
        float32[batch_size, 3].
      outside_value: Value used to fill the channels for voxels whose
        projected position is outside the 2D grid, float32[]
      flip_x: Whether to flip the 2D grid along the X dimension. This can be
        used to correct for a right/left handed 3D coordinate system issues.
      flip_y: Whether to flip the 2D grid along the Y dimension. This can be
        used to correct for a right/left handed 3D coordinate system issues.

    Returns:
      The resulting 3D grid, float32[batch_size, num_channels, depth, height,
      width]. The content of cell [b, c, z, y, x] in the result will be equal to
      grid2d[b, c, py, px], where
      (px, py, _) = affine_transform(
          voxel_projection_matrix, (x, y, z, 1)) * (height, width, 1).
      If (b, py, px) lies outside the 2D image, the content of the cell in all
      channels will be equal to outside_value.

    """
        grid2d = util.to_tensor(grid2d, t.float32)
        assert len(grid2d.shape) == 4
        voxel_sample_location = util.to_tensor(voxel_sample_location,
                                               t.float32)
        assert voxel_sample_location.shape == (grid2d.shape[0], 3)

        compressed_grid2d = self.compress_channels(grid2d)
        batch_size, channels, height, width = compressed_grid2d.shape

        voxel_projection_matrix = util.to_tensor(voxel_projection_matrix,
                                                 t.float32)
        assert voxel_projection_matrix.shape == (batch_size, 4, 4)

        voxel_centers = self.voxel_centers
        grid_depth, grid_height, grid_width, _ = voxel_centers.shape
        # shape: [batch, depth, height, width, 3]
        voxel_centers = (voxel_centers[None].expand(batch_size, grid_depth,
                                                    grid_height, grid_width,
                                                    3).contiguous())
        voxel_centers = (voxel_centers +
                         voxel_sample_location[:, None, None, None, :])
        # shape: [batch, depth * height * width, 3]
        voxel_centers = voxel_centers.reshape([batch_size, -1, 3])

        # Project the voxel centers onto the screen
        projected_centers = transformations.transform_points_homogeneous(
            voxel_centers, voxel_projection_matrix, w=1)
        projected_centers = projected_centers.reshape(
            [batch_size, grid_depth, grid_height, grid_width, 4])

        camera_depth = projected_centers[..., 2]
        projected_centers = projected_centers[..., :3] / projected_centers[...,
                                                                           3:4]

        # XY range in OpenGL camera space is [-1:1, -1:1]. Transform to [0:1, 0:1].
        projected_centers = projected_centers[..., :2] / 2 + 0.5

        if flip_y:
            projected_centers = projected_centers * (1, -1) + (0, 1)
        if flip_x:
            projected_centers = projected_centers * (-1, 1) + (1, 0)

        # projected_centers contains (x, y) coordinates in [0, 1]^2 at this point.
        # Convert to indices into 2D grid.
        wh = projected_centers.new_tensor([[[[[width, height]]]]],
                                          dtype=t.float32)
        pixel_indices = (projected_centers * wh).to(t.int64)
        xx, yy = pixel_indices.unbind(-1)  # type: t.Tensor
        bb = t.arange(batch_size, dtype=t.int64, device=grid2d.device)
        bb = bb[:, None, None, None]
        bb = bb.expand(batch_size, grid_depth, grid_height, grid_width)

        # Pad the grid to detect voxels which project outside the image plane
        padded_grid2d = t.constant_pad_nd(compressed_grid2d, [1, 1, 1, 1],
                                          value=outside_value)
        xx = (xx + 1).clamp(0, padded_grid2d.shape[-1] - 1)
        yy = (yy + 1).clamp(0, padded_grid2d.shape[-2] - 1)

        # Sample the 2D grid
        result = padded_grid2d[bb, :, yy, xx].permute([0, 4, 1, 2, 3])
        assert result.shape == (batch_size, channels, grid_depth, grid_height,
                                grid_width)

        # Discard voxels behind the camera
        camera_depth = camera_depth[:, None, :, :, :].expand(result.shape)
        result = t.where(camera_depth >= 0, result,
                         t.ones_like(result) * outside_value)

        return result
Example #17
def render_scene(vertex_positions: InputTensor,
                 view_projection_matrix: InputTensor = None,
                 image_size: Tuple[int, int] = (256, 256),
                 normals: InputTensor = None,
                 tex_coords: InputTensor = None,
                 material_ids: InputTensor = None,
                 diffuse_coefficients: InputTensor = None,
                 diffuse_textures: InputTensor = None,
                 diffuse_texture_indices: InputTensor = None,
                 specular_coefficient: InputTensor = None,
                 ambient_coefficients: InputTensor = None,
                 cull_back_facing=True,
                 light_position: InputTensor = None,
                 light_color: InputTensor = (1.0, 1.0, 1.0),
                 ambient_light_color: InputTensor = (0.2, 0.2, 0.2),
                 clear_color: InputTensor = (0, 0, 0, 1),
                 output_type=t.uint8,
                 vertex_shader=None,
                 geometry_shader=None,
                 fragment_shader=None,
                 debug_io_buffer=None,
                 return_rgb=True,
                 cuda_device=None):
    """Renders the given scene.

  Args:
    vertex_positions: The triangle geometry, specified through the triangle
      vertex positions, float32[num_triangles, 3, 3]
    view_projection_matrix: The view projection matrix, float32[4, 4]
    image_size: Desired output image size, (height, width),
    normals: Per-vertex shading normals, float32[num_triangles, 3, 3]. If set to
      None, normals will be computed from the vertex positions.
    tex_coords: Texture coordinate, float32[num_triangles, 3, 2]. If set to
      None, all texture coordinates will be 0.
    material_ids: Per-triangle material indices used to index in the various
      coefficient tensors below, int32[num_triangles]. If set to None, all
      triangles will have the same default material.
    diffuse_coefficients: The diffuse coefficients, one per material,
      float32[num_materials, 3]. Cannot be None if material_ids is not None.
      Must be None if material_ids is None.
    diffuse_textures: uint8[num_textures, height, width, 3]. Can be None if
      there are no textures used in the mesh.
    diffuse_texture_indices: Diffuse texture indices, one per material,
      int32[num_materials]. If set to None, the texture indices for all
      materials will be -1.
    specular_coefficient: Specular coefficients, one per material,
      float32[num_materials, 4]. The first 3 channels are the R, G, and B
      specular coefficients, the last channel is the specular power. If set to
      None, R, G, and B will be 0 for all materials and power will be 2048.
    ambient_coefficients: float32[num_materials, 3]. The ambient coefficients.
      If None, all ambient coefficient will be 0.05.
    cull_back_facing: whether to cull backfacing triangles.
    light_position: float32[3], the light position. If set to None, the light
      will be placed at the camera origin.
    light_color: The light diffuse RGB color, float32[3]
    ambient_light_color: The light ambient RGB color, float32[3]
    clear_color: The RGB color to use when clearing the image, float32[3]
    output_type: The desired output type. Either t.uint8 or t.float32.
    vertex_shader: The vertex shader to use. If empty, uses a default shader.
    geometry_shader: The geometry shader. If empty, uses a default shader.
    fragment_shader: The fragment shader. If empty, uses a default shader.
    debug_io_buffer: Aids debugging of shaders. Shaders can communicate with
      host programs through OpenGL input/output buffers. Any tensor passed in
      this argument will be forwarded to the shaders as buffer with name
      "debug_io".
    return_rgb: If true, returns a 3 channel image, otherwise returns a 4
      channel image.
    cuda_device: The index of the GPU to use, given as CUDA device

  Returns:
    The rendered image, dt[height, width, c] where dt is either float32 or uint8
    depending on the value of output_type and c is either 3 or 4, depending on
    return_rgb. If the debug_io_buffer argument was not None, returns a
    tuple containing the rendered image, and the shader output from the
    "debug_io" buffer. The second element of the tuple has the same shape
    and type as debug_io_buffer.

  """
    height, width = image_size
    vertex_positions = util.to_tensor(vertex_positions, t.float32, "cpu")
    assert (len(vertex_positions.shape) == 3
            and vertex_positions.shape[1:] == (3, 3))
    num_triangles = vertex_positions.shape[0]

    if view_projection_matrix is None:
        view_projection_matrix = camera_util.get_default_camera_for_mesh(
            vertex_positions)
    view_projection_matrix = util.to_tensor(view_projection_matrix, t.float32,
                                            "cpu")
    assert view_projection_matrix.shape == (4, 4)

    has_normals = True
    if normals is None:
        normals = t.zeros_like(vertex_positions)
        has_normals = False
    normals = util.to_tensor(normals, t.float32, "cpu")
    assert normals.shape == (num_triangles, 3, 3)

    if tex_coords is None:
        tex_coords = t.zeros([num_triangles, 3, 2], dtype=t.float32)
    tex_coords = util.to_tensor(tex_coords, t.float32, "cpu")
    assert tex_coords.shape == (num_triangles, 3, 2)

    if material_ids is None:
        material_ids = t.zeros([num_triangles], dtype=t.int32)
    material_ids = util.to_tensor(material_ids, t.int32, "cpu")
    assert material_ids.shape == (num_triangles, )
    num_used_materials = material_ids.max().cpu().numpy() + 1  # type: int

    def create_coefficient_array(cur_tensor: InputTensor, num_channels,
                                 default_value):
        arr = cur_tensor
        if arr is None:
            arr = (
                t.ones([num_used_materials, num_channels], dtype=t.float32) *
                t.tensor(default_value))
        arr = util.to_tensor(arr, t.float32, "cpu")
        assert len(arr.shape) == 2
        arr = arr[:num_used_materials]
        assert arr.shape == (num_used_materials, num_channels)
        return arr

    diffuse_coefficients = create_coefficient_array(diffuse_coefficients, 3,
                                                    0.8)
    ambient_coefficients = create_coefficient_array(ambient_coefficients, 3,
                                                    0.05)
    specular_coefficient = create_coefficient_array(specular_coefficient, 4,
                                                    (0, 0, 0, 2048.0))
    if diffuse_texture_indices is None:
        diffuse_texture_indices = t.ones([num_used_materials],
                                         dtype=t.int32) * -1
    diffuse_texture_indices = util.to_tensor(diffuse_texture_indices, t.int32,
                                             "cpu")
    assert len(diffuse_texture_indices.shape) == 1
    diffuse_texture_indices = diffuse_texture_indices[:num_used_materials]
    assert diffuse_texture_indices.shape == (num_used_materials, )
    num_used_textures = diffuse_texture_indices.max().cpu().numpy() + 1
    num_used_textures = max(num_used_textures, 1)

    if diffuse_textures is None:
        diffuse_textures = t.ones([num_used_textures, 1, 1, 3], dtype=t.uint8)
    diffuse_textures = util.to_tensor(diffuse_textures, t.uint8, "cpu")
    assert len(diffuse_textures.shape) == 4
    diffuse_textures = diffuse_textures[:num_used_textures]
    assert (diffuse_textures.shape[0] == num_used_textures
            and diffuse_textures.shape[3] == 3)

    camera_position = t.mv(t.inverse(view_projection_matrix),
                           t.tensor([0, 0, -1, 1], dtype=t.float32))
    camera_position = camera_position[:3] / camera_position[3]
    if light_position is None:
        light_position = camera_position
    light_position = util.to_tensor(light_position, t.float32, "cpu")
    assert light_position.shape == (3, )

    light_color = util.to_tensor(light_color, t.float32, "cpu")
    assert light_color.shape == (3, )

    ambient_light_color = util.to_tensor(ambient_light_color, t.float32, "cpu")
    assert ambient_light_color.shape == (3, )

    ambient_coefficients = t.constant_pad_nd(ambient_coefficients, [0, 1])
    diffuse_coefficients = t.cat([
        diffuse_coefficients,
        diffuse_texture_indices.to(t.float32)[:, np.newaxis]
    ], -1)
    materials = t.cat(
        [ambient_coefficients, diffuse_coefficients, specular_coefficient],
        dim=-1)

    render_args = [
        rasterizer.Uniform("view_projection_matrix", view_projection_matrix),
        rasterizer.Uniform("light_position", light_position),
        rasterizer.Uniform("has_normals", has_normals),
        rasterizer.Uniform("has_texcoords", True),
        rasterizer.Buffer(0, vertex_positions.reshape([-1])),
        rasterizer.Buffer(1, normals.reshape([-1])),
        rasterizer.Buffer(2, tex_coords.reshape([-1])),
        rasterizer.Buffer(3, material_ids.reshape([-1])),
        rasterizer.Buffer(4, materials.reshape([-1])),
        rasterizer.Texture("textures", diffuse_textures, bind_as_array=True),
        rasterizer.Uniform("light_color", light_color),
        rasterizer.Uniform("camera_position", camera_position),
        rasterizer.Uniform("ambient_light_color", ambient_light_color),
        rasterizer.Uniform("cull_backfacing", cull_back_facing),
    ]

    if debug_io_buffer is not None:
        render_args.append(rasterizer.Buffer(5, debug_io_buffer, is_io=True))

    if not geometry_shader:
        geometry_shader = resources.read_text(shaders,
                                              "triangle_renderer.geom")
    if not vertex_shader:
        vertex_shader = resources.read_text(shaders, "noop.vert")
    if not fragment_shader:
        fragment_shader = resources.read_text(shaders,
                                              "point_light_illumination.frag")

    result = rasterizer.gl_simple_render(rasterizer.RenderInput(
        num_points=num_triangles,
        arguments=render_args,
        output_resolution=(height, width),
        clear_color=clear_color,
        output_type=output_type,
        vertex_shader=vertex_shader,
        geometry_shader=geometry_shader,
        fragment_shader=fragment_shader),
                                         cuda_device=cuda_device)

    c = 3 if return_rgb else 4
    if debug_io_buffer is None:
        return result[..., :c]
    else:
        return result[..., :c], render_args[-1].value
Example #18
    def beam_search(self,
                    encoder_output,
                    max_output_length,
                    init_idx,
                    eos_idx,
                    pad_idx=None,
                    topk=10,
                    beam_width=40,
                    batch_size=10):
        """
        :param encoder_output: Mx1xE,
        :param max_output_length: length of generated sequence,
        :param init_idx: index of INIT token,
        :param eos_idx: index of EOS token,
        :param pad_idx: index of PAD token.
        :param topk:
        :param beam_width:
        :param batch_size:
        """
        with self.no_tgt_memory_limit():
            # Start with the start of the sentence token
            init_token = torch.full((1, 1), init_idx, dtype=torch.long)

            # Number of sentence to generate
            endnodes = []

            # starting node -  previous node, word id, logp, length
            node = BeamSearchNode(init_token, 0)
            nodes = PriorityQueue()

            # start the queue
            nodes.put((-node.eval(), node))
            qsize = 1

            # start beam search
            # give up when decoding takes too long
            while qsize < 10000:
                if batch_size is None:
                    # fetch the best node
                    score, n = nodes.get()
                    qsize -= 1
                    decoder_input = n.token_ids

                    if decoder_input.shape[
                            0] >= max_output_length or decoder_input[
                                -1, 0] == eos_idx:
                        endnodes.append((score, n))
                        # if we reached maximum # of sentences required
                        if len(endnodes) >= topk:
                            break
                        else:
                            continue

                    # decode for one step using decoder
                    decoder_output = self(
                        encoder_output,
                        decoder_input.to(encoder_output.device))[-1:, :, :]
                    decoder_output = log_softmax(decoder_output, dim=-1).cpu()

                    # PUT HERE REAL BEAM SEARCH OF TOP
                    log_prob, indexes = torch.topk(decoder_output, beam_width)

                    for i in range(beam_width):
                        new_idx = indexes[:, :, i]
                        log_p = log_prob[0, 0, i].item()

                        node = BeamSearchNode(
                            torch.cat((decoder_input, new_idx)),
                            n.log_p + log_p)
                        score = -node.eval()
                        nodes.put((score, node))
                        qsize += 1
                else:
                    assert pad_idx is not None, "Specify pad_idx, please."
                    # fetch batch of the best nodes
                    decoder_inputs, log_ps = [], []
                    while qsize > 0 and len(decoder_inputs) < batch_size:
                        score, n = nodes.get()
                        qsize -= 1
                        token_ids = n.token_ids
                        if token_ids.shape[0] >= max_output_length or token_ids[
                                -1, 0] == eos_idx:
                            endnodes.append((score, n))
                            if len(endnodes) >= topk:
                                break
                            else:
                                continue
                        decoder_inputs.append(token_ids)
                        log_ps.append(n.log_p)

                    if len(endnodes) >= topk:
                        break

                    # pad decoder_inputs
                    tgt_lengths = torch.tensor(
                        [inp.shape[0] for inp in decoder_inputs],
                        dtype=torch.long,
                        device=encoder_output.device)
                    max_length = tgt_lengths.max()
                    decoder_input = torch.cat([
                        torch.constant_pad_nd(
                            inp, (0, 0, 0, max_length - inp.shape[0]), pad_idx)
                        for inp in decoder_inputs
                    ],
                                              dim=1).to(encoder_output.device)

                    # decode batch
                    decoder_outputs = self(encoder_output.expand(
                        -1, decoder_input.shape[1], -1),
                                           decoder_input,
                                           tgt_length=tgt_lengths).cpu()
                    for i, (decoder_input, log_p, tgt_length) in enumerate(
                            zip(decoder_inputs, log_ps, tgt_lengths)):
                        decoder_output = log_softmax(
                            decoder_outputs[tgt_length - 1, i], dim=-1)
                        new_log_ps, new_idxs = torch.topk(
                            decoder_output, beam_width)
                        for new_log_p, new_idx in zip(new_log_ps, new_idxs):
                            node = BeamSearchNode(
                                torch.cat((decoder_input, new_idx.view(1, 1))),
                                log_p + new_log_p.item())
                            score = -node.eval()
                            nodes.put((score, node))
                            qsize += 1
        return sorted(endnodes)
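The pad-and-concatenate step above in isolation, as a standalone sketch (hypothetical token ids and pad index):

import torch

pad_idx = 3
seqs = [torch.tensor([[1], [2]]), torch.tensor([[1], [4], [5]])]   # each TxB with B=1
max_length = max(s.shape[0] for s in seqs)
batch = torch.cat([
    torch.constant_pad_nd(s, (0, 0, 0, max_length - s.shape[0]), pad_idx)
    for s in seqs
], dim=1)
assert torch.equal(batch, torch.tensor([[1, 1], [2, 4], [3, 5]]))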
Example #19
    def inference_nopad(self, inputs):
        """
        Speech synthesis inference without padding.
        Batched inference with padding is prone to synthesis errors and gives unstable results.
        Batched inference without padding is not wired up yet, so this falls back to a non-batched synthesis path for now.
        Args:
            inputs:

        Returns:

        """
        text_, style_input_, speaker_ids_, f0s_ = inputs
        f0s_ = [f0s_ for w in range(len(text_))]
        mel_outputs_lst, mel_outputs_postnet_lst, gate_outputs_lst, alignments_lst = [], [], [], []
        for text, style_input, speaker_ids, f0s in zip(text_, style_input_,
                                                       speaker_ids_, f0s_):
            text = text[:torch.argmin(text) + 1]  # strip the padding zeros
            embedded_inputs = self.embedding(text.unsqueeze(0)).transpose(1, 2)
            embedded_text = self.encoder.inference(embedded_inputs)
            embedded_speakers = self.speaker_embedding(
                speaker_ids.unsqueeze(0))[:, None]
            if hasattr(self, 'gst'):
                if isinstance(style_input, int):
                    query = torch.zeros(1, 1,
                                        self.gst.encoder.ref_enc_gru_size).to(
                                            _device)  # cuda()
                    GST = torch.tanh(self.gst.stl.embed)
                    key = GST[style_input].unsqueeze(0).expand(1, -1, -1)
                    embedded_gst = self.gst.stl.attention(query, key)
                else:
                    embedded_gst = self.gst(style_input.unsqueeze(0))

            embedded_speakers = embedded_speakers.repeat(
                1, embedded_text.size(1), 1)
            if hasattr(self, 'gst'):
                embedded_gst = embedded_gst.repeat(1, embedded_text.size(1), 1)
                encoder_outputs = torch.cat(
                    (embedded_text, embedded_gst, embedded_speakers), dim=2)
            else:
                encoder_outputs = torch.cat((embedded_text, embedded_speakers),
                                            dim=2)

            mel_outputs, gate_outputs, alignments = self.decoder.inference(
                encoder_outputs, f0s)

            mel_outputs_postnet = self.postnet(mel_outputs)
            mel_outputs_postnet = mel_outputs + mel_outputs_postnet

            mel_outputs_lst.append(mel_outputs)
            mel_outputs_postnet_lst.append(mel_outputs_postnet)
            gate_outputs_lst.append(gate_outputs)
            alignments_lst.append(alignments)

        maxlen = max([w.shape[2] for w in mel_outputs_postnet_lst])

        # pad with a very negative value so the padded frames are silence
        mel_outputs_postnet_lst = [
            torch.constant_pad_nd(w, (0, maxlen - w.shape[2]), -16)
            for w in mel_outputs_postnet_lst
        ]
        mel_outputs_postnet = torch.cat(mel_outputs_postnet_lst, dim=0)

        # pad with a very negative value so the padded frames are silence
        mel_outputs_lst = [
            torch.constant_pad_nd(w, (0, maxlen - w.shape[2]), -16)
            for w in mel_outputs_lst
        ]
        mel_outputs = torch.cat(mel_outputs_lst, dim=0)

        # pad with 1 so the padded gate steps mark the stop/truncation point
        gate_outputs_lst = [
            torch.constant_pad_nd(w, (0, 0, 0, maxlen - w.shape[1]), 1)
            for w in gate_outputs_lst
        ]
        gate_outputs = torch.cat(gate_outputs_lst, dim=0)

        maxlen_text = max([w.shape[0] for w in text_])

        alignments_lst = [
            torch.constant_pad_nd(
                w, (0, maxlen_text - w.shape[2], 0, maxlen - w.shape[1]), 0)
            for w in alignments_lst
        ]
        alignments = torch.cat(alignments_lst, dim=0)
        return self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
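The per-utterance padding above in isolation, as a standalone sketch (mel and gate shapes are made up; -16 stands in for silence in log-mel space, 1 for the gate's stop value):

import torch

mels = [torch.randn(1, 80, 37), torch.randn(1, 80, 50)]
gates = [torch.randn(1, 37, 1), torch.randn(1, 50, 1)]
maxlen = max(m.shape[2] for m in mels)

# Pad the mel time axis (last dim) with -16 and the gate time axis
# (second-to-last dim) with 1, then batch along dim 0.
mels = [torch.constant_pad_nd(m, (0, maxlen - m.shape[2]), -16) for m in mels]
gates = [torch.constant_pad_nd(g, (0, 0, 0, maxlen - g.shape[1]), 1) for g in gates]
assert torch.cat(mels, dim=0).shape == (2, 80, 50)
assert torch.cat(gates, dim=0).shape == (2, 50, 1)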