예제 #1
0
    def _create_sinc_convs(self):
        """Assemble the Sinc filter stage and the convolutional body.

        The SincConv layer is kept in ``self.filters`` and the full stack of
        named stages is stored as a ``torch.nn.Sequential`` in ``self.blocks``.
        """
        stages = OrderedDict()

        # Stage 0: Sinc filters, log compression, normalization and pooling.
        sinc_channels = 128
        self.filters = SincConv(
            self.in_channels,
            sinc_channels,
            kernel_size=101,
            stride=1,
            fs=self.fs,
            window_func=self.windowing_type,
            scale_type=self.scale_type,
        )
        stages["SincConvBlock"] = torch.nn.Sequential(
            OrderedDict([
                ("Filters", self.filters),
                ("LogCompression", LogCompression()),
                ("BatchNorm",
                 torch.nn.BatchNorm1d(sinc_channels, affine=True)),
                ("AvgPool", torch.nn.AvgPool1d(2)),
            ]))

        # Stage 1: couples the sinc output to the front-end "body".
        body_channels = 128
        stages["DConvBlock1"] = self.gen_lsc_block(
            sinc_channels,
            body_channels,
            depthwise_kernel_size=25,
            depthwise_stride=2,
            pointwise_groups=0,
            avgpool=True,
            dropout_probability=0.1,
        )

        # Stages 2-4: identical blocks; only the first one changes the width.
        width = self.out_channels
        previous = body_channels
        for index in range(2, 5):
            stages[f"DConvBlock{index}"] = self.gen_lsc_block(
                previous,
                width,
                depthwise_kernel_size=9,
                depthwise_stride=1,
            )
            previous = width

        # Stage 5: final block, acts as coupling to the encoder.
        stages["DConvBlock5"] = self.gen_lsc_block(
            previous,
            self.out_channels,
            depthwise_kernel_size=7,
            depthwise_stride=1,
            pointwise_groups=0,
        )

        self.blocks = torch.nn.Sequential(stages)
예제 #2
0
def test_sinc_filters():
    """Check the SincConv output shape for single- and two-channel input."""
    expected_shape = torch.Size([50, 128, 300])
    # First the single-channel case, then the multichannel case.
    for num_channels in (1, 2):
        sinc = SincConv(
            in_channels=num_channels,
            out_channels=128,
            kernel_size=101,
            stride=1,
            fs=16000,
        )
        frames = torch.randn([50, num_channels, 400], requires_grad=True)
        assert sinc(frames).shape == expected_shape
예제 #3
0
class LightweightSincConvs(AbsPreEncoder):
    """Lightweight Sinc Convolutions.

    Provide a frontend for raw audio input.
    https://arxiv.org/abs/2010.07597

    The module stacks a SincConv filter stage and five convolutional blocks
    (see ``_create_sinc_convs``) and applies them to every frame of the input.
    """

    def __init__(
        self,
        fs: Union[int, str, float] = 16000,
        in_channels: int = 1,
        out_channels: int = 256,
        activation_type: str = "leakyrelu",
        dropout_type: str = "dropout",
        windowing_type: str = "hamming",
        scale_type: str = "mel",
    ):
        """Initialize the module.

        Args:
            fs: Sample rate. A string is parsed with
                ``humanfriendly.parse_size``.
            in_channels: Number of input channels.
            out_channels: Number of output channels (for each input channel).
            activation_type: Choice of activation function
                (``"leakyrelu"`` or ``"relu"``).
            dropout_type: Choice of dropout function
                (``"dropout"``, ``"spatial"`` or ``"dropout2d"``).
            windowing_type: Choice of windowing function.
            scale_type:  Choice of filter-bank initialization scale.

        Raises:
            NotImplementedError: If ``dropout_type`` or ``activation_type``
                is not one of the supported choices.
        """
        assert check_argument_types()
        super().__init__()
        if isinstance(fs, str):
            fs = humanfriendly.parse_size(fs)
        self.fs = fs
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.activation_type = activation_type
        self.dropout_type = dropout_type
        self.windowing_type = windowing_type
        self.scale_type = scale_type

        self.choices_dropout = {
            "dropout": torch.nn.Dropout,
            "spatial": SpatialDropout,
            "dropout2d": torch.nn.Dropout2d,
        }
        if dropout_type not in self.choices_dropout:
            raise NotImplementedError(
                f"Dropout type has to be one of "
                f"{list(self.choices_dropout.keys())}")

        self.choices_activation = {
            "leakyrelu": torch.nn.LeakyReLU,
            "relu": torch.nn.ReLU,
        }
        if activation_type not in self.choices_activation:
            raise NotImplementedError(
                f"Activation type has to be one of "
                f"{list(self.choices_activation.keys())}")

        # initialization
        self._create_sinc_convs()
        # Sinc filters require custom initialization
        self.espnet_initialization_fn()

    def _create_sinc_convs(self):
        """Build ``self.filters`` (SincConv) and ``self.blocks`` (full stack)."""
        blocks = OrderedDict()

        # SincConvBlock
        out_channels = 128
        self.filters = SincConv(
            self.in_channels,
            out_channels,
            kernel_size=101,
            stride=1,
            fs=self.fs,
            window_func=self.windowing_type,
            scale_type=self.scale_type,
        )
        block = OrderedDict([
            ("Filters", self.filters),
            ("LogCompression", LogCompression()),
            ("BatchNorm", torch.nn.BatchNorm1d(out_channels, affine=True)),
            ("AvgPool", torch.nn.AvgPool1d(2)),
        ])
        blocks["SincConvBlock"] = torch.nn.Sequential(block)
        in_channels = out_channels

        # First convolutional block, connects the sinc output to the front-end "body"
        out_channels = 128
        blocks["DConvBlock1"] = self.gen_lsc_block(
            in_channels,
            out_channels,
            depthwise_kernel_size=25,
            depthwise_stride=2,
            pointwise_groups=0,
            avgpool=True,
            dropout_probability=0.1,
        )
        in_channels = out_channels

        # Second convolutional block, multiple convolutional layers
        out_channels = self.out_channels
        for layer in [2, 3, 4]:
            blocks[f"DConvBlock{layer}"] = self.gen_lsc_block(
                in_channels,
                out_channels,
                depthwise_kernel_size=9,
                depthwise_stride=1)
            in_channels = out_channels

        # Third Convolutional block, acts as coupling to encoder
        out_channels = self.out_channels
        blocks["DConvBlock5"] = self.gen_lsc_block(
            in_channels,
            out_channels,
            depthwise_kernel_size=7,
            depthwise_stride=1,
            pointwise_groups=0,
        )

        self.blocks = torch.nn.Sequential(blocks)

    def gen_lsc_block(
        self,
        in_channels: int,
        out_channels: int,
        depthwise_kernel_size: int = 9,
        depthwise_stride: int = 1,
        depthwise_groups=None,
        pointwise_groups=0,
        dropout_probability: float = 0.15,
        avgpool=False,
    ):
        """Generate a block for lightweight Sinc convolutions.

        Layer order inside the block: depthwise convolution, optional
        pointwise convolution, activation, batch normalization, optional
        average pooling, dropout.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels.
            depthwise_kernel_size: Kernel size of the depthwise convolution.
            depthwise_stride: Stride of the depthwise convolution.
            depthwise_groups: Number of groups of the depthwise convolution.
                If falsy, GCD(in_channels, out_channels) is used.
            pointwise_groups: Number of groups of the pointwise convolution.
                If falsy, no pointwise convolution is added.
            dropout_probability: Dropout probability in the block.
            avgpool: If True, an AvgPool layer is inserted.

        Returns:
            torch.nn.Sequential: Neural network building block.
        """
        block = OrderedDict()
        if not depthwise_groups:
            # GCD(in_channels, out_channels) to prevent size mismatches:
            # Conv1d needs a group count dividing both channel numbers.
            depthwise_groups, r = in_channels, out_channels
            while r != 0:
                # Euclidean step; the operands must swap on every iteration.
                depthwise_groups, r = r, depthwise_groups % r
        block["depthwise"] = torch.nn.Conv1d(
            in_channels,
            out_channels,
            depthwise_kernel_size,
            depthwise_stride,
            groups=depthwise_groups,
        )
        if pointwise_groups:
            block["pointwise"] = torch.nn.Conv1d(out_channels,
                                                 out_channels,
                                                 1,
                                                 1,
                                                 groups=pointwise_groups)
        block["activation"] = self.choices_activation[self.activation_type]()
        block["batchnorm"] = torch.nn.BatchNorm1d(out_channels, affine=True)
        if avgpool:
            block["avgpool"] = torch.nn.AvgPool1d(2)
        block["dropout"] = self.choices_dropout[self.dropout_type](
            dropout_probability)
        return torch.nn.Sequential(block)

    def espnet_initialization_fn(self):
        """Initialize sinc filters with filterbank values.

        Additionally resets every affine BatchNorm1d layer in ``self.blocks``
        to weight 1 and bias 0.
        """
        self.filters.init_filters()
        for block in self.blocks:
            for layer in block:
                if isinstance(layer, torch.nn.BatchNorm1d) and layer.affine:
                    layer.weight.data[:] = 1.0
                    layer.bias.data[:] = 0.0

    def forward(
            self, input: torch.Tensor,
            input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply the convolution stack to every frame of the input.

        Args:
            input: Tensor of shape (B, T, C_in, D_in).
            input_lengths: Passed through unchanged.

        Returns:
            Tuple of the output tensor with shape (B, T, C_out*D_out) and the
            unmodified ``input_lengths``.
        """
        # Transform input data:
        #   (B, T, C_in, D_in) -> (B*T, C_in, D_in)
        B, T, C_in, D_in = input.size()
        input_frames = input.view(B * T, C_in, D_in)
        # Call the module (not .forward) so registered hooks are honored.
        output_frames = self.blocks(input_frames)

        # ---TRANSFORM: (B*T, C_out, D_out) -> (B, T, C_out*D_out)
        _, C_out, D_out = output_frames.size()
        output_frames = output_frames.view(B, T, C_out * D_out)
        return output_frames, input_lengths  # no state in this layer

    def output_size(self) -> int:
        """Get the output size."""
        # NOTE(review): forward() emits C_out * D_out features per frame;
        # confirm this product matches that value for multichannel input.
        return self.out_channels * self.in_channels
예제 #4
0
def _save_current_figure(args, img_name):
    """Save the active matplotlib figure into ``args.out_folder`` and log it."""
    img_path = str(args.out_folder / img_name)
    plt.savefig(img_path, bbox_inches="tight")
    print("Plotted %s" % img_path)


def _band_edges(band_params, sample_rate):
    """Return integer (lower, upper) band edges scaled by ``sample_rate``."""
    f_mins = np.abs(band_params[:, 0])
    f_maxs = f_mins + np.abs(band_params[:, 1] - band_params[:, 0])
    # The builtin ``int`` replaces the ``np.int`` alias removed in NumPy 1.24.
    lower = np.round(f_mins * sample_rate).astype(int)
    upper = np.round(f_maxs * sample_rate).astype(int)
    return lower, upper


def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args):
    """Plot the Sinc filter kernels.

    A freshly constructed ``SincConv(1, 128, 101)`` supplies the unlearned
    kernels; the learned kernels are produced by loading ``filters`` into the
    same layer. The four-panel ensemble figure is always written; per-filter
    figures are written only when ``args.all`` is set.

    Args:
        filters (torch.Tensor): Filter parameters; a tensor or an array-like
            with one (f_min, f_max) row per filter. The ensemble figure uses
            filter indices up to 126.
        sample_rate (int): Sample rate of Signal.
        args: Object with output options, read as attributes:
            ``all`` (plot every filter), ``filetype`` (image file extension)
            and ``out_folder`` (path-like supporting the ``/`` operator).
    """
    from espnet2.layers.sinc_conv import SincConv

    print("When plotting filter kernels, make sure the script has the"
          " correct SincConv settings (currently hard-coded).")
    convs = SincConv(1, 128, 101)

    # unlearned
    convs._create_filters(convs.f.device)
    pre_kernels = convs.sinc_filters.detach().numpy()
    pre_F_mins, pre_F_maxs = _band_edges(convs.f.detach().numpy(),
                                         sample_rate)

    # learned
    if isinstance(filters, torch.Tensor):
        learned = filters.detach().cpu().numpy()
    else:
        learned = np.asarray(filters)
    convs.f = torch.nn.Parameter(torch.Tensor(learned))
    convs._create_filters(convs.f.device)
    kernels = convs.sinc_filters.detach().numpy()

    F_mins, F_maxs = _band_edges(learned, sample_rate)
    # Clipping against a float bound yields floats; cast back to int because
    # the edges are used as slice indices below.
    nyquist = sample_rate / 2.0
    F_mins = np.clip(F_mins, 0, nyquist).astype(int)
    F_maxs = np.clip(F_maxs, 0, nyquist).astype(int)

    x_f = np.linspace(0.0, np.max(F_maxs), int(np.max(F_maxs)) + 1)
    x = np.arange(kernels.shape[2])
    if args.all:
        for i, kernel_row in enumerate(kernels):
            suffix = "%s.%s" % (str(i).zfill(2), args.filetype)

            # Unlearned kernel only.
            pre_kernel = pre_kernels[i][0]
            plt.clf()
            plt.xticks([])
            plt.yticks([])
            plt.plot(x, pre_kernel)
            _save_current_figure(args, "filter_pre_kernel_" + suffix)

            # Learned kernel only.
            kernel = kernel_row[0]
            plt.clf()
            plt.xticks([])
            plt.yticks([])
            plt.plot(x, kernel)
            _save_current_figure(args, "filter_kernel_" + suffix)

            # Learned kernel with the unlearned one dashed on top.
            plt.clf()
            plt.xlabel("kernel index")
            plt.plot(x, kernel)
            plt.plot(x, pre_kernel, "--", alpha=0.5)
            _save_current_figure(args, "filter_kernel_both_" + suffix)

            # Learned pass band drawn as a rectangle over frequency.
            y = np.zeros_like(x_f)
            y[F_mins[i]:F_maxs[i]] = 1.0
            plt.clf()
            plt.plot(x_f, y)
            _save_current_figure(args, "filter_freq_" + suffix)

            # Learned and unlearned pass bands together.
            pre_y = np.zeros_like(x_f)
            pre_y[pre_F_mins[i]:pre_F_maxs[i]] = 1.0
            plt.clf()
            plt.plot(x_f, y)
            plt.plot(x_f, pre_y)
            _save_current_figure(args, "filter_freq_both_" + suffix)

    # Ensemble figure: four hand-picked filters in a 2x2 grid.
    plt.clf()
    ensemble_indices = [32, 71, 113, 126]
    fig, axs = plt.subplots(2, 2, sharex=True, sharey="row")
    for axis, index in zip(axs.flat, ensemble_indices):
        axis.plot(x, kernels[index][0])
        axis.plot(x, pre_kernels[index][0], "--", alpha=0.5)

    _save_current_figure(args, "filter_kernel_ensemble2.%s" % (args.filetype))
    plt.close(fig)
class LightweightSincConvs(AbsPreEncoder):
    """Lightweight Sinc Convolutions.

    Instead of using precomputed features, end-to-end speech recognition
    can also be done directly from raw audio using sinc convolutions, as
    described in "Lightweight End-to-End Speech Recognition from Raw Audio
    Data Using Sinc-Convolutions" by Kürzinger et al.
    https://arxiv.org/abs/2010.07597

    To use Sinc convolutions in your model instead of the default f-bank
    frontend, set this module as your pre-encoder with `preencoder: sinc`
    and use the input of the sliding window frontend with
    `frontend: sliding_window` in your yaml configuration file.
    So that the process flow is:

    Frontend (SlidingWindow) -> SpecAug -> Normalization ->
    Pre-encoder (LightweightSincConvs) -> Encoder -> Decoder

    Note that this method also performs data augmentation in time domain
    (vs. in spectral domain in the default frontend).
    Use `plot_sinc_filters.py` to visualize the learned Sinc filters.
    """

    def __init__(
        self,
        fs: Union[int, str, float] = 16000,
        in_channels: int = 1,
        out_channels: int = 256,
        activation_type: str = "leakyrelu",
        dropout_type: str = "dropout",
        windowing_type: str = "hamming",
        scale_type: str = "mel",
    ):
        """Initialize the module.

        Args:
            fs: Sample rate. A string is parsed with
                ``humanfriendly.parse_size``.
            in_channels: Number of input channels.
            out_channels: Number of output channels (for each input channel).
            activation_type: Choice of activation function
                (``"leakyrelu"`` or ``"relu"``).
            dropout_type: Choice of dropout function
                (``"dropout"``, ``"spatial"`` or ``"dropout2d"``).
            windowing_type: Choice of windowing function.
            scale_type:  Choice of filter-bank initialization scale.

        Raises:
            NotImplementedError: If ``dropout_type`` or ``activation_type``
                is not one of the supported choices.
        """
        assert check_argument_types()
        super().__init__()
        if isinstance(fs, str):
            fs = humanfriendly.parse_size(fs)
        self.fs = fs
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.activation_type = activation_type
        self.dropout_type = dropout_type
        self.windowing_type = windowing_type
        self.scale_type = scale_type

        self.choices_dropout = {
            "dropout": torch.nn.Dropout,
            "spatial": SpatialDropout,
            "dropout2d": torch.nn.Dropout2d,
        }
        if dropout_type not in self.choices_dropout:
            raise NotImplementedError(
                f"Dropout type has to be one of "
                f"{list(self.choices_dropout.keys())}")

        self.choices_activation = {
            "leakyrelu": torch.nn.LeakyReLU,
            "relu": torch.nn.ReLU,
        }
        if activation_type not in self.choices_activation:
            raise NotImplementedError(
                f"Activation type has to be one of "
                f"{list(self.choices_activation.keys())}")

        # initialization
        self._create_sinc_convs()
        # Sinc filters require custom initialization
        self.espnet_initialization_fn()

    def _create_sinc_convs(self):
        """Build ``self.filters`` (SincConv) and ``self.blocks`` (full stack)."""
        blocks = OrderedDict()

        # SincConvBlock
        out_channels = 128
        self.filters = SincConv(
            self.in_channels,
            out_channels,
            kernel_size=101,
            stride=1,
            fs=self.fs,
            window_func=self.windowing_type,
            scale_type=self.scale_type,
        )
        block = OrderedDict([
            ("Filters", self.filters),
            ("LogCompression", LogCompression()),
            ("BatchNorm", torch.nn.BatchNorm1d(out_channels, affine=True)),
            ("AvgPool", torch.nn.AvgPool1d(2)),
        ])
        blocks["SincConvBlock"] = torch.nn.Sequential(block)
        in_channels = out_channels

        # First convolutional block, connects the sinc output to the front-end "body"
        out_channels = 128
        blocks["DConvBlock1"] = self.gen_lsc_block(
            in_channels,
            out_channels,
            depthwise_kernel_size=25,
            depthwise_stride=2,
            pointwise_groups=0,
            avgpool=True,
            dropout_probability=0.1,
        )
        in_channels = out_channels

        # Second convolutional block, multiple convolutional layers
        out_channels = self.out_channels
        for layer in [2, 3, 4]:
            blocks[f"DConvBlock{layer}"] = self.gen_lsc_block(
                in_channels,
                out_channels,
                depthwise_kernel_size=9,
                depthwise_stride=1)
            in_channels = out_channels

        # Third Convolutional block, acts as coupling to encoder
        out_channels = self.out_channels
        blocks["DConvBlock5"] = self.gen_lsc_block(
            in_channels,
            out_channels,
            depthwise_kernel_size=7,
            depthwise_stride=1,
            pointwise_groups=0,
        )

        self.blocks = torch.nn.Sequential(blocks)

    def gen_lsc_block(
        self,
        in_channels: int,
        out_channels: int,
        depthwise_kernel_size: int = 9,
        depthwise_stride: int = 1,
        depthwise_groups=None,
        pointwise_groups=0,
        dropout_probability: float = 0.15,
        avgpool=False,
    ):
        """Generate a convolutional block for Lightweight Sinc convolutions.

        Each block consists of either a depthwise or a depthwise-separable
        convolutions together with dropout, (batch-)normalization layer, and
        an optional average-pooling layer.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels.
            depthwise_kernel_size: Kernel size of the depthwise convolution.
            depthwise_stride: Stride of the depthwise convolution.
            depthwise_groups: Number of groups of the depthwise convolution.
                If falsy, GCD(in_channels, out_channels) is used.
            pointwise_groups: Number of groups of the pointwise convolution.
                If falsy, no pointwise convolution is added.
            dropout_probability: Dropout probability in the block.
            avgpool: If True, an AvgPool layer is inserted.

        Returns:
            torch.nn.Sequential: Neural network building block.
        """
        block = OrderedDict()
        if not depthwise_groups:
            # GCD(in_channels, out_channels) to prevent size mismatches:
            # Conv1d needs a group count dividing both channel numbers.
            depthwise_groups, r = in_channels, out_channels
            while r != 0:
                # Euclidean step; the operands must swap on every iteration.
                depthwise_groups, r = r, depthwise_groups % r
        block["depthwise"] = torch.nn.Conv1d(
            in_channels,
            out_channels,
            depthwise_kernel_size,
            depthwise_stride,
            groups=depthwise_groups,
        )
        if pointwise_groups:
            block["pointwise"] = torch.nn.Conv1d(out_channels,
                                                 out_channels,
                                                 1,
                                                 1,
                                                 groups=pointwise_groups)
        block["activation"] = self.choices_activation[self.activation_type]()
        block["batchnorm"] = torch.nn.BatchNorm1d(out_channels, affine=True)
        if avgpool:
            block["avgpool"] = torch.nn.AvgPool1d(2)
        block["dropout"] = self.choices_dropout[self.dropout_type](
            dropout_probability)
        return torch.nn.Sequential(block)

    def espnet_initialization_fn(self):
        """Initialize sinc filters with filterbank values.

        Additionally resets every affine BatchNorm1d layer in ``self.blocks``
        to weight 1 and bias 0.
        """
        self.filters.init_filters()
        for block in self.blocks:
            for layer in block:
                if isinstance(layer, torch.nn.BatchNorm1d) and layer.affine:
                    layer.weight.data[:] = 1.0
                    layer.bias.data[:] = 0.0

    def forward(
            self, input: torch.Tensor,
            input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply Lightweight Sinc Convolutions.

        The input shall be formatted as (B, T, C_in, D_in)
        with B as batch size, T as time dimension, C_in as channels,
        and D_in as feature dimension.

        The output will then be (B, T, C_out*D_out)
        with C_out and D_out as output dimensions.

        The current module structure only handles D_in=400, so that D_out=1.
        Remark for the multichannel case: C_out is the number of out_channels
        given at initialization multiplied with C_in.

        Args:
            input: Tensor of shape (B, T, C_in, D_in).
            input_lengths: Passed through unchanged.

        Returns:
            Tuple of the output tensor and the unmodified ``input_lengths``.
        """
        # Transform input data:
        #   (B, T, C_in, D_in) -> (B*T, C_in, D_in)
        B, T, C_in, D_in = input.size()
        input_frames = input.view(B * T, C_in, D_in)
        # Call the module (not .forward) so registered hooks are honored.
        output_frames = self.blocks(input_frames)

        # ---TRANSFORM: (B*T, C_out, D_out) -> (B, T, C_out*D_out)
        _, C_out, D_out = output_frames.size()
        output_frames = output_frames.view(B, T, C_out * D_out)
        return output_frames, input_lengths  # no state in this layer

    def output_size(self) -> int:
        """Get the output size."""
        return self.out_channels * self.in_channels
예제 #6
0
def test_sinc_filter_output_size():
    """Check the output dimension SincConv reports for 400 input samples."""
    layer = SincConv(in_channels=1, out_channels=128, kernel_size=101)
    reported_odim = layer.get_odim(400)
    assert reported_odim == 300
예제 #7
0
def test_sinc_filter_static_functions():
    """Smoke-test the static window and sinc helpers of SincConv."""
    num_points = 400
    points = torch.linspace(1, num_points, num_points)
    no_window = SincConv.none_window(points)
    print(f"no window function: {no_window}")
    hamming = SincConv.hamming_window(points)
    print(f"hamming window function: {hamming}")
    SincConv.sinc(torch.tensor(50.0))