Example #1
    def __init__(self, etype, idim, elayers, eunits, eprojs, subsample, dropout, in_channel=1):
        super(Encoder, self).__init__()
        typ = etype.lstrip("vgg").lstrip("sinc").rstrip("p")
        logging.info("Error: hello")
        if typ not in ['lstm', 'gru', 'blstm', 'bgru', 'bligru', 'blstm']:
            logging.error("Error: need to specify an appropriate encoder architecture")

        if etype.startswith("vgg"):
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList([VGG2L(in_channel),
                                                RNNP(get_vgg2l_odim(idim, in_channel=in_channel), elayers, eunits,
                                                     eprojs,
                                                     subsample, dropout, typ=typ)])
                logging.info('Use CNN-VGG + ' + typ.upper() + 'P for encoder')
            elif etype[-1] == "s":
                self.enc = torch.nn.ModuleList([SincNet(sincnet=True, nb_filters=512),
                                                RNN(256, elayers, eunits,
                                                    eprojs,
                                                    dropout, typ=typ)])
                logging.info('Use Raw SincNet + ' + typ.upper() + ' for encoder')
            else:
                self.enc = torch.nn.ModuleList([VGG2L(in_channel),
                                                RNN(get_vgg2l_odim(idim, in_channel=in_channel), elayers, eunits,
                                                    eprojs,
                                                    dropout, typ=typ)])
                logging.info('Use CNN-VGG + ' + typ.upper() + ' for encoder')
        else:
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList(
                    [RNNP(idim, elayers, eunits, eprojs, subsample, dropout, typ=typ)])
                logging.info(typ.upper() + ' with every-layer projection for encoder')
            else:
                self.enc = torch.nn.ModuleList([RNN(idim, elayers, eunits, eprojs, dropout, typ=typ)])
                logging.info(typ.upper() + ' without projection for encoder')
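
A minimal usage sketch for the constructor above (the same positional signature recurs in the next few examples); all values are hypothetical, and it assumes Encoder, VGG2L, RNN, RNNP, and get_vgg2l_odim are importable from the surrounding module:

    # Hypothetical values, not taken from the snippet itself.
    elayers = 4
    subsample = [1] * (elayers + 1)      # assumed: one factor per layer plus input, no frame skipping
    encoder = Encoder(
        etype="vggblstmp",               # VGG2L front-end + projected BLSTM layers
        idim=83,                         # e.g. 80-dim fbank + 3-dim pitch features
        elayers=elayers,
        eunits=320,
        eprojs=320,
        subsample=subsample,
        dropout=0.1,
    )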
Example #2
    def __init__(self,
                 etype,
                 idim,
                 elayers,
                 eunits,
                 eprojs,
                 subsample,
                 dropout,
                 in_channel=1):
        super(Encoder, self).__init__()
        typ = etype.lstrip("vgg").rstrip("p")
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error(
                "Error: need to specify an appropriate encoder architecture")

        if etype.startswith("vgg"):
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList([
                    VGG2L(in_channel),
                    RNNP(
                        get_vgg2l_odim(idim, in_channel=in_channel),
                        elayers,
                        eunits,
                        eprojs,
                        subsample,
                        dropout,
                        typ=typ,
                    ),
                ])
                logging.info("Use CNN-VGG + " + typ.upper() + "P for encoder")
            else:
                self.enc = torch.nn.ModuleList([
                    VGG2L(in_channel),
                    RNN(
                        get_vgg2l_odim(idim, in_channel=in_channel),
                        elayers,
                        eunits,
                        eprojs,
                        dropout,
                        typ=typ,
                    ),
                ])
                logging.info("Use CNN-VGG + " + typ.upper() + " for encoder")
        else:
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList([
                    RNNP(idim,
                         elayers,
                         eunits,
                         eprojs,
                         subsample,
                         dropout,
                         typ=typ)
                ])
                logging.info(typ.upper() +
                             " with every-layer projection for encoder")
            else:
                self.enc = torch.nn.ModuleList(
                    [RNN(idim, elayers, eunits, eprojs, dropout, typ=typ)])
                logging.info(typ.upper() + " without projection for encoder")
Example #3
    def __init__(self,
                 etype,
                 idim,
                 elayers,
                 eunits,
                 eprojs,
                 subsample,
                 dropout,
                 in_channel=1):
        super(Encoder, self).__init__()
        with self.init_scope():
            if etype == 'blstm':
                self.enc1 = BLSTM(idim, elayers, eunits, eprojs, dropout)
                logging.info('BLSTM without projection for encoder')
            elif etype == 'blstmp':
                self.enc1 = BLSTMP(idim, elayers, eunits, eprojs, subsample,
                                   dropout)
                logging.info('BLSTM with every-layer projection for encoder')
            elif etype == 'vggblstmp':
                self.enc1 = VGG2L(in_channel)
                self.enc2 = BLSTMP(get_vgg2l_odim(idim, in_channel=in_channel),
                                   elayers, eunits, eprojs, subsample, dropout)
                logging.info('Use CNN-VGG + BLSTMP for encoder')
            elif etype == 'vggblstm':
                self.enc1 = VGG2L(in_channel)
                self.enc2 = BLSTM(get_vgg2l_odim(idim, in_channel=in_channel),
                                  elayers, eunits, eprojs, dropout)
                logging.info('Use CNN-VGG + BLSTM for encoder')
            else:
                logging.error(
                    "Error: need to specify an appropriate encoder archtecture"
                )
                sys.exit()

        self.etype = etype
Example #4
 def __init__(self,
              etype,
              idim,
              elayers,
              eunits,
              eprojs,
              subsample,
              dropout,
              in_channel=1):
     super(Encoder, self).__init__()
     typ = etype.lstrip("vgg").rstrip("p")
     if typ not in ['lstm', 'gru', 'blstm', 'bgru']:
         logging.error(
             "Error: need to specify an appropriate encoder architecture")
     with self.init_scope():
         if etype.startswith("vgg"):
             if etype[-1] == "p":
                 self.enc = chainer.Sequential(
                     VGG2L(in_channel),
                     RNNP(get_vgg2l_odim(idim, in_channel=in_channel),
                          elayers,
                          eunits,
                          eprojs,
                          subsample,
                          dropout,
                          typ=typ))
                 logging.info('Use CNN-VGG + ' + typ.upper() +
                              'P for encoder')
             else:
                 self.enc = chainer.Sequential(
                     VGG2L(in_channel),
                     RNN(get_vgg2l_odim(idim, in_channel=in_channel),
                         elayers,
                         eunits,
                         eprojs,
                         dropout,
                         typ=typ))
                 logging.info('Use CNN-VGG + ' + typ.upper() +
                              ' for encoder')
         else:
             if etype[-1] == "p":
                 self.enc = chainer.Sequential(
                     RNNP(idim,
                          elayers,
                          eunits,
                          eprojs,
                          subsample,
                          dropout,
                          typ=typ))
                 logging.info(typ.upper() +
                              ' with every-layer projection for encoder')
             else:
                 self.enc = chainer.Sequential(
                     RNN(idim, elayers, eunits, eprojs, dropout, typ=typ))
                 logging.info(typ.upper() +
                              ' without projection for encoder')
Example #5
 def __init__(self,
              etype,
              idim,
              elayers,
              eunits,
              eprojs,
              subsample,
              dropout,
              in_channel=1):
     super(Encoder, self).__init__()
     typ = etype.lstrip("vgg").lstrip("b").rstrip("p")
     if typ != "lstm" and typ != "gru":
         logging.error(
             "Error: need to specify an appropriate encoder architecture")
     if etype.startswith("vgg"):
         if etype[-1] == "p":
             self.enc = torch.nn.ModuleList([
                 VGG2L(in_channel),
                 BRNNP(get_vgg2l_odim(idim, in_channel=in_channel),
                       elayers,
                       eunits,
                       eprojs,
                       subsample,
                       dropout,
                       typ=typ)
             ])
             logging.info('Use CNN-VGG + B' + typ.upper() + 'P for encoder')
         else:
             self.enc = torch.nn.ModuleList([
                 VGG2L(in_channel),
                 BRNN(get_vgg2l_odim(idim, in_channel=in_channel),
                      elayers,
                      eunits,
                      eprojs,
                      dropout,
                      typ=typ)
             ])
             logging.info('Use CNN-VGG + B' + typ.upper() + ' for encoder')
     else:
         if etype[-1] == "p":
             self.enc = torch.nn.ModuleList([
                 BRNNP(idim,
                       elayers,
                       eunits,
                       eprojs,
                       subsample,
                       dropout,
                       typ=typ)
             ])
             logging.info('B' + typ.upper() +
                          ' with every-layer projection for encoder')
         else:
             self.enc = torch.nn.ModuleList(
                 [BRNN(idim, elayers, eunits, eprojs, dropout, typ=typ)])
             logging.info('B' + typ.upper() +
                          ' without projection for encoder')
Example #6
    def __init__(
        self,
        input_size: int,
        rnn_type: str = "lstm",
        bidirectional: bool = True,
        use_projection: bool = True,
        num_layers: int = 4,
        hidden_size: int = 320,
        output_size: int = 320,
        dropout: float = 0.0,
        in_channel: int = 1,
    ):
        assert check_argument_types()
        super().__init__()
        self._output_size = output_size
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.use_projection = use_projection
        if rnn_type not in {"lstm", "gru"}:
            raise ValueError(f"Not supported rnn_type={rnn_type}")

        # Subsample is not used for VGGRNN
        subsample = np.ones(num_layers + 1, dtype=np.int64)
        rnn_type = ("b" if bidirectional else "") + rnn_type
        if use_projection:
            self.enc = torch.nn.ModuleList([
                VGG2L(in_channel),
                RNNP(
                    get_vgg2l_odim(input_size, in_channel=in_channel),
                    num_layers,
                    hidden_size,
                    output_size,
                    subsample,
                    dropout,
                    typ=rnn_type,
                ),
            ])

        else:
            self.enc = torch.nn.ModuleList([
                VGG2L(in_channel),
                RNN(
                    get_vgg2l_odim(input_size, in_channel=in_channel),
                    num_layers,
                    hidden_size,
                    output_size,
                    dropout,
                    typ=rnn_type,
                ),
            ])
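
A usage sketch for this keyword-argument variant; the class name VGGRNNEncoder is an assumption (it is not shown in the snippet), and the feature dimension is illustrative. Note that subsample is fixed to all-ones internally, so only the input size and RNN options need to be supplied:

    # Hypothetical instantiation of the snippet's keyword-argument encoder.
    encoder = VGGRNNEncoder(
        input_size=80,        # e.g. 80-dim log-mel filterbank features
        rnn_type="lstm",
        bidirectional=True,   # becomes "blstm" internally
        use_projection=True,  # selects RNNP instead of RNN
        num_layers=4,
        hidden_size=320,
        output_size=320,
        dropout=0.0,
    )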
Example #7
    def __init__(
        self,
        etype,
        idim,
        elayers_sd,
        elayers_rec,
        eunits,
        eprojs,
        subsample,
        dropout,
        num_spkrs=2,
        in_channel=1,
    ):
        """Initialize the encoder of single-channel multi-speaker ASR."""
        super(EncoderMix, self).__init__()
        typ = etype.lstrip("vgg").rstrip("p")
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error(
                "Error: need to specify an appropriate encoder architecture")
        if etype.startswith("vgg"):
            if etype[-1] == "p":
                self.enc_mix = torch.nn.ModuleList([VGG2L(in_channel)])
                self.enc_sd = torch.nn.ModuleList([
                    torch.nn.ModuleList([
                        RNNP(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers_sd,
                            eunits,
                            eprojs,
                            subsample[:elayers_sd + 1],
                            dropout,
                            typ=typ,
                        )
                    ]) for i in range(num_spkrs)
                ])
                self.enc_rec = torch.nn.ModuleList([
                    RNNP(
                        eprojs,
                        elayers_rec,
                        eunits,
                        eprojs,
                        subsample[elayers_sd:],
                        dropout,
                        typ=typ,
                    )
                ])
                logging.info("Use CNN-VGG + B" + typ.upper() + "P for encoder")
            else:
                logging.error(
                    f"Error: need to specify an appropriate encoder architecture. "
                    f"Illegal name {etype}")
                sys.exit()
        else:
            logging.error(
                f"Error: need to specify an appropriate encoder architecture. "
                f"Illegal name {etype}")
            sys.exit()

        self.num_spkrs = num_spkrs
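
A sketch of how the multi-speaker constructor above might be called; the values are hypothetical. The constructor exits unless etype starts with "vgg" and ends with "p", and the slicing of subsample implies it should cover both the speaker-differentiating and recognition stacks, so a length of elayers_sd + elayers_rec + 1 is assumed here:

    # Hypothetical two-speaker configuration.
    elayers_sd, elayers_rec = 2, 2
    subsample = [1] * (elayers_sd + elayers_rec + 1)   # no frame subsampling
    encoder = EncoderMix(
        etype="vggblstmp",     # VGG2L front-end + projected BLSTM layers
        idim=83,
        elayers_sd=elayers_sd,
        elayers_rec=elayers_rec,
        eunits=320,
        eprojs=320,
        subsample=subsample,
        dropout=0.1,
        num_spkrs=2,
    )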
Example #8
    def __init__(self,
                 etype,
                 idim,
                 elayers_sd,
                 elayers_rec,
                 eunits,
                 eprojs,
                 subsample,
                 dropout,
                 num_spkrs=2,
                 in_channel=1):
        super(Encoder, self).__init__()
        typ = etype.lstrip("vgg").lstrip("b").rstrip("p")
        if typ != "lstm" and typ != "gru":
            logging.error(
                "Error: need to specify an appropriate encoder architecture")
        if etype.startswith("vgg"):
            if etype[-1] == "p":
                self.enc_mix = torch.nn.ModuleList([VGG2L(in_channel)])
                self.enc_sd = torch.nn.ModuleList([
                    torch.nn.ModuleList([
                        RNNP(get_vgg2l_odim(idim, in_channel=in_channel),
                             elayers_sd,
                             eunits,
                             eprojs,
                             subsample[:elayers_sd + 1],
                             dropout,
                             typ=typ)
                    ]) for i in range(num_spkrs)
                ])
                self.enc_rec = torch.nn.ModuleList([
                    RNNP(eprojs,
                         elayers_rec,
                         eunits,
                         eprojs,
                         subsample[elayers_sd:],
                         dropout,
                         typ=typ)
                ])
                logging.info('Use CNN-VGG + B' + typ.upper() + 'P for encoder')
        else:
            logging.error(
                "Error: need to specify an appropriate encoder architecture")
            sys.exit()

        self.num_spkrs = num_spkrs
Example #9
    def __init__(
        self,
        idim: int,
        etype: str,
        elayers: int,
        eunits: int,
        eprojs: int,
        subsample: np.ndarray,
        dropout_rate: float = 0.0,
        aux_enc_output_layers: List = [],
    ):
        """Initialize Encoder module."""
        super(Encoder, self).__init__()

        rnn_type = etype.lstrip("vgg").rstrip("p")
        in_channel = 1

        if etype.startswith("vgg"):
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList([
                    VGG2L(in_channel),
                    RNNP(
                        get_vgg2l_odim(idim, in_channel=in_channel),
                        rnn_type,
                        elayers,
                        eunits,
                        eprojs,
                        subsample,
                        dropout_rate=dropout_rate,
                        aux_output_layers=aux_enc_output_layers,
                    ),
                ])
            else:
                self.enc = torch.nn.ModuleList([
                    VGG2L(in_channel),
                    RNN(
                        get_vgg2l_odim(idim, in_channel=in_channel),
                        rnn_type,
                        elayers,
                        eunits,
                        eprojs,
                        dropout_rate=dropout_rate,
                        aux_output_layers=aux_enc_output_layers,
                    ),
                ])
            self.conv_subsampling_factor = 4
        else:
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList([
                    RNNP(
                        idim,
                        rnn_type,
                        elayers,
                        eunits,
                        eprojs,
                        subsample,
                        dropout_rate=dropout_rate,
                        aux_output_layers=aux_enc_output_layers,
                    )
                ])
            else:
                self.enc = torch.nn.ModuleList([
                    RNN(
                        idim,
                        rnn_type,
                        elayers,
                        eunits,
                        eprojs,
                        dropout_rate=dropout_rate,
                        aux_output_layers=aux_enc_output_layers,
                    )
                ])
            self.conv_subsampling_factor = 1
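
A usage sketch for this transducer-oriented variant; note that the argument order differs from the earlier snippets (idim comes before etype). The feature dimension is illustrative, and aux_enc_output_layers is assumed here to list encoder layers whose intermediate outputs should also be exposed:

    import numpy as np

    # Hypothetical call; layer index 2 for the auxiliary output is an assumption.
    elayers = 4
    encoder = Encoder(
        idim=80,
        etype="vggblstmp",
        elayers=elayers,
        eunits=320,
        eprojs=320,
        subsample=np.ones(elayers + 1, dtype=np.int64),
        dropout_rate=0.1,
        aux_enc_output_layers=[2],
    )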
Example #10
    def __init__(
        self,
        etype: str,
        idim: int,
        elayers: int,
        eunits: int,
        eprojs: int,
        subsample: np.ndarray,
        dropout: float,
        in_channel: int = 1,
        aux_task_layer_list: List = [],
    ):
        """Initialize Encoder module."""
        super(Encoder, self).__init__()

        typ = etype.lstrip("vgg").rstrip("p")
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error(
                "Error: need to specify an appropriate encoder architecture")

        if etype.startswith("vgg"):
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList([
                    VGG2L(in_channel),
                    RNNP(
                        get_vgg2l_odim(idim, in_channel=in_channel),
                        elayers,
                        eunits,
                        eprojs,
                        subsample,
                        dropout,
                        typ=typ,
                        aux_task_layer_list=aux_task_layer_list,
                    ),
                ])
                logging.info("Use CNN-VGG + " + typ.upper() + "P for encoder")
            else:
                self.enc = torch.nn.ModuleList([
                    VGG2L(in_channel),
                    RNN(
                        get_vgg2l_odim(idim, in_channel=in_channel),
                        elayers,
                        eunits,
                        eprojs,
                        dropout,
                        typ=typ,
                        aux_task_layer_list=aux_task_layer_list,
                    ),
                ])
                logging.info("Use CNN-VGG + " + typ.upper() + " for encoder")
            self.conv_subsampling_factor = 4
        else:
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList([
                    RNNP(
                        idim,
                        elayers,
                        eunits,
                        eprojs,
                        subsample,
                        dropout,
                        typ=typ,
                        aux_task_layer_list=aux_task_layer_list,
                    )
                ])
                logging.info(typ.upper() +
                             " with every-layer projection for encoder")
            else:
                self.enc = torch.nn.ModuleList([
                    RNN(
                        idim,
                        elayers,
                        eunits,
                        eprojs,
                        dropout,
                        typ=typ,
                        aux_task_layer_list=aux_task_layer_list,
                    )
                ])
                logging.info(typ.upper() + " without projection for encoder")
            self.conv_subsampling_factor = 1
Example #11
    def __init__(self,
                 etype,
                 idim,
                 elayers,
                 eunits,
                 eprojs,
                 subsample,
                 dropout,
                 in_channel=1,
                 apply_bn=False):
        super(Encoder, self).__init__()
        typ = etype.lstrip("vgg").lstrip("cnn").rstrip("p")
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error(
                "Error: need to specify an appropriate encoder architecture")

        if etype.startswith("vgg"):
            conv_layers = VGG2L(in_channel)
            out_channel = 128
        elif etype.startswith("cnn"):
            conv_layers = Conv2L(in_channel, idim, apply_bn)
            out_channel = 32
        else:
            conv_layers = None
        if conv_layers:
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList([
                    conv_layers,
                    RNNP(
                        get_vgg2l_odim(idim,
                                       in_channel=in_channel,
                                       out_channel=out_channel),
                        elayers,
                        eunits,
                        eprojs,
                        subsample,
                        dropout,
                        typ=typ,
                        apply_bn=apply_bn,
                    ),
                ])
                logging.info("Use CNN-VGG + " + typ.upper() + "P for encoder")
            else:
                self.enc = torch.nn.ModuleList([
                    conv_layers,
                    RNN(
                        get_vgg2l_odim(idim,
                                       in_channel=in_channel,
                                       out_channel=out_channel),
                        elayers,
                        eunits,
                        eprojs,
                        dropout,
                        typ=typ,
                    ),
                ])
                logging.info("Use CNN-VGG + " + typ.upper() + " for encoder")
            self.conv_subsampling_factor = 4
        else:
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList([
                    RNNP(idim,
                         elayers,
                         eunits,
                         eprojs,
                         subsample,
                         dropout,
                         typ=typ,
                         apply_bn=apply_bn)
                ])
                logging.info(typ.upper() +
                             " with every-layer projection for encoder")
            else:
                self.enc = torch.nn.ModuleList(
                    [RNN(idim, elayers, eunits, eprojs, dropout, typ=typ)])
                logging.info(typ.upper() + " without projection for encoder")
            self.conv_subsampling_factor = 1
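
A usage sketch for this last variant, which adds a plain Conv2L front-end and optional batch normalization; the etype "cnnblstmp" and the dimensions are illustrative only:

    # Hypothetical call exercising the "cnn" front-end with batch normalization.
    elayers = 3
    encoder = Encoder(
        etype="cnnblstmp",               # Conv2L front-end + projected BLSTM
        idim=80,
        elayers=elayers,
        eunits=320,
        eprojs=320,
        subsample=[1] * (elayers + 1),   # assumed: one factor per layer plus input
        dropout=0.2,
        apply_bn=True,                   # passed through to Conv2L and RNNP
    )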