Example #1
    def __init__(self, config, input_emb_size, mix_speech_len, tgt_vocab_size, use_cuda, pretrain=None, score_fn=''):
        super(seq2seq, self).__init__()
        if pretrain is not None:
            src_embedding = pretrain['src_emb']
            tgt_embedding = pretrain['tgt_emb']
        else:
            src_embedding = None
            tgt_embedding = None
        self.encoder = models.rnn_encoder(config, input_emb_size, None, embedding=src_embedding)
        if not config.shared_vocab:
            self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=tgt_embedding, score_fn=score_fn)
        else:
            # Shared vocabulary: tie the decoder to the encoder's embedding table.
            self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=self.encoder.embedding,
                                              score_fn=score_fn)
        self.use_cuda = use_cuda
        self.tgt_vocab_size = tgt_vocab_size
        self.config = config
        self.criterion = models.criterion(tgt_vocab_size, use_cuda, config.loss)
        self.loss_for_ss = nn.MSELoss()
        # NOTE: an explicit dim avoids the deprecated implicit-dim behaviour of
        # nn.LogSoftmax; the vocabulary axis is assumed to be the last one.
        self.log_softmax = nn.LogSoftmax(dim=-1)
        self.wav_loss = models.WaveLoss(dBscale=1, nfft=config.FRAME_LENGTH, hop_size=config.FRAME_SHIFT)

        speech_fre = input_emb_size
        num_labels = tgt_vocab_size
        if config.use_tas:
            # TasNet path: time-domain separation network.
            self.ss_model = models.ConvTasNet()
        else:
            self.ss_model = models.SS(config, speech_fre, mix_speech_len, num_labels)
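
The shared_vocab branch above ties the decoder to the encoder's embedding table. A minimal runnable sketch of that weight-tying idea with plain nn.Embedding modules (the sizes are made up for illustration; models.rnn_decoder itself is the repository's own class):

import torch
import torch.nn as nn

vocab_size, emb_dim = 1000, 256
encoder_emb = nn.Embedding(vocab_size, emb_dim)

# Passing the encoder's embedding to the decoder (as the constructor above does
# via embedding=self.encoder.embedding) means both sides share one parameter
# tensor: a gradient step through either path updates both.
decoder_emb = encoder_emb

tokens = torch.randint(0, vocab_size, (4, 7))
assert decoder_emb(tokens).shape == (4, 7, emb_dim)
assert decoder_emb.weight is encoder_emb.weight  # same storage, i.e. tied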
Example #2
    def __init__(self,
                 config,
                 input_emb_size,
                 mix_speech_len,
                 tgt_vocab_size,
                 use_cuda,
                 pretrain=None,
                 score_fn=''):
        super(seq2seq, self).__init__()
        if pretrain is not None:
            src_embedding = pretrain['src_emb']
            tgt_embedding = pretrain['tgt_emb']
        else:
            src_embedding = None
            tgt_embedding = None
        # self.encoder = models.rnn_encoder(config, input_emb_size, None, embedding=src_embedding)
        if config.is_two_channel:
            # Two-channel input doubles the per-frame feature dimension.
            self.encoder = models.TransEncoder(config, 2 * input_emb_size)
        else:
            self.encoder = models.TransEncoder(config, input_emb_size)
        self.decoder = models.TransDecoder(config,
                                           sos_id=0,
                                           eos_id=tgt_vocab_size - 1,
                                           n_tgt_vocab=tgt_vocab_size)
        # if config.shared_vocab == False:
        #     self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=tgt_embedding, score_fn=score_fn)
        # else:
        #     self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=self.encoder.embedding,
        #                                       score_fn=score_fn)
        self.use_cuda = use_cuda
        self.tgt_vocab_size = tgt_vocab_size
        self.config = config
        self.criterion = models.criterion(tgt_vocab_size, use_cuda,
                                          config.loss)
        self.loss_for_ss = nn.MSELoss()
        # NOTE: an explicit dim avoids the deprecated implicit-dim behaviour of
        # nn.LogSoftmax; the vocabulary axis is assumed to be the last one.
        self.log_softmax = nn.LogSoftmax(dim=-1)
        self.wav_loss = models.WaveLoss(dBscale=1,
                                        nfft=config.FRAME_LENGTH,
                                        hop_size=config.FRAME_SHIFT)

        speech_fre = input_emb_size
        num_labels = tgt_vocab_size
        if config.is_two_channel:
            # Separate heads for the real and imaginary parts; each predicts
            # 2 * speech_fre values (presumably one spectrum per separated source).
            self.separation_linear_real = nn.Linear(self.encoder.d_model,
                                                    2 * speech_fre)
            self.separation_linear_imag = nn.Linear(self.encoder.d_model,
                                                    2 * speech_fre)
        else:
            self.separation_linear = nn.Linear(self.encoder.d_model,
                                               2 * speech_fre)
        self.speech_fre = speech_fre
        self.dropout_layer = nn.Dropout(config.linear_dropout)
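
A sketch of how the two-channel separation heads above might be applied to an encoder output. d_model, speech_fre and the batch/frame sizes are illustrative assumptions; the reshape into two sources follows the 2 * speech_fre output width of each head:

import torch
import torch.nn as nn

d_model, speech_fre = 512, 129      # assumed sizes
batch, frames = 8, 100

sep_real = nn.Linear(d_model, 2 * speech_fre)
sep_imag = nn.Linear(d_model, 2 * speech_fre)
dropout = nn.Dropout(0.1)

enc_out = torch.randn(batch, frames, d_model)   # stand-in for TransEncoder output
real = sep_real(dropout(enc_out))               # (batch, frames, 2 * speech_fre)
imag = sep_imag(dropout(enc_out))

# Split the doubled frequency axis into two separated sources.
real = real.view(batch, frames, 2, speech_fre)
imag = imag.view(batch, frames, 2, speech_fre)
print(real.shape, imag.shape)  # torch.Size([8, 100, 2, 129]) for each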
Example #3
    def __init__(self, config, input_emb_size, mix_speech_len, tgt_vocab_size, use_cuda, pretrain=None, score_fn=''):
        super(seq2seq_music, self).__init__()
        if pretrain is not None:
            src_embedding = pretrain['src_emb']
            tgt_embedding = pretrain['tgt_emb']
        else:
            src_embedding = None
            tgt_embedding = None
        self.use_cuda = use_cuda
        self.tgt_vocab_size = tgt_vocab_size
        self.config = config
        self.criterion = models.criterion(tgt_vocab_size, use_cuda, config.loss)
        self.loss_for_ss = nn.MSELoss()
        # NOTE: an explicit dim avoids the deprecated implicit-dim behaviour of
        # nn.LogSoftmax; the vocabulary axis is assumed to be the last one.
        self.log_softmax = nn.LogSoftmax(dim=-1)
        self.wav_loss = models.WaveLoss(dBscale=1, nfft=config.FRAME_LENGTH, hop_size=config.FRAME_SHIFT)

        speech_fre = input_emb_size
        num_labels = tgt_vocab_size
        if config.use_tas:
            if self.config.use_dprnn:
                self.ss_model = models.FaSNet_base(config)
                self.spk_lstm = nn.LSTMCell(self.ss_model.B + self.ss_model.N, self.ss_model.B)  # LSTM stepping over the speaker dimension
            else:
                self.ss_model = models.ConvTasNet_music(config)
                if self.config.two_stage:
                    self.second_ss_model = models.ConvTasNet_2nd(config)
                    # NOTE: self.encoder and self.decoder are never created in
                    # this constructor, so this freezing block would raise
                    # AttributeError unless a subclass defines them first.
                    for p in self.encoder.parameters():
                        p.requires_grad = False
                    for p in self.decoder.parameters():
                        p.requires_grad = False
                    for p in self.ss_model.parameters():
                        p.requires_grad = False
                self.spk_lstm = nn.LSTMCell(self.ss_model.B + self.ss_model.N, self.ss_model.B)  # LSTM stepping over the speaker dimension
        else:
            # self.ss_model = models.SS_att(config, speech_fre, mix_speech_len, num_labels)
            self.ss_model = models.SS(config, speech_fre, mix_speech_len, num_labels)
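
The spk_lstm above is an nn.LSTMCell that is advanced once per output speaker, carrying state from one extracted source to the next. A runnable sketch of that stepping pattern; B, N and the feature tensors are made-up stand-ins for the TasNet bottleneck/encoder sizes:

import torch
import torch.nn as nn

B, N, batch, n_speakers = 128, 256, 4, 2   # assumed sizes
spk_lstm = nn.LSTMCell(B + N, B)

h = torch.zeros(batch, B)
c = torch.zeros(batch, B)
for step in range(n_speakers):
    # Per-step input: bottleneck features (B) concatenated with encoder
    # features (N), matching the B + N input width in the constructor.
    feats = torch.randn(batch, B + N)
    h, c = spk_lstm(feats, (h, c))  # h can condition extraction of the next speaker
print(h.shape)  # torch.Size([4, 128])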
Example #4
    def __init__(self,
                 config,
                 input_emb_size,
                 mix_speech_len,
                 tgt_vocab_size,
                 use_cuda,
                 pretrain=None,
                 score_fn=''):
        super(seq2seq, self).__init__()
        if pretrain is not None:
            src_embedding = pretrain['src_emb']
            tgt_embedding = pretrain['tgt_emb']
        else:
            src_embedding = None
            tgt_embedding = None
        # self.encoder = models.rnn_encoder(config, input_emb_size, None, embedding=src_embedding)
        self.encoder = models.TransEncoder(config, input_emb_size)
        self.decoder = models.TransDecoder(config,
                                           sos_id=0,
                                           eos_id=tgt_vocab_size - 1,
                                           n_tgt_vocab=tgt_vocab_size)
        # if config.shared_vocab == False:
        #     self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=tgt_embedding, score_fn=score_fn)
        # else:
        #     self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=self.encoder.embedding,
        #                                       score_fn=score_fn)
        self.use_cuda = use_cuda
        self.tgt_vocab_size = tgt_vocab_size
        self.config = config
        self.criterion = models.criterion(tgt_vocab_size, use_cuda,
                                          config.loss)
        self.loss_for_ss = nn.MSELoss()
        # NOTE: an explicit dim avoids the deprecated implicit-dim behaviour of
        # nn.LogSoftmax; the vocabulary axis is assumed to be the last one.
        self.log_softmax = nn.LogSoftmax(dim=-1)
        self.wav_loss = models.WaveLoss(dBscale=1,
                                        nfft=config.FRAME_LENGTH,
                                        hop_size=config.FRAME_SHIFT)

        speech_fre = input_emb_size
        num_labels = tgt_vocab_size
        if config.use_tas:
            if self.config.use_dprnn:
                self.ss_model = models.FaSNet_base(config)
                if self.config.two_stage:
                    self.second_ss_model = models.FaSNet_base_2nd(config)
                    # Two-stage training: freeze the first-stage encoder,
                    # decoder and separation model so only the second
                    # separation model receives gradients.
                    for p in self.encoder.parameters():
                        p.requires_grad = False
                    for p in self.decoder.parameters():
                        p.requires_grad = False
                    for p in self.ss_model.parameters():
                        p.requires_grad = False
            else:
                self.ss_model = models.ConvTasNet(config)
                if self.config.two_stage:
                    self.second_ss_model = models.ConvTasNet_2nd(config)
                    # Same freezing pattern as the DPRNN branch above.
                    for p in self.encoder.parameters():
                        p.requires_grad = False
                    for p in self.decoder.parameters():
                        p.requires_grad = False
                    for p in self.ss_model.parameters():
                        p.requires_grad = False
        else:
            # self.ss_model = models.SS_att(config, speech_fre, mix_speech_len, num_labels)
            self.ss_model = models.SS(config, speech_fre, mix_speech_len,
                                      num_labels)
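
The two_stage branches above freeze every first-stage parameter so that only second_ss_model trains. A minimal runnable sketch of that freezing pattern with stand-in modules; the module names here are placeholders, not the repository's API:

import torch.nn as nn
import torch.optim as optim

first_stage = nn.ModuleList([nn.Linear(10, 10) for _ in range(3)])  # stand-ins
second_stage = nn.Linear(10, 10)

# Freeze every first-stage parameter, exactly as the loops above do.
for p in first_stage.parameters():
    p.requires_grad = False

# Build the optimizer over trainable parameters only, so frozen weights get
# neither updates nor optimizer state.
trainable = [p for p in second_stage.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable, lr=1e-3)
print(sum(p.numel() for p in trainable))  # 110: only second-stage parameters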