예제 #1
0
 def __init__(self, speech_encoder, config):
     """Assemble the speech encoder stack, the text decoder and the optimizer.

     Args:
         speech_encoder: object stored as-is in ``self.SpeechEncoderBottom``.
         config: mapping read for three keys: ``'SpeechEncoderTop'`` and
             ``'TextDecoder'`` (each a mapping of keyword arguments for the
             matching constructor) and ``'lr'`` (passed to Adam as ``lr``).
     """
     super(SpeechTranscriber, self).__init__()
     self.config = config

     # Attribute assignment order is kept: bottom encoder, top encoder,
     # then decoder.
     self.SpeechEncoderBottom = speech_encoder
     top_kwargs = config['SpeechEncoderTop']
     self.SpeechEncoderTop = SpeechEncoderTop(**top_kwargs)
     decoder_kwargs = config['TextDecoder']
     self.TextDecoder = DecoderWithAttn(**decoder_kwargs)

     # The optimizer is built last so that self.parameters() is queried
     # only after every attribute above has been set.
     # NOTE(review): assumes the enclosing class is a torch nn.Module
     # (header not visible in this snippet) - confirm.
     trainable = self.parameters()
     self.optimizer = optim.Adam(trainable, lr=config['lr'])
예제 #2
0
 def __init__(self, speech_encoder, config):
     """Assemble the speech encoder stack, the correlation encoder and the optimizer.

     Args:
         speech_encoder: object stored as-is in ``self.SpeechEncoderBottom``.
         config: mapping read for two keys: ``'SpeechEncoderTop'`` (a mapping
             of keyword arguments for ``SpeechEncoderTop``) and ``'lr'``
             (passed to Adam as ``lr``).
     """
     super(SpeechCorrText, self).__init__()
     self.config = config

     # Attribute assignment order is kept: bottom encoder, top encoder,
     # then the correlation encoder.
     self.SpeechEncoderBottom = speech_encoder
     top_kwargs = config['SpeechEncoderTop']
     self.SpeechEncoderTop = SpeechEncoderTop(**top_kwargs)

     # The sizes are fixed in code rather than read from config: the
     # target size is four times the input size (1024 -> 4096).
     corr_size = 1024
     self.CorrEncoder = ImageEncoder(size=corr_size,
                                     size_target=corr_size * 4)

     # The optimizer is built last so that self.parameters() is queried
     # only after every attribute above has been set.
     # NOTE(review): assumes the enclosing class is a torch nn.Module
     # (header not visible in this snippet) - confirm.
     trainable = self.parameters()
     self.optimizer = optim.Adam(trainable, lr=config['lr'])
예제 #3
0
 def __init__(self, speech_encoder, config):
     """Assemble the speech encoder stack, the image encoder and the optimizer.

     Args:
         speech_encoder: object stored as-is in ``self.SpeechEncoderBottom``.
         config: mapping read for three keys: ``'SpeechEncoderTop'`` and
             ``'ImageEncoder'`` (each a mapping of keyword arguments for the
             matching constructor) and ``'lr'`` (passed to Adam as ``lr``).
     """
     super(SpeechImage, self).__init__()
     self.config = config

     # Attribute assignment order is kept: bottom encoder, top encoder,
     # then the image encoder.
     self.SpeechEncoderBottom = speech_encoder
     top_kwargs = config['SpeechEncoderTop']
     self.SpeechEncoderTop = SpeechEncoderTop(**top_kwargs)
     image_kwargs = config['ImageEncoder']
     self.ImageEncoder = ImageEncoder(**image_kwargs)

     # The optimizer is built last so that self.parameters() is queried
     # only after every attribute above has been set.
     # NOTE(review): assumes the enclosing class is a torch nn.Module
     # (header not visible in this snippet) - confirm.
     trainable = self.parameters()
     self.optimizer = optim.Adam(trainable, lr=config['lr'])
예제 #4
0
class SpeechTranscriber(nn.Module):
    """Model that maps speech input to text-decoder logits and a training cost.

    The speech input goes through ``SpeechEncoderBottom`` and then
    ``SpeechEncoderTop.states``; the result is fed to ``TextDecoder``
    (a ``DecoderWithAttn``). The instance also owns its Adam optimizer.
    """

    def __init__(self, speech_encoder, config):
        """Build the sub-modules and the optimizer.

        Args:
            speech_encoder: object stored as-is in ``self.SpeechEncoderBottom``.
            config: mapping read for three keys: ``'SpeechEncoderTop'`` and
                ``'TextDecoder'`` (each a mapping of keyword arguments for
                the matching constructor) and ``'lr'`` (passed to Adam).
        """
        super(SpeechTranscriber, self).__init__()
        self.config = config

        # Sub-modules are registered in the original order: bottom
        # encoder, top encoder, decoder.
        self.SpeechEncoderBottom = speech_encoder
        top_kwargs = config['SpeechEncoderTop']
        self.SpeechEncoderTop = SpeechEncoderTop(**top_kwargs)
        decoder_kwargs = config['TextDecoder']
        self.TextDecoder = DecoderWithAttn(**decoder_kwargs)

        # Built last so that self.parameters() covers every sub-module
        # registered above.
        trainable = self.parameters()
        self.optimizer = optim.Adam(trainable, lr=config['lr'])

    def cost(self, speech, target, target_prev):
        """Return the cross-entropy between decoder logits and ``target``.

        Args:
            speech: input passed to ``self.SpeechEncoderBottom``.
            target: tensor of class indices; its first two dimensions are
                flattened into one before the loss is computed.
            target_prev: third argument passed to ``self.TextDecoder``
                (presumably the targets shifted by one step - confirm).

        Returns:
            The value of ``F.cross_entropy`` over the flattened logits and
            targets.
        """
        encode_top = self.SpeechEncoderTop.states
        states, rep = encode_top(self.SpeechEncoderBottom(speech))
        target_logits = self.TextDecoder(states, rep, target_prev)

        # Merge the first two axes so logits have one row per target entry
        # (assumes axes are (batch, step, classes) - TODO confirm).
        n_rows = target_logits.size(0) * target_logits.size(1)
        flat_logits = target_logits.view(n_rows, -1)
        flat_target = target.view(target.size(0) * target.size(1))
        return F.cross_entropy(flat_logits, flat_target)

    def args(self, item):
        """Extract the positional arguments for ``cost`` from a batch item.

        Args:
            item: mapping with keys ``'audio'``, ``'target_t'`` and
                ``'target_prev_t'``; the two target entries must provide
                ``.astype`` (presumably NumPy arrays - confirm).

        Returns:
            Tuple ``(audio, target, target_prev)`` where the audio entry is
            passed through unchanged and both targets are cast to int64.
        """
        audio = item['audio']
        target = item['target_t'].astype('int64')
        target_prev = item['target_prev_t'].astype('int64')
        return (audio, target, target_prev)

    def test_cost(self, *args):
        """Compute ``cost(*args)`` inside the ``testing(self)`` context.

        NOTE(review): ``testing`` is defined elsewhere; presumably it puts
        the module in evaluation mode for the duration - confirm.
        """
        with testing(self):
            loss = self.cost(*args)
            return loss
예제 #5
0
 def __init__(self, speech_encoder, text_encoder, config):
     """Assemble the speech and text encoder stacks and the optimizer.

     Args:
         speech_encoder: object stored as-is in ``self.SpeechEncoderBottom``.
         text_encoder: object stored as-is in ``self.TextEncoderBottom``.
         config: mapping read for three keys: ``'SpeechEncoderTop'`` and
             ``'TextEncoderTop'`` (each a mapping of keyword arguments for
             the matching constructor) and ``'lr'`` (passed to Adam as
             ``lr``).
     """
     super(SpeechText, self).__init__()
     self.config = config

     # Attribute assignment order is kept: both bottom encoders first,
     # then both top encoders.
     self.SpeechEncoderBottom = speech_encoder
     self.TextEncoderBottom = text_encoder
     speech_top_kwargs = config['SpeechEncoderTop']
     self.SpeechEncoderTop = SpeechEncoderTop(**speech_top_kwargs)
     text_top_kwargs = config['TextEncoderTop']
     self.TextEncoderTop = TextEncoderTop(**text_top_kwargs)

     # The optimizer is built last so that self.parameters() is queried
     # only after every attribute above has been set.
     # NOTE(review): assumes the enclosing class is a torch nn.Module
     # (header not visible in this snippet) - confirm.
     trainable = self.parameters()
     self.optimizer = optim.Adam(trainable, lr=config['lr'])