Example #1
    def __init__(self, FLAGS):
        self.FLAGS = FLAGS
        logdir = os.path.join('logs', FLAGS.name)

        self.tokenizer = HuggingFaceTokenizer(
            cache_dir=logdir, vocab_size=FLAGS.bpe_size)

        _, self.transform, input_size = build_transform(
            feature_type=FLAGS.feature, feature_size=FLAGS.feature_size,
            n_fft=FLAGS.n_fft, win_length=FLAGS.win_length,
            hop_length=FLAGS.hop_length, delta=FLAGS.delta, cmvn=FLAGS.cmvn,
            downsample=FLAGS.downsample, pad_to_divisible=False,
            T_mask=FLAGS.T_mask, T_num_mask=FLAGS.T_num_mask,
            F_mask=FLAGS.F_mask, F_num_mask=FLAGS.F_num_mask)

        ie = IECore()
        encoder_net = ie.read_network(
            model=os.path.join(logdir, 'encoder.xml'),
            weights=os.path.join(logdir, 'encoder.bin'))
        self.encoder = ie.load_network(network=encoder_net, device_name='CPU')

        decoder_net = ie.read_network(
            model=os.path.join(logdir, 'decoder.xml'),
            weights=os.path.join(logdir, 'decoder.bin'))
        self.decoder = ie.load_network(network=decoder_net, device_name='CPU')

        joint_net = ie.read_network(
            model=os.path.join(logdir, 'joint.xml'),
            weights=os.path.join(logdir, 'joint.bin'))
        self.joint = ie.load_network(network=joint_net, device_name='CPU')

        self.reset_profile()
        self.reset()
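
A minimal sketch of driving one of the networks loaded above. In the pre-2022 OpenVINO Python API used here, load_network returns an ExecutableNetwork whose synchronous infer() maps input blob names to numpy arrays; this sketch assumes a single input and output blob and ignores any recurrent-state blobs the exported streaming encoder may expose:

def infer_single(exec_net, array):
    # Blob names are discovered at runtime rather than hard-coded.
    input_name = next(iter(exec_net.input_info))
    output_name = next(iter(exec_net.outputs))
    # infer() takes {blob_name: ndarray} and returns {blob_name: ndarray}.
    return exec_net.infer(inputs={input_name: array})[output_name]
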
Example #2
    def val_dataloader(self):
        _, transform_test, _ = build_transform(feature_type=FLAGS.feature,
                                               feature_size=FLAGS.feature_size,
                                               n_fft=FLAGS.n_fft,
                                               win_length=FLAGS.win_length,
                                               hop_length=FLAGS.hop_length,
                                               delta=FLAGS.delta,
                                               cmvn=FLAGS.cmvn,
                                               downsample=FLAGS.downsample,
                                               T_mask=FLAGS.T_mask,
                                               T_num_mask=FLAGS.T_num_mask,
                                               F_mask=FLAGS.F_mask,
                                               F_num_mask=FLAGS.F_num_mask)

        val_dataloader = DataLoader(dataset=MergedDataset([
            Librispeech(root=FLAGS.LibriSpeech_test,
                        tokenizer=self.tokenizer,
                        transform=transform_test,
                        reverse_sorted_by_length=True)
        ]),
                                    batch_size=FLAGS.eval_batch_size,
                                    shuffle=False,
                                    num_workers=FLAGS.num_workers,
                                    collate_fn=seq_collate)
        return val_dataloader
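
Presumably reverse_sorted_by_length=True orders the test set from longest utterance to shortest; with shuffle=False each padded batch then groups similar-length clips. Illustrative only:

durations = [12.0, 3.5, 9.1, 4.2]             # clip lengths in seconds
eval_order = sorted(durations, reverse=True)  # [12.0, 9.1, 4.2, 3.5]
# Consecutive clips now have similar lengths, so each batch wastes little
# compute on padding, and the most memory-hungry batch runs first, making
# out-of-memory failures surface immediately.
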
Example #3
def load_openvino_model():
    logdir = os.path.join('logs', FLAGS.name)

    tokenizer = HuggingFaceTokenizer(cache_dir=logdir,
                                     vocab_size=FLAGS.bpe_size)

    _, transform, input_size = build_transform(feature_type=FLAGS.feature,
                                               feature_size=FLAGS.feature_size,
                                               n_fft=FLAGS.n_fft,
                                               win_length=FLAGS.win_length,
                                               hop_length=FLAGS.hop_length,
                                               delta=FLAGS.delta,
                                               cmvn=FLAGS.cmvn,
                                               downsample=FLAGS.downsample,
                                               pad_to_divisible=False,
                                               T_mask=FLAGS.T_mask,
                                               T_num_mask=FLAGS.T_num_mask,
                                               F_mask=FLAGS.F_mask,
                                               F_num_mask=FLAGS.F_num_mask)

    ie = IECore()
    encoder_net = ie.read_network(model=os.path.join(logdir, 'encoder.xml'),
                                  weights=os.path.join(logdir, 'encoder.bin'))
    encoder = ie.load_network(network=encoder_net, device_name='CPU')

    decoder_net = ie.read_network(model=os.path.join(logdir, 'decoder.xml'),
                                  weights=os.path.join(logdir, 'decoder.bin'))
    decoder = ie.load_network(network=decoder_net, device_name='CPU')

    joint_net = ie.read_network(model=os.path.join(logdir, 'joint.xml'),
                                weights=os.path.join(logdir, 'joint.bin'))
    joint = ie.load_network(network=joint_net, device_name='CPU')

    return encoder, decoder, joint, tokenizer, transform
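
The encoder/decoder/joint split returned here is the standard RNN-Transducer decomposition: the encoder consumes audio frames, the decoder (prediction network) consumes previously emitted tokens, and the joint network scores the next symbol from both. A greedy-decoding sketch over abstract callables; the signatures are illustrative, not the project's, and blank_id plays the role of the project's NUL blank label:

def greedy_decode(enc_vectors, decode_step, joint_step, blank_id,
                  max_symbols_per_frame=3):
    # enc_vectors: iterable of per-frame encoder outputs.
    # decode_step(token) -> prediction-network output after `token`.
    # joint_step(enc_vec, dec_vec) -> argmax symbol id.
    tokens = []
    dec_vec = decode_step(None)                 # None: start-of-sequence
    for enc_vec in enc_vectors:
        for _ in range(max_symbols_per_frame):  # cap emissions per frame
            symbol = joint_step(enc_vec, dec_vec)
            if symbol == blank_id:
                break                           # blank: next frame
            tokens.append(symbol)
            dec_vec = decode_step(symbol)
    return tokens
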
Example #4
    def __init__(self, FLAGS):
        self.FLAGS = FLAGS
        logdir = os.path.join('logs', FLAGS.name)

        self.tokenizer = HuggingFaceTokenizer(
            cache_dir='BPE-' + str(FLAGS.bpe_size), vocab_size=FLAGS.bpe_size)

        assert self.tokenizer.tokenizer is not None

        _, self.transform, input_size = build_transform(
            feature_type=FLAGS.feature, feature_size=FLAGS.feature_size,
            n_fft=FLAGS.n_fft, win_length=FLAGS.win_length,
            hop_length=FLAGS.hop_length, delta=FLAGS.delta, cmvn=FLAGS.cmvn,
            downsample=FLAGS.downsample, pad_to_divisible=False,
            T_mask=FLAGS.T_mask, T_num_mask=FLAGS.T_num_mask,
            F_mask=FLAGS.F_mask, F_num_mask=FLAGS.F_num_mask)

        model_path = os.path.join(logdir, 'models', FLAGS.model_name)
        if os.path.exists(model_path):
            checkpoint = torch.load(
                model_path, map_location=lambda storage, loc: storage)
        else:
            model_path = os.path.join(logdir, FLAGS.model_name)
            checkpoint = torch.load(
                model_path, map_location=lambda storage, loc: storage)

        transducer = Transducer(
            vocab_embed_size=FLAGS.vocab_embed_size,
            vocab_size=self.tokenizer.vocab_size,
            input_size=input_size,
            enc_hidden_size=FLAGS.enc_hidden_size,
            enc_layers=FLAGS.enc_layers,
            enc_dropout=FLAGS.enc_dropout,
            enc_proj_size=FLAGS.enc_proj_size,
            dec_hidden_size=FLAGS.dec_hidden_size,
            dec_layers=FLAGS.dec_layers,
            dec_dropout=FLAGS.dec_dropout,
            dec_proj_size=FLAGS.dec_proj_size,
            joint_size=FLAGS.joint_size,
            output_loss=False,
        )

        transducer.load_state_dict(convert_lightning2normal(checkpoint)['model'])
        transducer.eval()
        self.encoder = transducer.encoder
        self.decoder = transducer.decoder
        self.joint = transducer.joint

        self.reset_profile()
        self.reset()
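
convert_lightning2normal is imported from elsewhere in the project; a hypothetical sketch of what such a conversion typically does, since PyTorch Lightning checkpoints nest weights under 'state_dict' and prefix each key with the attribute name of the wrapped module (assumed to be 'model.' here):

def convert_lightning2normal_sketch(checkpoint):
    # Fall back to the checkpoint itself if it is already a plain state dict.
    state = checkpoint.get('state_dict', checkpoint)
    plain = {k[len('model.'):] if k.startswith('model.') else k: v
             for k, v in state.items()}
    return {'model': plain}
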
Example #5
    def __init__(self):
        super(ParallelTraining, self).__init__()
        _, _, input_size = build_transform(feature_type=FLAGS.feature,
                                           feature_size=FLAGS.feature_size,
                                           n_fft=FLAGS.n_fft,
                                           win_length=FLAGS.win_length,
                                           hop_length=FLAGS.hop_length,
                                           delta=FLAGS.delta,
                                           cmvn=FLAGS.cmvn,
                                           downsample=FLAGS.downsample,
                                           T_mask=FLAGS.T_mask,
                                           T_num_mask=FLAGS.T_num_mask,
                                           F_mask=FLAGS.F_mask,
                                           F_num_mask=FLAGS.F_num_mask)
        self.log_path = None
        self.loss_fn = RNNTLoss(blank=NUL)

        if FLAGS.tokenizer == 'char':
            # assumed: logdir follows the 'logs/<FLAGS.name>' convention
            # used by the other examples (it is never set in this snippet)
            self.logdir = os.path.join('logs', FLAGS.name)
            self.tokenizer = CharTokenizer(cache_dir=self.logdir)
        else:
            self.tokenizer = HuggingFaceTokenizer(cache_dir='BPE-2048',
                                                  vocab_size=FLAGS.bpe_size)
        self.vocab_size = self.tokenizer.vocab_size
        print(FLAGS.enc_type)

        self.model = Transducer(
            vocab_embed_size=FLAGS.vocab_embed_size,
            vocab_size=self.vocab_size,
            input_size=input_size,
            enc_hidden_size=FLAGS.enc_hidden_size,
            enc_layers=FLAGS.enc_layers,
            enc_dropout=FLAGS.enc_dropout,
            enc_proj_size=FLAGS.enc_proj_size,
            dec_hidden_size=FLAGS.dec_hidden_size,
            dec_layers=FLAGS.dec_layers,
            dec_dropout=FLAGS.dec_dropout,
            dec_proj_size=FLAGS.dec_proj_size,
            joint_size=FLAGS.joint_size,
            module_type=FLAGS.enc_type,
            output_loss=False,
        )
        self.latest_alignment = None
        self.steps = 0
        self.epoch = 0
        self.best_wer = 1000
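
The RNNTLoss binding itself is imported from elsewhere. As an illustration of the tensor layout a transducer loss consumes, here is an equivalent call using torchaudio.functional.rnnt_loss as a stand-in (not necessarily the project's binding, and it assumes NUL is the blank id 0):

import torch
import torchaudio

N, T, U, V = 2, 8, 4, 32                     # batch, frames, labels, vocab
logits = torch.randn(N, T, U + 1, V, requires_grad=True)  # joint outputs
targets = torch.randint(1, V, (N, U), dtype=torch.int32)  # label ids
logit_lens = torch.full((N,), T, dtype=torch.int32)
target_lens = torch.full((N,), U, dtype=torch.int32)
loss = torchaudio.functional.rnnt_loss(logits, targets, logit_lens,
                                       target_lens, blank=0)
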
Example #6
def main(argv):
    assert FLAGS.step_n_frame % 2 == 0, ("step_n_frame must be divisible by "
                                         "reduction_factor of TimeReduction")

    logdir = os.path.join('logs', FLAGS.name)

    tokenizer = HuggingFaceTokenizer(cache_dir=logdir,
                                     vocab_size=FLAGS.bpe_size)

    transform_train, transform_test, input_size = build_transform(
        feature_type=FLAGS.feature,
        feature_size=FLAGS.feature_size,
        n_fft=FLAGS.n_fft,
        win_length=FLAGS.win_length,
        hop_length=FLAGS.hop_length,
        delta=FLAGS.delta,
        cmvn=FLAGS.cmvn,
        downsample=FLAGS.downsample,
        T_mask=FLAGS.T_mask,
        T_num_mask=FLAGS.T_num_mask,
        F_mask=FLAGS.F_mask,
        F_num_mask=FLAGS.F_num_mask)

    model_path = os.path.join(logdir, 'models', FLAGS.model_name)
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
    transducer = Transducer(
        vocab_embed_size=FLAGS.vocab_embed_size,
        vocab_size=tokenizer.vocab_size,
        input_size=input_size,
        enc_hidden_size=FLAGS.enc_hidden_size,
        enc_layers=FLAGS.enc_layers,
        enc_dropout=FLAGS.enc_dropout,
        enc_proj_size=FLAGS.enc_proj_size,
        dec_hidden_size=FLAGS.dec_hidden_size,
        dec_layers=FLAGS.dec_layers,
        dec_dropout=FLAGS.dec_dropout,
        dec_proj_size=FLAGS.dec_proj_size,
        joint_size=FLAGS.joint_size,
    )
    transducer.load_state_dict(checkpoint['model'])
    transducer.eval()

    export_encoder(transducer, input_size, tokenizer.vocab_size, logdir)
    export_decoder(transducer, input_size, tokenizer.vocab_size, logdir)
    export_join(transducer, input_size, tokenizer.vocab_size, logdir)
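
export_encoder, export_decoder and export_join are defined elsewhere; presumably each traces its module to ONNX, which OpenVINO's Model Optimizer then converts into the .xml/.bin IR pairs read back in Examples #1 and #3. A minimal sketch of such an export, ignoring any recurrent-state inputs the real encoder exposes (the dummy shape and filename are assumptions):

def export_encoder_sketch(encoder, input_size, logdir):
    dummy = torch.randn(1, FLAGS.step_n_frame, input_size)  # (N, T, feats)
    torch.onnx.export(encoder, dummy,
                      os.path.join(logdir, 'encoder.onnx'),
                      opset_version=11)
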
Example #7
def load_pytorch_model():
    logdir = os.path.join('logs', FLAGS.name)

    tokenizer = HuggingFaceTokenizer(cache_dir=logdir,
                                     vocab_size=FLAGS.bpe_size)

    _, transform, input_size = build_transform(feature_type=FLAGS.feature,
                                               feature_size=FLAGS.feature_size,
                                               n_fft=FLAGS.n_fft,
                                               win_length=FLAGS.win_length,
                                               hop_length=FLAGS.hop_length,
                                               delta=FLAGS.delta,
                                               cmvn=FLAGS.cmvn,
                                               downsample=FLAGS.downsample,
                                               pad_to_divisible=False,
                                               T_mask=FLAGS.T_mask,
                                               T_num_mask=FLAGS.T_num_mask,
                                               F_mask=FLAGS.F_mask,
                                               F_num_mask=FLAGS.F_num_mask)

    model_path = os.path.join(logdir, 'models', '%d.pt' % FLAGS.step)
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
    transducer = Transducer(
        vocab_embed_size=FLAGS.vocab_embed_size,
        vocab_size=tokenizer.vocab_size,
        input_size=input_size,
        enc_hidden_size=FLAGS.enc_hidden_size,
        enc_layers=FLAGS.enc_layers,
        enc_dropout=FLAGS.enc_dropout,
        enc_proj_size=FLAGS.enc_proj_size,
        dec_hidden_size=FLAGS.dec_hidden_size,
        dec_layers=FLAGS.dec_layers,
        dec_dropout=FLAGS.dec_dropout,
        dec_proj_size=FLAGS.dec_proj_size,
        joint_size=FLAGS.joint_size,
    )
    transducer.load_state_dict(checkpoint['model'])
    transducer.eval()
    encoder = transducer.encoder
    decoder = transducer.decoder
    joint = transducer.joint
    return encoder, decoder, joint, tokenizer, transform
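
Note that eval() only switches dropout and batch-norm to inference behavior; autograd must be disabled separately when running the returned modules. A small wrapper sketch:

def run_inference(module, *inputs):
    # no_grad() skips gradient bookkeeping, saving memory and time at
    # inference; eval() alone does not do this.
    with torch.no_grad():
        return module(*inputs)
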
Example #8
    def train_dataloader(self):
        transform_train, _, _ = build_transform(
            feature_type=FLAGS.feature,
            feature_size=FLAGS.feature_size,
            n_fft=FLAGS.n_fft,
            win_length=FLAGS.win_length,
            hop_length=FLAGS.hop_length,
            delta=FLAGS.delta,
            cmvn=FLAGS.cmvn,
            downsample=FLAGS.downsample,
            T_mask=FLAGS.T_mask,
            T_num_mask=FLAGS.T_num_mask,
            F_mask=FLAGS.F_mask,
            F_num_mask=FLAGS.F_num_mask)

        dataloader = DataLoader(
            dataset=MergedDataset([
                Librispeech(root=FLAGS.LibriSpeech_train_500,
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length),
                Librispeech(root=FLAGS.LibriSpeech_train_360,
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length),
                # Librispeech(
                #     root=FLAGS.LibriSpeech_train_100,
                #     tokenizer=self.tokenizer,
                #     transform=transform_train,
                #     audio_max_length=FLAGS.audio_max_length),
                TEDLIUM(root=FLAGS.TEDLIUM_train,
                        tokenizer=self.tokenizer,
                        transform=transform_train,
                        audio_max_length=FLAGS.audio_max_length),
                CommonVoice(root=FLAGS.CommonVoice,
                            labels='train.tsv',
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length,
                            audio_min_length=1),
                YoutubeCaption(root='../speech_data/youtube-speech-text/',
                               labels='bloomberg2_meta.csv',
                               tokenizer=self.tokenizer,
                               transform=transform_train,
                               audio_max_length=FLAGS.audio_max_length,
                               audio_min_length=1),
                YoutubeCaption(root='../speech_data/youtube-speech-text/',
                               labels='life_meta.csv',
                               tokenizer=self.tokenizer,
                               transform=transform_train,
                               audio_max_length=FLAGS.audio_max_length,
                               audio_min_length=1),
                YoutubeCaption(root='../speech_data/youtube-speech-text/',
                               labels='news_meta.csv',
                               tokenizer=self.tokenizer,
                               transform=transform_train,
                               audio_max_length=FLAGS.audio_max_length,
                               audio_min_length=1),
                YoutubeCaption(root='../speech_data/youtube-speech-text/',
                               labels='english2_meta.csv',
                               tokenizer=self.tokenizer,
                               transform=transform_train,
                               audio_max_length=FLAGS.audio_max_length,
                               audio_min_length=1),
            ]),
            batch_size=FLAGS.sub_batch_size,
            shuffle=True,
            num_workers=FLAGS.num_workers,
            collate_fn=seq_collate,
            drop_last=True)
        return dataloader
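
batch_size here is FLAGS.sub_batch_size rather than a full batch, which suggests the training step accumulates gradients over several sub-batches before each optimizer update. An illustrative sketch, with helper names that are assumptions rather than the project's code:

def train_accumulated(dataloader, model_step, optimizer, accum_steps=4):
    # model_step(batch) -> scalar loss tensor (hypothetical helper).
    optimizer.zero_grad()
    for i, batch in enumerate(dataloader):
        loss = model_step(batch) / accum_steps  # keep the gradient scale
        loss.backward()                         # equal to one large batch
        if (i + 1) % accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
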
Example #9
    def __init__(self):
        self.name = FLAGS.name
        self.logdir = os.path.join('logs', FLAGS.name)
        self.model_dir = os.path.join(self.logdir, 'models')

        # Transform
        transform_train, transform_test, input_size = build_transform(
            feature_type=FLAGS.feature,
            feature_size=FLAGS.feature_size,
            n_fft=FLAGS.n_fft,
            win_length=FLAGS.win_length,
            hop_length=FLAGS.hop_length,
            delta=FLAGS.delta,
            cmvn=FLAGS.cmvn,
            downsample=FLAGS.downsample,
            T_mask=FLAGS.T_mask,
            T_num_mask=FLAGS.T_num_mask,
            F_mask=FLAGS.F_mask,
            F_num_mask=FLAGS.F_num_mask)

        # Tokenizer
        if FLAGS.tokenizer == 'char':
            self.tokenizer = CharTokenizer(cache_dir=self.logdir)
        else:
            self.tokenizer = HuggingFaceTokenizer(cache_dir=self.logdir,
                                                  vocab_size=FLAGS.bpe_size)

        # Dataloader
        self.dataloader_train = DataLoader(
            dataset=MergedDataset([
                Librispeech(root=FLAGS.LibriSpeech_train_500,
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length),
                Librispeech(root=FLAGS.LibriSpeech_train_360,
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length),
                Librispeech(root=FLAGS.LibriSpeech_train_100,
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length),
                # TEDLIUM(
                #     root=FLAGS.TEDLIUM_train,
                #     tokenizer=self.tokenizer,
                #     transform=transform_train,
                #     audio_max_length=FLAGS.audio_max_length),
                # CommonVoice(
                #     root=FLAGS.CommonVoice, labels='train.tsv',
                #     tokenizer=self.tokenizer,
                #     transform=transform_train,
                #     audio_max_length=FLAGS.audio_max_length)
            ]),
            batch_size=FLAGS.batch_size,
            shuffle=True,
            num_workers=FLAGS.num_workers,
            collate_fn=seq_collate,
            drop_last=True)

        self.dataloader_val = DataLoader(dataset=MergedDataset([
            Librispeech(root=FLAGS.LibriSpeech_test,
                        tokenizer=self.tokenizer,
                        transform=transform_test,
                        reverse_sorted_by_length=True)
        ]),
                                         batch_size=FLAGS.eval_batch_size,
                                         shuffle=False,
                                         num_workers=FLAGS.num_workers,
                                         collate_fn=seq_collate)

        self.tokenizer.build(self.dataloader_train.dataset.texts())
        self.vocab_size = self.dataloader_train.dataset.tokenizer.vocab_size

        # Model
        self.model = Transducer(
            vocab_embed_size=FLAGS.vocab_embed_size,
            vocab_size=self.vocab_size,
            input_size=input_size,
            enc_hidden_size=FLAGS.enc_hidden_size,
            enc_layers=FLAGS.enc_layers,
            enc_dropout=FLAGS.enc_dropout,
            enc_proj_size=FLAGS.enc_proj_size,
            dec_hidden_size=FLAGS.dec_hidden_size,
            dec_layers=FLAGS.dec_layers,
            dec_dropout=FLAGS.dec_dropout,
            dec_proj_size=FLAGS.dec_proj_size,
            joint_size=FLAGS.joint_size,
        ).to(device)

        # Optimizer
        if FLAGS.optim == 'adam':
            self.optim = optim.Adam(self.model.parameters(), lr=FLAGS.lr)
        else:
            self.optim = optim.SGD(self.model.parameters(),
                                   lr=FLAGS.lr,
                                   momentum=0.9)
        # Scheduler
        if FLAGS.sched:
            self.sched = optim.lr_scheduler.ReduceLROnPlateau(
                self.optim,
                patience=FLAGS.sched_patience,
                factor=FLAGS.sched_factor,
                min_lr=FLAGS.sched_min_lr,
                verbose=True)
        # Apex
        if FLAGS.apex:
            self.model, self.optim = amp.initialize(self.model,
                                                    self.optim,
                                                    opt_level=FLAGS.opt_level)
        # Multi GPU
        if FLAGS.multi_gpu:
            self.model = torch.nn.DataParallel(self.model)
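
Unlike epoch-driven schedulers, ReduceLROnPlateau is stepped with the monitored quantity itself (here presumably validation loss or WER). A self-contained demo of the behavior configured above, with fabricated metric values purely for illustration:

import torch
from torch import optim

param = torch.nn.Parameter(torch.zeros(1))
opt = optim.SGD([param], lr=0.1, momentum=0.9)
sched = optim.lr_scheduler.ReduceLROnPlateau(opt, patience=2, factor=0.5)
for wer in [0.30, 0.29, 0.29, 0.29, 0.29]:  # no improvement after step 2
    sched.step(wer)                          # pass the metric, not an epoch
print(opt.param_groups[0]['lr'])             # 0.05: halved after patience ran out
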