import argparse
from pathlib import Path

import torch
from torch.utils.data import ConcatDataset

# NOTE: the import paths and the argparse head below are reconstructed from
# the usage further down; module paths, the 'crnn' choice name and the help
# texts are assumptions, not the original lines.
from datasets.transforms import (Compose, LoadMagSpectrogram,
                                 ComputeMelSpectrogramFromMagSpectrogram)
from models import TinyJasper, TinyWav2Letter, Speech2TextCRNN

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input",
                        help='folder containing the .pth checkpoints')
    parser.add_argument("--dataset",
                        choices=['librispeech', 'bolorspeech'],
                        default='bolorspeech',
                        help='dataset name')
    parser.add_argument("--model",
                        choices=['jasper', 'w2l', 'crnn'],
                        default='crnn',
                        help='choices of neural network')
    args = parser.parse_args()

    train_transform = Compose(
        [LoadMagSpectrogram(),
         ComputeMelSpectrogramFromMagSpectrogram()])
    if args.dataset == 'librispeech':
        from datasets.libri_speech import LibriSpeech, vocab
        train_dataset = ConcatDataset([
            LibriSpeech(name='train-clean-100', transform=train_transform),
            LibriSpeech(name='train-clean-360', transform=train_transform),
            LibriSpeech(name='train-other-500', transform=train_transform)
        ])
    else:
        from datasets.bolor_speech import BolorSpeech, vocab
        train_dataset = BolorSpeech(name='train', transform=train_transform)

    # Collect every checkpoint saved in the input folder.
    directory = Path(args.input)
    files = [f for f in directory.iterdir() if f.suffix == ".pth"]
    assert len(files) > 1, "need more than one checkpoint"

    def load_model(f):
        """Build the chosen architecture and load the checkpoint weights."""
        if args.model == 'jasper':
            model = TinyJasper(vocab)
        elif args.model == 'w2l':
            model = TinyWav2Letter(vocab)
        else:
            model = Speech2TextCRNN(vocab)
        checkpoint = torch.load(f)
        model.load_state_dict(checkpoint['state_dict'])
        model.float()
        return model
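The first fragment is cut off right after load_model. Because it insists on more than one .pth file, a plausible continuation is checkpoint weight averaging; the sketch below is an assumption about that missing ending, not the original script's code, and reuses load_model and files from above.

# Sketch only: average the parameters of all collected checkpoints
# (assumed continuation; float casts keep the arithmetic well-defined).
models = [load_model(f) for f in files]
state_dicts = [m.state_dict() for m in models]
avg_state = {k: torch.stack([sd[k].float() for sd in state_dicts]).mean(dim=0)
             for k in state_dicts[0]}
averaged = load_model(files[0])
averaged.load_state_dict(avg_state)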

# NOTE: the head of this second fragment is missing; the imports and the
# argparse block are reconstructed from the parallel example below, with the
# transforms module path assumed.
import argparse

import numpy as np
from torch.utils.data import ConcatDataset
from tqdm import tqdm

from datasets.transforms import Compose, LoadAudio, ComputeMagSpectrogram

parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--dataset",
                    choices=['librispeech', 'mbspeech',
                             'backgroundsounds', 'bolorspeech'],
                    default='bolorspeech',
                    help='dataset name')
args = parser.parse_args()

if args.dataset == 'mbspeech':
    from datasets.mb_speech import MBSpeech
    dataset = MBSpeech()
elif args.dataset == 'librispeech':
    from datasets.libri_speech import LibriSpeech
    dataset = ConcatDataset([
        LibriSpeech(name='train-clean-100'),
        LibriSpeech(name='train-clean-360'),
        LibriSpeech(name='train-other-500'),
        LibriSpeech(name='dev-clean')
    ])
elif args.dataset == 'backgroundsounds':
    from datasets.background_sounds import BackgroundSounds
    dataset = BackgroundSounds(is_random=False)
elif args.dataset == 'bolorspeech':
    from datasets.bolor_speech import BolorSpeech
    dataset = ConcatDataset([
        BolorSpeech(name='train'),
        BolorSpeech(name='train2'),
        BolorSpeech(name='test'),
        BolorSpeech(name='demo'),
        BolorSpeech(name='annotation'),
        BolorSpeech(name='annotation-1111')
    ])
else:
    import sys
    print("unknown dataset!")
    sys.exit(1)


transform = Compose([LoadAudio(), ComputeMagSpectrogram()])
for data in tqdm(dataset):
    fname = data['fname']
    data = transform(data)
    mel_spectrogram = data['input']
    # Save the spectrogram next to the source .wav file.
    np.save(fname.replace('.wav', '.npy'), mel_spectrogram)
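The .npy files written by this loop are what LoadMagSpectrogram in the first fragment reads back at training time, so the STFT only runs once per clip. A minimal sketch of that read path, with the function name and dict keys assumed to mirror the transforms above:

import numpy as np

def load_mag_spectrogram(data):
    # Hypothetical stand-in for LoadMagSpectrogram: read the precomputed
    # .npy written by the loop above instead of redoing the STFT.
    data['input'] = np.load(data['fname'].replace('.wav', '.npy'))
    return data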
Example #3
import argparse

import numpy as np
from torch.utils.data import ConcatDataset
from tqdm import tqdm

# NOTE: the imports above and the parser line below are reconstructed; the
# transforms module path is assumed.
from datasets.transforms import Compose, LoadAudio, ComputeMagSpectrogram

parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--dataset",
                    choices=['librispeech', 'mbspeech', 'bolorspeech'],
                    default='bolorspeech',
                    help='dataset name')
args = parser.parse_args()

if args.dataset == 'mbspeech':
    from datasets.mb_speech import MBSpeech
    dataset = MBSpeech()
elif args.dataset == 'librispeech':
    from datasets.libri_speech import LibriSpeech
    dataset = ConcatDataset([
        LibriSpeech(name='train-clean-100'),
        LibriSpeech(name='train-clean-360'),
        LibriSpeech(name='train-other-500'),
        LibriSpeech(name='dev-clean')
    ])
else:
    from datasets.bolor_speech import BolorSpeech
    dataset = ConcatDataset(
        [BolorSpeech(name='train'),
         BolorSpeech(name='test')])

transform = Compose([LoadAudio(), ComputeMagSpectrogram()])
for data in tqdm(dataset):
    fname = data['fname']
    data = transform(data)
    mel_spectrogram = data['input']
    np.save(fname.replace('.wav', '.npy'), mel_spectrogram)
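A quick sanity check that the preprocessing round-trips; the path is purely illustrative, and the exact array shape depends on the STFT parameters inside ComputeMagSpectrogram, which are not shown here.

import numpy as np

spec = np.load('sample.npy')  # illustrative path; any clip's sibling .npy
print(spec.shape, spec.dtype)  # expected: (time_frames, freq_bins), float32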