parser.add_argument("--model", choices=['jasper', 'w2l', 'crnn'],
                    help='choices of neural network')
args = parser.parse_args()

train_transform = Compose(
    [LoadMagSpectrogram(), ComputeMelSpectrogramFromMagSpectrogram()])

if args.dataset == 'librispeech':
    from datasets.libri_speech import LibriSpeech, vocab
    train_dataset = ConcatDataset([
        LibriSpeech(name='train-clean-100', transform=train_transform),
        LibriSpeech(name='train-clean-360', transform=train_transform),
        LibriSpeech(name='train-other-500', transform=train_transform)
    ])
else:
    from datasets.bolor_speech import BolorSpeech, vocab
    train_dataset = BolorSpeech(name='train', transform=train_transform)

# Collect every saved checkpoint from the input directory.
directory = Path(args.input)
files = [f for f in directory.iterdir() if f.suffix == ".pth"]
assert len(files) > 1


def load_model(f):
    # Build the architecture selected on the command line, then restore
    # its weights from the checkpoint file.
    if args.model == 'jasper':
        model = TinyJasper(vocab)
    elif args.model == 'w2l':
        model = TinyWav2Letter(vocab)
    else:
        model = Speech2TextCRNN(vocab)
    checkpoint = torch.load(f)
    model.load_state_dict(checkpoint['state_dict'])
    model.float()
    return model
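# The directory scan and `assert len(files) > 1` above suggest the script goes
# on to combine several checkpoints. What follows is only a minimal sketch of
# checkpoint averaging under that assumption: `average_checkpoints` is a
# hypothetical helper, not the script's confirmed continuation; only the
# 'state_dict' key is taken from the code above.
import torch


def average_checkpoints(paths):
    averaged = None
    for path in paths:
        # Load each checkpoint on CPU and accumulate its parameters.
        state = torch.load(path, map_location='cpu')['state_dict']
        if averaged is None:
            averaged = {k: v.float().clone() for k, v in state.items()}
        else:
            for k in averaged:
                averaged[k] += state[k].float()
    # Divide the accumulated parameters by the number of checkpoints.
    for k in averaged:
        averaged[k] /= len(paths)
    return averaged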
if args.dataset == 'mbspeech':
    from datasets.mb_speech import MBSpeech
    dataset = MBSpeech()
elif args.dataset == 'librispeech':
    from datasets.libri_speech import LibriSpeech
    dataset = ConcatDataset([
        LibriSpeech(name='train-clean-100'),
        LibriSpeech(name='train-clean-360'),
        LibriSpeech(name='train-other-500'),
        LibriSpeech(name='dev-clean')
    ])
elif args.dataset == 'backgroundsounds':
    from datasets.background_sounds import BackgroundSounds
    dataset = BackgroundSounds(is_random=False)
elif args.dataset == 'bolorspeech':
    from datasets.bolor_speech import BolorSpeech
    dataset = ConcatDataset([
        BolorSpeech(name='train'),
        BolorSpeech(name='train2'),
        BolorSpeech(name='test'),
        BolorSpeech(name='demo'),
        BolorSpeech(name='annotation'),
        BolorSpeech(name='annotation-1111')
    ])
else:
    print("unknown dataset!")
    import sys
    sys.exit(1)

# Precompute a magnitude spectrogram for every audio file and cache it
# as a .npy file next to the source .wav.
transform = Compose([LoadAudio(), ComputeMagSpectrogram()])
for data in tqdm(dataset):
    fname = data['fname']
    data = transform(data)
    mag_spectrogram = data['input']
    np.save(fname.replace('.wav', '.npy'), mag_spectrogram)
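# The training transform in the first snippet starts with LoadMagSpectrogram,
# which presumably reads back the .npy files this loop writes. A minimal
# sketch under that assumption: the class name comes from the code above,
# but this body is illustrative, not the repository's implementation.
import numpy as np


class LoadMagSpectrogram:
    def __call__(self, data):
        # Swap the .wav reference for the cached magnitude spectrogram.
        data['input'] = np.load(data['fname'].replace('.wav', '.npy'))
        return data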
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--dataset", choices=['librispeech', 'mbspeech', 'bolorspeech'],
                    default='bolorspeech', help='dataset name')
args = parser.parse_args()

if args.dataset == 'mbspeech':
    from datasets.mb_speech import MBSpeech
    dataset = MBSpeech()
elif args.dataset == 'librispeech':
    from datasets.libri_speech import LibriSpeech
    dataset = ConcatDataset([
        LibriSpeech(name='train-clean-100'),
        LibriSpeech(name='train-clean-360'),
        LibriSpeech(name='train-other-500'),
        LibriSpeech(name='dev-clean')
    ])
else:
    from datasets.bolor_speech import BolorSpeech
    dataset = ConcatDataset(
        [BolorSpeech(name='train'), BolorSpeech(name='test')])

transform = Compose([LoadAudio(), ComputeMagSpectrogram()])
for data in tqdm(dataset):
    fname = data['fname']
    data = transform(data)
    # ComputeMagSpectrogram stores its result under 'input', so the cached
    # array is a magnitude spectrogram, not a mel spectrogram.
    mag_spectrogram = data['input']
    np.save(fname.replace('.wav', '.npy'), mag_spectrogram)
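# Compose, used by both preprocessing scripts above, chains transforms that
# take and return the same data dict. A minimal sketch of that behavior;
# the repository's own implementation may differ in detail.
class Compose:
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, data):
        # Apply each transform in order, feeding the output dict forward.
        for transform in self.transforms:
            data = transform(data)
        return data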