# Feature-extraction settings: the CRNN model was trained on 32 mel bands,
# every other model on 64 (with a matching log-floor epsilon).
if args.model == 'crnn':
    num_features, eps = 32, 1e-20
else:
    num_features, eps = 64, 2 ** -24

sample_rate = 16000
duration = 5.0  # seconds of microphone audio to capture

# Blocking capture of a fixed-length mono clip, then play it back so the
# user can hear exactly what will be transcribed.
print("recording %0.1fs audio..." % duration)
recorded_audio = sd.rec(int(duration * sample_rate),
                        samplerate=sample_rate,
                        channels=1,
                        blocking=True)
print("recorded, replaying it before doing speech recognition...")
sd.play(recorded_audio, samplerate=sample_rate, blocking=True)

# Wrap the raw samples in the dataset dict format, turn them into a
# (optionally normalized) mel spectrogram, and run speech recognition.
sample = {'samples': recorded_audio, 'sample_rate': sample_rate, 'text': ''}
transform = Compose([
    ComputeMagSpectrogram(),
    ComputeMelSpectrogramFromMagSpectrogram(num_features=num_features,
                                            normalize=args.normalize,
                                            eps=eps),
])
transcribe(transform(sample), num_features, args)
    # (continuation: interior of a `dataset = ConcatDataset([` begun above)
    LibriSpeech(name='train-clean-100'),
    LibriSpeech(name='train-clean-360'),
    LibriSpeech(name='train-other-500'),
    LibriSpeech(name='dev-clean',)
])
elif args.dataset == 'backgroundsounds':
    from datasets.background_sounds import BackgroundSounds
    dataset = BackgroundSounds(is_random=False)
elif args.dataset == 'bolorspeech':
    from datasets.bolor_speech import BolorSpeech
    # Concatenate every available BolorSpeech split so all audio files
    # get preprocessed in one pass.
    dataset = ConcatDataset([
        BolorSpeech(name='train'),
        BolorSpeech(name='train2'),
        BolorSpeech(name='test'),
        BolorSpeech(name='demo'),
        BolorSpeech(name='annotation'),
        BolorSpeech(name='annotation-1111')
    ])
else:
    print("unknown dataset!")
    import sys
    sys.exit(1)

# Precompute spectrogram features for every file in the chosen dataset and
# cache them next to the source audio as .npy files.
transform = Compose([LoadAudio(), ComputeMagSpectrogram()])
for data in tqdm(dataset):
    fname = data['fname']
    data = transform(data)
    # NOTE(review): despite the name, this is the 'input' produced by
    # ComputeMagSpectrogram — presumably a magnitude (not mel) spectrogram;
    # confirm against the transform implementation.
    mel_spectrogram = data['input']
    np.save(fname.replace('.wav', '.npy'), mel_spectrogram)
# (continuation: tail of a parser.add_argument('--lr', ...) call begun above)
                    default=0.0003, help='learning rate for optimization')
args = parser.parse_args()

use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    # cuDNN autotuner: benchmark mode picks the fastest conv algorithms
    # when input shapes are fixed.
    torch.backends.cudnn.benchmark = True

# Pick the dataset module; both expose the same SpeechDataset / vocab /
# idx2char interface.
if args.dataset == 'librispeech':
    from datasets.libri_speech import LibriSpeech as SpeechDataset, vocab, idx2char
else:
    from datasets.mb_speech import MBSpeech as SpeechDataset, vocab, idx2char

# Training pipeline gets speed perturbation as augmentation; the
# validation pipeline does not.
train_dataset = SpeechDataset(
    transform=Compose([LoadAudio(), SpeedChange(), ExtractSpeechFeatures()]))
valid_dataset = SpeechDataset(
    transform=Compose([LoadAudio(), ExtractSpeechFeatures()]))

# NOTE(review): both dataset objects wrap the same underlying data; the
# last batch_size indices are held out for validation via samplers —
# so validation is a single batch.
indices = list(range(len(train_dataset)))
train_sampler = SubsetRandomSampler(indices[:-args.batch_size])
valid_sampler = SubsetRandomSampler(indices[-args.batch_size:])

train_data_loader = DataLoader(train_dataset,
                               batch_size=args.batch_size,
                               shuffle=False,  # sampler provides shuffling
                               collate_fn=collate_fn,
                               num_workers=args.dataload_workers_nums,
                               sampler=train_sampler)
# NOTE(review): this call is cut off here — its remaining keyword
# arguments continue beyond this chunk.
valid_data_loader = DataLoader(valid_dataset,
                               batch_size=args.batch_size,
                               shuffle=False,
# (continuation: tail of a parser.add_argument('--lm', ...) call begun above)
                    type=str, required=False,
                    help='link to KenLM 5-gram binary language model')
parser.add_argument("--alpha", type=float, default=0.3,
                    help='alpha for CTC decode')
parser.add_argument("--beta", type=float, default=1.85,
                    help='beta for CTC decode')
args = parser.parse_args()

duration = 5.0  # seconds of microphone audio to capture
sample_rate = 16000
print("recording %0.1fs audio..." % duration)
# Blocking capture of a mono clip from the default microphone.
recorded_audio = sd.rec(int(duration * sample_rate),
                        samplerate=sample_rate,
                        channels=1,
                        blocking=True)
print("recorded, replaying it before doing speech recognition...")
sd.play(recorded_audio, samplerate=sample_rate, blocking=True)

# Build the model-input dict (empty transcript), extract speech features,
# and print the recognized text.
data = {'samples': recorded_audio, 'sample_rate': sample_rate, 'text': ''}
data = Compose([ExtractSpeechFeatures()])(data)
result = transcribe(data, args)
print("Predicted:")
print(result)
        # (continuation: tail of a CTC beam-search decoder constructor
        # begun above, used when a KenLM language model was supplied)
                             alpha=args.alpha, beta=args.beta,
                             cutoff_top_n=40, cutoff_prob=1.0,
                             beam_width=1000)
    else:
        # No language model given: fall back to best-path (greedy) decoding.
        decoder = GreedyDecoder(labels=vocab)

    t = time.time()
    decoded_output, _ = decoder.decode(outputs)
    print("decode time: %.3fs" % (time.time() - t))
    # Return the top hypothesis of the first (only) utterance in the batch.
    return decoded_output[0][0]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--checkpoint", type=str, required=True,
                        help='checkpoint file to test')
    parser.add_argument("--lm", type=str, required=False,
                        help='link to KenLM 5-gram binary language model')
    parser.add_argument("--alpha", type=float, default=0.3,
                        help='alpha for CTC decode')
    parser.add_argument("--beta", type=float, default=1.85,
                        help='beta for CTC decode')
    parser.add_argument("audio", help='a WAV file')
    args = parser.parse_args()

    # Load the given WAV file, extract features, and print the transcript.
    data = {
        'fname': args.audio,
        'text': ''
    }
    data = Compose([LoadAudio(), ExtractSpeechFeatures()])(data)
    result = transcribe(data, args)
    print("Predicted:")
    print(result)
from datasets import Compose, ComputeMelSpectrogram
from transcribe import transcribe

if __name__ == '__main__':
    # Command-line interface for the microphone demo.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--checkpoint", type=str, required=True,
                        help='checkpoint file to test')
    parser.add_argument("--model", choices=['jasper', 'w2l', 'crnn'],
                        default='w2l', help='choices of neural network')
    parser.add_argument("--lm", type=str, required=False,
                        help='link to KenLM 5-gram binary language model')
    parser.add_argument("--alpha", type=float, default=0.3,
                        help='alpha for CTC decode')
    parser.add_argument("--beta", type=float, default=1.85,
                        help='beta for CTC decode')
    args = parser.parse_args()

    sample_rate = 16000
    duration = 5.0  # seconds of microphone audio to capture

    # Record a fixed-length clip from the default microphone, then replay
    # it so the user can hear what will be transcribed.
    print("recording %0.1fs audio..." % duration)
    recorded_audio = sd.rec(int(duration * sample_rate),
                            samplerate=sample_rate,
                            channels=1,
                            blocking=True)
    print("recorded, replaying it before doing speech recognition...")
    sd.play(recorded_audio, samplerate=sample_rate, blocking=True)

    # Wrap the samples in the dataset dict format, compute the mel
    # spectrogram, and run speech recognition on it.
    data = Compose([ComputeMelSpectrogram()])({
        'samples': recorded_audio,
        'sample_rate': sample_rate,
        'text': '',
    })
    transcribe(data, args)
def Train(train_root, train_csv, test_csv):
    """Train a 4-class breast-density classifier.

    Keeps two checkpoints up to date: the model with the best test loss
    and the model with the best test accuracy.

    Args:
        train_root: root directory containing the images.
        train_csv: csv file listing the training samples.
        test_csv: csv file listing the test samples.
    """
    # ---- configuration & reproducibility --------------------------------
    args = parse_args()
    record_params(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_order
    torch.manual_seed(args.torch_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.torch_seed)
    np.random.seed(args.torch_seed)
    random.seed(args.torch_seed)
    if args.cudnn == 0:
        cudnn.benchmark = False
    else:
        cudnn.benchmark = True
        cudnn.deterministic = True

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ---- model -----------------------------------------------------------
    num_classes = 4
    net = build_model(args.model_name, num_classes, args.pretrain)

    # Checkpoint filenames: "<stem>_loss.<ext>" / "<stem>_acc.<ext>".
    stem = args.params_name.split('.')[0]
    ext = args.params_name.split('.')[-1]
    checkpoint_name_loss = os.path.join(args.checkpoint, stem + '_loss.' + ext)
    checkpoint_name_acc = os.path.join(args.checkpoint, stem + '_acc.' + ext)

    # Resume from the best-loss checkpoint when requested.
    if args.resume != 0:
        logging.info('Resuming from checkpoint...')
        checkpoint = torch.load(checkpoint_name_loss)
        best_loss = checkpoint['loss']
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']
        history = checkpoint['history']
        net.load_state_dict(checkpoint['net'])
    else:
        best_loss = float('inf')
        best_acc = 0.0
        start_epoch = 0
        history = {
            'train_loss': [],
            'train_acc': [],
            'test_loss': [],
            'test_acc': []
        }
    end_epoch = start_epoch + args.num_epoch

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        net = nn.DataParallel(net)
    net.to(device)

    # ---- data ------------------------------------------------------------
    img_size = args.img_size
    # Training augmentation: random flips/rotation; test: resize only.
    train_aug = Compose([
        Resize(size=(img_size, img_size)),
        RandomHorizontallyFlip(),
        RandomVerticallyFlip(),
        RandomRotate(90),
        ToTensor(),
        Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])
    test_aug = Compose([
        Resize(size=(img_size, img_size)),
        ToTensor(),
        Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])
    train_dataset = breast_classify_inbreast(root=train_root,
                                             csv_file=train_csv,
                                             transform=train_aug)
    test_dataset = breast_classify_inbreast(root=train_root,
                                            csv_file=test_csv,
                                            transform=test_aug)

    # NOTE: "weighted sampling" here only sets per-class *loss* weights;
    # both branches build the same shuffled DataLoader.
    if args.weighted_sampling == 1:
        weights = torch.FloatTensor([1.0, 1.0, 1.5, 5.0]).to(device)
    else:
        weights = None
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                              num_workers=4, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                             num_workers=4, shuffle=True)

    # ---- loss function, optimizer and scheduler ---------------------------
    # FIX: `size_average=True` is deprecated; `reduction='mean'` is the
    # exact equivalent.
    criterion = nn.NLLLoss(weight=weights, reduction='mean').to(device)
    optimizer = Adam(net.parameters(), lr=args.lr, amsgrad=True)
    scheduler = None
    if args.lr_policy == 'StepLR':
        scheduler = StepLR(optimizer, step_size=30, gamma=0.5)
    elif args.lr_policy == 'PolyLR':
        scheduler = PolyLR(optimizer, max_epoch=end_epoch, power=0.9)
    elif args.lr_policy != 'None':
        # FIX: the original raised a NameError later when lr_policy was an
        # unknown value; fail fast with a clear message instead.
        raise ValueError('unknown lr_policy: {}'.format(args.lr_policy))

    def _save_checkpoint(path, test_loss_epoch, test_acc_epoch, epoch):
        """Save the current model state; unwrap DataParallel so the
        checkpoint loads on a single GPU."""
        save_model = net
        if torch.cuda.device_count() > 1:
            save_model = list(net.children())[0]
        state = {
            'net': save_model.state_dict(),
            'loss': test_loss_epoch,
            'acc': test_acc_epoch,
            'epoch': epoch + 1,
            'history': history
        }
        torch.save(state, path)

    # ---- training process --------------------------------------------------
    logging.info('Start Training For Breast Density Classification')
    for epoch in range(start_epoch, end_epoch):
        ts = time.time()

        # -- train --
        net.train()
        train_loss = 0.
        train_acc = 0.
        for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader),
                                                 total=len(train_loader)):
            inputs = inputs.to(device)
            targets = targets.to(device).long()
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(F.log_softmax(outputs, dim=1), targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            # FIX: vectorized count (same value as float(sum(...)) but
            # avoids a slow Python-level iteration over the tensor).
            train_acc += float((outputs.argmax(dim=1) == targets).sum().item())
        train_acc_epoch = train_acc / (len(train_loader.dataset))
        train_loss_epoch = train_loss / (batch_idx + 1)
        history['train_loss'].append(train_loss_epoch)
        history['train_acc'].append(train_acc_epoch)

        # FIX: step the LR scheduler *after* the epoch's optimizer updates
        # (required order since PyTorch 1.1); the original stepped it at the
        # top of the epoch, skipping the initial learning rate.
        if scheduler is not None:
            scheduler.step()

        # -- test --
        net.eval()
        test_loss = 0.
        test_acc = 0.
        for batch_idx, (inputs, targets) in tqdm(
                enumerate(test_loader),
                total=int(len(test_loader.dataset) / args.batch_size) + 1):
            with torch.no_grad():
                inputs = inputs.to(device)
                targets = targets.to(device).long()
                outputs = net(inputs)
                loss = criterion(F.log_softmax(outputs, dim=1), targets)
                test_acc += float((outputs.argmax(dim=1) == targets).sum().item())
                test_loss += loss.item()
        test_loss_epoch = test_loss / (batch_idx + 1)
        test_acc_epoch = test_acc / (len(test_loader.dataset))
        history['test_loss'].append(test_loss_epoch)
        history['test_acc'].append(test_acc_epoch)

        time_cost = time.time() - ts
        logging.info(
            'epoch[%d/%d]: train_loss: %.3f | train_acc: %.3f | test_loss: %.3f | test_acc: %.3f || time: %.1f'
            % (epoch + 1, end_epoch, train_loss_epoch, train_acc_epoch,
               test_loss_epoch, test_acc_epoch, time_cost))

        # -- save checkpoints --
        if test_loss_epoch < best_loss:
            logging.info('Loss checkpoint Saving...')
            _save_checkpoint(checkpoint_name_loss, test_loss_epoch,
                             test_acc_epoch, epoch)
            best_loss = test_loss_epoch
        if test_acc_epoch > best_acc:
            logging.info('Acc checkpoint Saving...')
            _save_checkpoint(checkpoint_name_acc, test_loss_epoch,
                             test_acc_epoch, epoch)
            best_acc = test_acc_epoch
def Train(train_root, train_csv, test_csv):
    """Train a 2-class breast segmentation network.

    Saves a checkpoint whenever the *training* Dice score improves.

    Args:
        train_root: root directory containing the images.
        train_csv: csv file listing the training samples.
        test_csv: csv file listing the test samples.
    """
    # ---- configuration & reproducibility --------------------------------
    args = parse_args()
    # record
    record_params(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_order
    torch.manual_seed(args.torch_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.torch_seed)
    np.random.seed(args.torch_seed)
    random.seed(args.torch_seed)
    if args.cudnn == 0:
        cudnn.benchmark = False
    else:
        cudnn.benchmark = True
        cudnn.deterministic = True

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ---- model -----------------------------------------------------------
    num_classes = 2
    net = build_model(args.model_name, num_classes)
    params_name = '{}_r{}.pkl'.format(args.model_name, args.repetition)

    start_epoch = 0
    history = {
        'train_loss': [],
        'test_loss': [],
        'train_dice': [],
        'test_dice': []
    }
    end_epoch = start_epoch + args.num_epoch

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        net = nn.DataParallel(net)
    net.to(device)

    # ---- data ------------------------------------------------------------
    # Train and test share the same deterministic resize/normalize pipeline.
    train_aug = Compose([
        Resize(size=(args.img_size, args.img_size)),
        ToTensor(),
        Normalize(mean=args.data_mean, std=args.data_std)
    ])
    test_aug = Compose([
        Resize(size=(args.img_size, args.img_size)),
        ToTensor(),
        Normalize(mean=args.data_mean, std=args.data_std)
    ])
    train_dataset = breast_seg(root=train_root, csv_file=train_csv,
                               transform=train_aug)
    test_dataset = breast_seg(root=train_root, csv_file=test_csv,
                              transform=test_aug)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                              num_workers=4, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                             num_workers=4, shuffle=False)

    # ---- loss function, optimizer and scheduler ---------------------------
    cedice_weight = torch.tensor(args.cedice_weight)
    ceclass_weight = torch.tensor(args.ceclass_weight)
    diceclass_weight = torch.tensor(args.diceclass_weight)
    if args.loss == 'ce':
        criterion = CrossEntropyLoss2d(weight=ceclass_weight).to(device)
    elif args.loss == 'dice':
        criterion = MulticlassDiceLoss(weight=diceclass_weight).to(device)
    elif args.loss == 'cedice':
        criterion = CEMDiceLoss(cediceweight=cedice_weight,
                                ceclassweight=ceclass_weight,
                                diceclassweight=diceclass_weight).to(device)
    else:
        # FIX: the original only printed a message and then crashed later
        # with a NameError on the undefined `criterion`; fail fast instead.
        raise ValueError('Do not have this loss: {}'.format(args.loss))

    optimizer = Adam(net.parameters(), lr=args.lr, amsgrad=True)
    ## scheduler
    scheduler = None  # FIX: avoids NameError on unrecognized lr_policy
    if args.lr_policy == 'StepLR':
        scheduler = StepLR(optimizer, step_size=30, gamma=0.5)
    elif args.lr_policy == 'PolyLR':
        scheduler = PolyLR(optimizer, max_epoch=end_epoch, power=0.9)

    def _evaluate(loader):
        """Sample-weighted average loss and Dice of `net` over `loader`
        (no gradients; caller is responsible for net.eval())."""
        total_loss = 0.
        total_dice = 0.
        count = 0
        for _, (imgs, _, targets) in tqdm(
                enumerate(loader),
                total=int(len(loader.dataset) / args.batch_size)):
            with torch.no_grad():
                imgs = imgs.to(device)
                targets = targets.to(device)
                outputs = net(imgs)
                loss = criterion(outputs, targets).mean()
                count += imgs.shape[0]
                total_loss += loss.item() * imgs.shape[0]
                total_dice += Dice_fn(outputs, targets).item()
        return total_loss / float(count), total_dice / float(count)

    # ---- training process --------------------------------------------------
    logging.info('Start Training For Breast Seg')
    besttraindice = 0.
    for epoch in range(start_epoch, end_epoch):
        ts = time.time()

        # -- optimize for one epoch --
        net.train()
        for batch_idx, (imgs, _, targets) in tqdm(
                enumerate(train_loader),
                total=int(len(train_loader.dataset) / args.batch_size)):
            imgs = imgs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            outputs = net(imgs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        # -- evaluate on test set, then re-score the training set --
        # (the original duplicated this loop twice; it is shared now)
        net.eval()
        test_loss_epoch, test_dice_epoch = _evaluate(test_loader)
        history['test_loss'].append(test_loss_epoch)
        history['test_dice'].append(test_dice_epoch)

        train_loss_epoch, train_dice_epoch = _evaluate(train_loader)
        history['train_loss'].append(train_loss_epoch)
        history['train_dice'].append(train_dice_epoch)

        time_cost = time.time() - ts
        logging.info(
            'epoch[%d/%d]: train_loss: %.3f | test_loss: %.3f | train_dice: %.3f | test_dice: %.3f || time: %.1f'
            % (epoch + 1, end_epoch, train_loss_epoch, test_loss_epoch,
               train_dice_epoch, test_dice_epoch, time_cost))

        # Step the scheduler at the end of the epoch (correct order).
        if scheduler is not None:
            scheduler.step()

        # -- save checkpoint when the training Dice improves --
        if train_dice_epoch > besttraindice:
            besttraindice = train_dice_epoch
            logging.info('Besttraindice Checkpoint {} Saving...'.format(epoch + 1))
            save_model = net
            if torch.cuda.device_count() > 1:
                # Unwrap DataParallel so the weights load on a single GPU.
                save_model = list(net.children())[0]
            state = {
                'net': save_model.state_dict(),
                'loss': test_loss_epoch,
                'dice': test_dice_epoch,
                'epoch': epoch + 1,
                'history': history
            }
            savecheckname = os.path.join(
                args.checkpoint,
                params_name.split('.pkl')[0] + '_besttraindice.' +
                params_name.split('.')[-1])
            torch.save(state, savecheckname)