def load_model(path):
    """Deserialize a training checkpoint and rebuild the model it describes.

    Parameters
    ----------
    path : str
        Filesystem path of a ``torch.save``-d checkpoint dict ("package").

    Returns
    -------
    (model, package)
        The reconstructed ``DeepSpeech`` model (with weights restored) and
        the raw checkpoint dict it was built from.

    Raises
    ------
    NotImplementedError
        If the checkpoint's ``'name'`` entry is not ``'deepspeech'``.
    """
    # Map all tensors onto CPU storage so a GPU-trained checkpoint loads
    # on a machine without CUDA.
    package = torch.load(path, map_location=lambda storage, _loc: storage)
    if package['name'] != 'deepspeech':
        raise NotImplementedError
    # Import lazily so merely importing this module does not pull in the
    # model package.
    from deepspeech.model import DeepSpeech
    model = DeepSpeech(package['model_conf'], package['audio_conf'],
                       package['labels'])
    model.load_state_dict(package['state_dict'])
    return model, package
def load_model(device, model_path, use_half):
    """Load a DeepSpeech checkpoint ready for inference.

    The model is put into eval mode, moved onto *device*, and optionally
    cast to half precision when *use_half* is truthy.
    """
    net = DeepSpeech.load_model(model_path)
    net.eval()
    net = net.to(device)
    return net.half() if use_half else net
def __init__(self, continue_from=None, sample_rate=None, window_size=None,
             window_stride=None, window=None, noise_dir=None, noise_prob=None,
             noise_min=None, noise_max=None, labels_path=None, hidden_size=None,
             hidden_layers=None, bidirectional=None, rnn_type=None):
    """Build the wrapped DeepSpeech model.

    Two construction paths: resume from a saved checkpoint when
    *continue_from* is non-empty, otherwise assemble a fresh model from
    the remaining keyword arguments.
    """
    if continue_from:
        # Resuming: everything needed is stored inside the checkpoint package.
        print("Loading checkpoint model %s" % continue_from)
        checkpoint = torch.load(continue_from,
                                map_location=lambda storage, _loc: storage)
        self._model = DeepSpeech.load_model_package(checkpoint)
        return
    # Fresh model: read the label alphabet and assemble the audio
    # front-end configuration.
    with open(labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    audio_conf = dict(sample_rate=sample_rate,
                      window_size=window_size,
                      window_stride=window_stride,
                      window=window,
                      noise_dir=noise_dir,
                      noise_prob=noise_prob,
                      noise_levels=(noise_min, noise_max))
    rnn_type = rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    self._model = DeepSpeech(rnn_hidden_size=hidden_size,
                             nb_layers=hidden_layers,
                             labels=labels,
                             rnn_type=supported_rnns[rnn_type],
                             audio_conf=audio_conf,
                             bidirectional=bidirectional)
# --- Language-model hyper-parameter tuning (script fragment) ---
# NOTE(review): this chunk starts mid-script; `parser`, `add_decoder_args`,
# `DeepSpeech`, `BeamCTCDecoder`, `np` and `sys` must come from earlier code.
# NOTE(review): `type=float` for candidate *counts* looks odd — presumably
# these should be ints; confirm against how they are consumed downstream.
parser.add_argument('--lm-num-alphas', default=45, type=float,
                    help='Number of alpha candidates for tuning')
parser.add_argument('--lm-num-betas', default=8, type=float,
                    help='Number of beta candidates for tuning')
# Append the shared decoder CLI flags (beam width, LM paths, workers, ...).
parser = add_decoder_args(parser)
args = parser.parse_args()
if args.lm_path is None:
    print("error: LM must be provided for tuning")
    sys.exit(1)
model = DeepSpeech.load_model(args.model_path)
# Pre-computed acoustic-model outputs, decoded repeatedly while sweeping
# alpha/beta so the expensive forward pass is not re-run per candidate.
saved_output = np.load(args.saved_output)


def init(beam_width, blank_index, lm_path):
    # Worker-pool initializer: build one BeamCTCDecoder per process and
    # publish it via a module-level global for decode_dataset to use.
    global decoder
    decoder = BeamCTCDecoder(model.labels, lm_path=lm_path,
                             beam_width=beam_width,
                             num_processes=args.lm_workers,
                             blank_index=blank_index)


def decode_dataset(params):
    # Decode with one (alpha, beta) candidate pair.
    # NOTE(review): function body is truncated in this view — it continues
    # beyond this chunk.
    lm_alpha, lm_beta = params
# --- Training setup (script fragment): resume-or-fresh model, then data ---
# NOTE(review): this chunk starts mid-scope; `model`, `package`, `args`,
# `device`, `train_results`, `val_results`, `batch_size`, `Model`,
# `SpectrogramDataset`, `LabelDecoder` and `BucketingSampler` come from
# earlier code not visible here.
audio_conf = model.audio_conf
model_conf = model.model_conf
if not args.finetune:  # Don't want to restart training
    # Restore optimizer state and per-epoch history from the checkpoint.
    optim_state = package['optim_dict']
    start_epoch = int(package.get(
        'epoch', 0)) + 1  # Index start at 0 for training
    # NOTE(review): int() truncates what is presumably a float loss —
    # confirm this is intended and not a float() typo.
    train_loss = int(package.get('avg_loss', 0))
    for i in range(start_epoch):
        train_results[i] = package['train_results'][i]
        val_results[i] = package['val_results'][i]
    # Best WER so far = minimum over the epochs already completed.
    best_wer = float(val_results[:start_epoch].min())
else:
    # Fine-tuning: rebuild a fresh model around the checkpoint's configs.
    with open(args.labels_path) as label_file:
        labels = json.load(label_file)
    model = Model(model_conf, audio_conf, labels)
# NOTE(review): in the non-finetune branch `labels` is never assigned here,
# yet it is used below — presumably defined earlier in the script; verify.
model = model.to(device)
# Data inputs configuration
train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                   manifest_filepath=args.train_manifest,
                                   labels=labels)
val_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                 manifest_filepath=args.val_manifest,
                                 labels=labels)
label_decoder = LabelDecoder(labels)
if not args.distributed:
    # Single-process run: plain bucketing samplers group utterances of
    # similar length to reduce padding.
    train_sampler = BucketingSampler(train_dataset, batch_size=batch_size)
    val_sampler = BucketingSampler(val_dataset, batch_size=batch_size)
# --- Training setup (script fragment, distributed variant) ---
# NOTE(review): near-duplicate of the previous setup fragment; starts
# mid-scope (`model`, `package`, `args`, `device`, result arrays,
# `batch_size` and the dataset/sampler classes are defined earlier).
audio_conf = model.audio_conf
model_conf = model.model_conf
if not args.finetune:  # Don't want to restart training
    # Restore optimizer state and per-epoch history from the checkpoint.
    optim_state = package['optim_dict']
    start_epoch = int(package.get(
        'epoch', 0)) + 1  # Index start at 0 for training
    # NOTE(review): int() truncates what is presumably a float loss —
    # confirm this is intended and not a float() typo.
    train_loss = int(package.get('avg_loss', 0))
    for i in range(start_epoch):
        train_results[i] = package['train_results'][i]
        val_results[i] = package['val_results'][i]
    # Best WER so far = minimum over the epochs already completed.
    best_wer = float(val_results[:start_epoch].min())
else:
    # Fine-tuning: rebuild a fresh model around the checkpoint's configs.
    with open(args.labels_path) as label_file:
        labels = json.load(label_file)
    model = Model(model_conf, audio_conf, labels)
# NOTE(review): in the non-finetune branch `labels` is never assigned here,
# yet it is used below — presumably defined earlier in the script; verify.
model = model.to(device)
train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                   manifest_filepath=args.train_manifest,
                                   labels=labels)
val_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                 manifest_filepath=args.val_manifest,
                                 labels=labels)
label_decoder = LabelDecoder(labels)
if not args.distributed:
    # Single-process run: plain bucketing samplers.
    train_sampler = BucketingSampler(train_dataset, batch_size=batch_size)
    val_sampler = BucketingSampler(val_dataset, batch_size=batch_size)
else:
    # Distributed run: shard batches across workers.
    # NOTE(review): truncated in this view — the DistributedBucketingSampler
    # call continues beyond this chunk.
    train_sampler = DistributedBucketingSampler(
def __init__(
        self,
        model=None,
        seed=None,
        cuda=None,
        id=None,
        epochs=None,
        batch_size=None,
        save_folder=None,
        lr=None,
        momentum=None,
        opt_level=None,
        keep_batchnorm_fp32=None,
        loss_scale=None,
        train_manifest=None,
        val_manifest=None,
        no_sorta_grad=None,
        num_workers=None,
        no_shuffle=None,
        max_norm=None,
        checkpoint_per_batch=None,
        model_path=None,
        learning_anneal=None,
        checkpoint=None,
        augment=None,
        continue_from=None,
):
    """Set up a DeepSpeech training session.

    Seeds all RNGs, builds datasets/loaders, wraps the model and an SGD
    optimizer with apex ``amp``, and — when *continue_from* names a
    checkpoint — restores optimizer state, epoch/iteration counters and
    per-epoch result history.

    Fixes relative to the previous revision:
      * fresh-run defaults are now assigned BEFORE the checkpoint restore,
        which previously overwrote the restored ``avg_loss``/``start_epoch``/
        ``start_iter``/``optim_state``/result tensors and best CER, silently
        breaking resume;
      * ``start_iter = 0`` was a dead local (missing ``self.``), leaving
        ``self.start_iter`` as ``None``;
      * ``continue_from not in [""]`` was True for the default ``None`` and
        crashed in ``torch.load`` — a plain truthiness test handles both
        ``None`` and ``""``;
      * ``np.float("inf")`` uses an alias removed in NumPy >= 1.24;
        plain ``float("inf")`` is equivalent.
    """
    self.debug = Debugger(name=id, trace_dir="/tmp")
    # Set seeds for determinism across torch, CUDA, numpy and random.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    labels = model.labels
    audio_conf = model.audio_conf

    device = torch.device("cuda" if cuda else "cpu")
    os.makedirs(save_folder, exist_ok=True)  # Ensure save folder exists

    # Fresh-run defaults; overwritten below when resuming from a checkpoint.
    self.loss_results, self.cer_results, self.wer_results = torch.Tensor(epochs), \
                                                            torch.Tensor(epochs), \
                                                            torch.Tensor(epochs)
    best_cer = None
    self.avg_loss, self.start_epoch, self.start_iter, self.optim_state = 0, 0, 0, None

    if continue_from:  # Resume training from a saved checkpoint.
        package = torch.load(continue_from,
                             map_location=lambda storage, loc: storage)
        self.optim_state = package['optim_dict']
        self.start_epoch = int(package.get(
            'epoch', 1)) - 1  # Index start at 0 for training
        self.start_iter = package.get('iteration', None)
        if self.start_iter is None:
            # Model was saved after the epoch finished: start the next epoch.
            self.start_epoch += 1
            self.start_iter = 0
        else:
            self.start_iter += 1
        # NOTE(review): int() truncates the restored (float) average loss —
        # kept for compatibility with old checkpoints; confirm intended.
        self.avg_loss = int(package.get('avg_loss', 0))
        self.loss_results, self.cer_results, self.wer_results = package['loss_results'], \
                                                                package['cer_results'], \
                                                                package['wer_results']
        best_cer = self.cer_results[self.start_epoch]

    decoder = GreedyDecoder(labels)
    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_sampler = BucketingSampler(train_dataset, batch_size=batch_size)
    train_loader = AudioDataLoader(train_dataset,
                                   num_workers=num_workers,
                                   batch_sampler=train_sampler)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=batch_size,
                                  num_workers=num_workers)
    # Sorta-grad: first epoch normally runs in length order; shuffle
    # immediately when resuming mid-run or when sorta-grad is disabled.
    if (not no_shuffle and self.start_epoch != 0) or no_sorta_grad:
        print("Shuffling batches for the following epochs")
        train_sampler.shuffle(self.start_epoch)

    model = model.to(device)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=lr,
                                momentum=momentum,
                                nesterov=True,
                                weight_decay=1e-5)
    # Restore optimizer state BEFORE amp.initialize wraps it.
    if self.optim_state is not None:
        optimizer.load_state_dict(self.optim_state)
    model, optimizer = amp.initialize(
        model,
        optimizer,
        opt_level=opt_level,
        keep_batchnorm_fp32=keep_batchnorm_fp32,
        loss_scale=loss_scale)
    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    self.optimizer = optimizer
    self.lr = lr
    self.learning_anneal = learning_anneal
    self.no_shuffle = no_shuffle
    self.train_sampler = train_sampler
    self.model = model
    self.model_path = model_path
    self.best_cer = best_cer
    self.checkpoint = checkpoint
    self.save_folder = save_folder
    self.test_loader = test_loader
    self.device = device
    self.decoder = decoder
    self.checkpoint_per_batch = checkpoint_per_batch
    self.max_norm = max_norm
    self.train_loader = train_loader
    self.last_update = 0
    self.epoch = 0
    self.epochs = 0
    self.time_epoch = 0
    # Best/average decoder metrics; "inf" = not yet measured.
    self.losses_decoder = {
        "best": {
            "wer": float("inf"),
            "cer": float("inf")
        },
        "avg": {
            "wer": float("inf"),
            "cer": float("inf")
        }
    }
    self.losses = {
        "history": {
            "train": [],
            "dev": []
        },
        "avg": {
            "train": float("inf"),
            "dev": float("inf")
        },
        "best": {
            "dev": float("inf")
        }
    }
def run(self, epochs, start_epoch=0, early_stop=1):
    """Train for up to *epochs* epochs, evaluating CER/WER after each.

    Saves the model whenever validation CER improves, anneals the learning
    rate every epoch, and stops early when CER has not improved for
    *early_stop* consecutive epochs.

    NOTE(review): early stopping is implemented by raising AssertionError
    from the per-epoch `assert` below — this is stripped under `python -O`,
    so early stop silently disappears with optimizations enabled; confirm
    this is acceptable or replace with an explicit check.
    """
    criterion = CTCLoss()
    # NOTE(review): batch_time/data_time are created but never updated in
    # this view — presumably leftovers from timing instrumentation.
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    self.last_update = start_epoch
    self.epochs = epochs
    for self.epoch in range(start_epoch, epochs):
        try:
            t0 = time.time()
            self.model.train()
            # Resume mid-epoch when a batch-level checkpoint set start_iter.
            for i, (data) in tqdm(enumerate(self.train_loader,
                                            start=self.start_iter),
                                  desc=self.get_description(),
                                  total=len(self.train_loader)):
                # enumerate() starts at start_iter, so the loop would
                # otherwise run past the sampler's batch count.
                if i == len(self.train_sampler):
                    break
                inputs, targets, input_percentages, target_sizes = data
                # print(inputs)
                # Recover per-utterance frame counts from the padded batch:
                # fraction-of-max times the padded time dimension.
                input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
                # measure data loading time
                inputs = inputs.to(self.device)
                out, output_sizes = self.model(inputs, input_sizes)
                out = out.transpose(0, 1)  # TxNxH
                float_out = out.float()  # ensure float32 for loss
                loss = criterion(float_out, targets, output_sizes,
                                 target_sizes).to(self.device)
                loss = loss / inputs.size(
                    0)  # average the loss by minibatch
                loss_value = loss.item()
                # Check to ensure valid loss was calculated (guards against
                # inf/nan from CTC on degenerate batches).
                valid_loss, error = check_loss(loss, loss_value)
                if valid_loss:
                    self.optimizer.zero_grad()
                    # compute gradient via amp's loss scaling (mixed precision)
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.max_norm)
                    self.optimizer.step()
                else:
                    print(error)
                    print('Skipping grad update')
                    loss_value = 0
                self.avg_loss += loss_value
                losses.update(loss_value, inputs.size(0))
                # measure elapsed time
                # Periodic intra-epoch checkpoint so long epochs can resume.
                if self.checkpoint_per_batch > 0 and i > 0 and (
                        i + 1) % self.checkpoint_per_batch == 0:
                    file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth' % (
                        self.save_folder, self.epoch + 1, i + 1)
                    print("Saving checkpoint model to %s" % file_path)
                    torch.save(
                        DeepSpeech.serialize(
                            self.model,
                            optimizer=self.optimizer,
                            epoch=self.epoch,
                            iteration=i,
                            loss_results=self.loss_results,
                            wer_results=self.wer_results,
                            cer_results=self.cer_results,
                            avg_loss=self.avg_loss),
                        file_path)
                # Free graph references promptly to reduce peak GPU memory.
                del loss, out, float_out
            self.avg_loss /= len(self.train_sampler)
            self.losses["avg"]["train"] = self.avg_loss
            self.start_iter = 0  # Reset start iteration for next epoch
            with torch.no_grad():
                wer, cer, output_data = evaluate(
                    test_loader=self.test_loader,
                    device=self.device,
                    model=self.model,
                    decoder=self.decoder,
                    target_decoder=self.decoder,
                    verbose_once=True)
            self.loss_results[self.epoch] = self.avg_loss
            self.cer_results[self.epoch] = cer
            self.losses_decoder["avg"]["cer"] = cer
            # Anneal the learning rate of every parameter group.
            for g in self.optimizer.param_groups:
                g['lr'] = g['lr'] / self.learning_anneal
                self.lr = g['lr']
            # Save the best-so-far model (by validation CER).
            if self.best_cer is None or self.best_cer > cer:
                torch.save(
                    DeepSpeech.serialize(self.model,
                                         optimizer=self.optimizer,
                                         epoch=self.epoch,
                                         loss_results=self.loss_results,
                                         wer_results=self.wer_results,
                                         cer_results=self.cer_results),
                    self.model_path)
                self.best_cer = cer
                self.last_update = self.epoch
                self.losses_decoder["best"]["cer"] = cer
            self.avg_loss = 0
            if not self.no_shuffle:
                # print("Shuffling batches...")
                self.train_sampler.shuffle(self.epoch)
            self.time_epoch = int(time.time() - t0)
            # Early-stop trigger: no improvement for `early_stop` epochs.
            assert self.epoch - self.last_update < early_stop
        except AssertionError:
            print("Early stop")
            return
"""Classify a single audio file as containing swearing or not.

Usage: python <script> path/to/audio.wav
"""
import argparse

import numpy as np

from deepspeech.model import DeepSpeech
from deepspeech.data.data_loader import SpectrogramParser
from noswear.model import load_model

# CLI: one positional audio file. FileType('r') validates that the file
# exists/opens; only its .name is used below.
parser = argparse.ArgumentParser()
parser.add_argument('audio_file',
                    type=argparse.FileType('r'),
                    help='File to classify')
args = parser.parse_args()

# The pretrained acoustic model supplies the spectrogram front-end config.
base_model = DeepSpeech.load_model('models/librispeech_pretrained.pth')
audio_conf = DeepSpeech.get_audio_conf(base_model)
parser = SpectrogramParser(audio_conf, normalize=True)

# Binary swear/no-swear classifier stacked on top of the base model.
net = load_model(base_model, {'f_pickle': 'models/binary_clf.pkl'})
print(net)

fpath = args.audio_file.name
audio = parser.parse_audio(fpath)
# Batch of one: lengths vector plus the spectrogram with a leading batch axis.
X = {'lens': np.array([audio.shape[1]]), 'X': np.array(audio)[None]}
y_pred = net.predict(X)
# FIX: replaced the fragile `cond and a or b` pseudo-ternary (which breaks
# whenever `a` is falsy) with a real conditional expression.
print('swear! :(' if y_pred[0] else 'noswear :)')