def run_transcribe(audio_path: str,
                   spect_parser: SpectrogramParser,
                   model: DeepSpeech,
                   decoder: Decoder,
                   device: torch.device,
                   use_half: bool):
    """Transcribe one audio file with both the given decoder and a greedy decoder.

    Args:
        audio_path: Path handed to ``spect_parser.parse_audio``.
        spect_parser: Parser that turns the audio file into a spectrogram.
        model: Acoustic model; called as ``model(spect, input_sizes)``.
        decoder: Primary decoder applied to the model output.
        device: Device the spectrogram is moved to before inference.
        use_half: When true, the spectrogram is cast to half precision.

    Returns:
        A 4-tuple ``(decoded_output, greedy_output, decoded_offsets,
        greedy_offsets)`` -- outputs first, then offsets, with the
        caller-supplied decoder preceding the greedy one in each pair.
    """
    features = spect_parser.parse_audio(audio_path).contiguous()
    # Reshape the 2-D spectrogram to 4-D with two leading singleton axes.
    batch = features.view(1, 1, features.size(0), features.size(1)).to(device)
    if use_half:
        batch = batch.half()

    lengths = torch.IntTensor([batch.size(3)]).int()
    out, output_sizes = model(batch, lengths)

    primary_text, primary_offsets = decoder.decode(out, output_sizes)

    # Additionally decode the same output with a greedy decoder.
    greedy = GreedyDecoder(labels=model.labels,
                           blank_index=model.labels.index('_'))
    greedy_text, greedy_offsets = greedy.decode(out, output_sizes)

    return primary_text, greedy_text, primary_offsets, greedy_offsets
def __init__(self,
             labels: List,
             model_cfg: Union[UniDirectionalConfig, BiDirectionalConfig,
                              ConvolutionConfig],
             precision: int,
             optim_cfg: Union[AdamConfig, SGDConfig],
             spect_cfg: SpectConfig):
    """Build the layers, loss and evaluation metrics of the model.

    Args:
        labels: Output labels; must contain '_' (its index is the CTC blank).
        model_cfg: Architecture config. Its OmegaConf type decides whether
            the convolutional stack or the RNN stack is constructed.
        precision: Stored as ``self.precision``.
        optim_cfg: Stored as ``self.optim_cfg``.
        spect_cfg: Spectrogram config; ``sample_rate`` and ``window_size``
            determine the feature size fed to the layers after the
            convolutional front-end.
    """
    super().__init__()
    # Must run before any other local is bound so it sees the init arguments.
    self.save_hyperparameters()

    self.model_cfg = model_cfg
    self.precision = precision
    self.optim_cfg = optim_cfg
    self.spect_cfg = spect_cfg

    cfg_type = OmegaConf.get_type(model_cfg)
    self.convolutional = cfg_type is ConvolutionConfig
    self.bidirectional = cfg_type is BiDirectionalConfig
    self.labels = labels

    front_end = nn.Sequential(
        nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2),
                  padding=(20, 5)),
        nn.BatchNorm2d(32),
        nn.Hardtanh(0, 20, inplace=True),
        nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1),
                  padding=(10, 5)),
        nn.BatchNorm2d(32),
        nn.Hardtanh(0, 20, inplace=True),
    )
    self.conv = MaskConv(front_end)

    # Feature size after the two convolutions, using the conv output
    # formula (W - F + 2P) / S + 1 along the frequency axis, then
    # multiplied by the 32 output channels.
    freq_bins = int(
        math.floor(
            (self.spect_cfg.sample_rate * self.spect_cfg.window_size) / 2)
        + 1)
    for padding, kernel in ((20, 41), (10, 21)):
        freq_bins = int(math.floor(freq_bins + 2 * padding - kernel) / 2 + 1)
    rnn_input_size = freq_bins * 32

    if self.convolutional:
        self.deep_conv, self.fc = self._conv_construct(rnn_input_size)
    else:
        self.rnns, self.lookahead, self.fc = self._rnn_construct(
            rnn_input_size)

    self.inference_softmax = InferenceBatchSoftmax()
    blank_index = self.labels.index('_')
    self.criterion = CTCLoss(blank=blank_index,
                             reduction='sum',
                             zero_infinity=True)

    # The same greedy decoder serves predictions and targets in validation.
    self.evaluation_decoder = GreedyDecoder(self.labels)
    self.wer = WordErrorRate(decoder=self.evaluation_decoder,
                             target_decoder=self.evaluation_decoder)
    self.cer = CharErrorRate(decoder=self.evaluation_decoder,
                             target_decoder=self.evaluation_decoder)
def evaluate(cfg: EvalConfig):
    """Evaluate a saved model on the test set and print average WER/CER.

    Args:
        cfg: Evaluation config. Reads ``model.cuda``, ``model.model_path``,
            ``model.precision``, ``lm``, ``test_path``, ``batch_size`` and
            ``num_workers``.
    """
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device, model_path=cfg.model.model_path)

    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    # Reference transcripts are always decoded greedily, with '_' as blank.
    target_decoder = GreedyDecoder(labels=model.labels,
                                   blank_index=model.labels.index('_'))

    dataset_path = hydra.utils.to_absolute_path(cfg.test_path)
    test_dataset = SpectrogramDataset(audio_conf=model.spect_cfg,
                                      input_path=dataset_path,
                                      labels=model.labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=cfg.batch_size,
                                  num_workers=cfg.num_workers)

    wer, cer = run_evaluation_print(test_loader=test_loader,
                                    device=device,
                                    model=model,
                                    decoder=decoder,
                                    target_decoder=target_decoder,
                                    precision=cfg.model.precision)

    summary = ('Test Summary \t'
               'Average WER {wer:.3f}\t'
               'Average CER {cer:.3f}\t')
    print(summary.format(wer=wer, cer=cer))
def __init__(self, cfg):
    """Load the model, its labels, both decoders and the test data loader.

    Args:
        cfg: Config object. Reads ``model_path``, ``lm_path``,
            ``beam_width``, ``num_workers``, ``spect_cfg``, ``test_path``
            and ``batch_size``.
    """
    self.cfg = cfg
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')

    self.model = load_model(
        self.device, hydra.utils.to_absolute_path(self.cfg.model_path))

    # The checkpoint is read a second time to recover the label set stored
    # in its hyper-parameters.
    checkpoint_path = hydra.utils.to_absolute_path(self.cfg.model_path)
    self.ckpt = torch.load(checkpoint_path, map_location=self.device)
    self.labels = self.ckpt['hyper_parameters']['labels']

    lm_path = hydra.utils.to_absolute_path(self.cfg.lm_path)
    self.decoder = BeamCTCDecoder(labels=self.labels,
                                  lm_path=lm_path,
                                  beam_width=self.cfg.beam_width,
                                  num_processes=self.cfg.num_workers,
                                  blank_index=self.labels.index('_'))
    self.target_decoder = GreedyDecoder(labels=self.labels,
                                        blank_index=self.labels.index('_'))

    dataset = SpectrogramDataset(
        audio_conf=self.cfg.spect_cfg,
        input_path=hydra.utils.to_absolute_path(cfg.test_path),
        labels=self.labels,
        normalize=True)
    self.test_loader = AudioDataLoader(dataset,
                                       batch_size=self.cfg.batch_size,
                                       num_workers=self.cfg.num_workers)
def evaluate(cfg: EvalConfig):
    """Evaluate a saved model on a test manifest and print average WER/CER.

    When ``cfg.save_output`` is truthy, the data returned by the evaluation
    run is also saved with ``torch.save`` to that (absolute-resolved) path.

    Args:
        cfg: Evaluation config. Reads ``model.cuda``, ``model.model_path``,
            ``model.use_half``, ``lm``, ``test_manifest``, ``batch_size``,
            ``num_workers``, ``save_output`` and ``verbose``.
    """
    half_precision = cfg.model.use_half
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=half_precision)

    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    # Reference transcripts are always decoded greedily, with '_' as blank.
    target_decoder = GreedyDecoder(model.labels,
                                   blank_index=model.labels.index('_'))

    manifest = hydra.utils.to_absolute_path(cfg.test_manifest)
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=manifest,
                                      labels=model.labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=cfg.batch_size,
                                  num_workers=cfg.num_workers)

    wer, cer, output_data = run_evaluation(test_loader=test_loader,
                                           device=device,
                                           model=model,
                                           decoder=decoder,
                                           target_decoder=target_decoder,
                                           save_output=cfg.save_output,
                                           verbose=cfg.verbose,
                                           use_half=cfg.model.use_half)

    summary = ('Test Summary \t'
               'Average WER {wer:.3f}\t'
               'Average CER {cer:.3f}\t')
    print(summary.format(wer=wer, cer=cer))

    if cfg.save_output:
        torch.save(output_data, hydra.utils.to_absolute_path(cfg.save_output))
def load_decoder(decoder_type, labels, lm_path, alpha, beta, cutoff_top_n,
                 cutoff_prob, beam_width, lm_workers):
    """Return a beam-search decoder for ``"beam"``, otherwise a greedy one.

    Args:
        decoder_type: ``"beam"`` selects ``BeamCTCDecoder``; any other value
            selects ``GreedyDecoder``.
        labels: Label set; must contain '_' when the greedy decoder is used.
        lm_path, alpha, beta, cutoff_top_n, cutoff_prob, beam_width:
            Forwarded unchanged to ``BeamCTCDecoder``.
        lm_workers: Forwarded to ``BeamCTCDecoder`` as ``num_processes``.

    Returns:
        The constructed decoder instance.
    """
    if decoder_type != "beam":
        return GreedyDecoder(labels=labels, blank_index=labels.index('_'))

    # Imported lazily so the beam decoder is only required when requested.
    from deepspeech_pytorch.decoder import BeamCTCDecoder

    return BeamCTCDecoder(labels=labels,
                          lm_path=lm_path,
                          alpha=alpha,
                          beta=beta,
                          cutoff_top_n=cutoff_top_n,
                          cutoff_prob=cutoff_prob,
                          beam_width=beam_width,
                          num_processes=lm_workers)
def load_decoder(labels, cfg: LMConfig):
    """Return the decoder selected by ``cfg.decoder_type``.

    For ``DecoderType.beam`` a ``BeamCTCDecoder`` is built; a non-empty
    ``cfg.lm_path`` is first rewritten in place to an absolute path.
    Any other decoder type yields a ``GreedyDecoder``.

    Args:
        labels: Label set; must contain '_' when the greedy decoder is used.
        cfg: Language-model / decoder config. May be mutated (``lm_path``).

    Returns:
        The constructed decoder instance.
    """
    if cfg.decoder_type != DecoderType.beam:
        return GreedyDecoder(labels=labels, blank_index=labels.index('_'))

    # Imported lazily so the beam decoder is only required when requested.
    from deepspeech_pytorch.decoder import BeamCTCDecoder

    if cfg.lm_path:
        cfg.lm_path = hydra.utils.to_absolute_path(cfg.lm_path)

    return BeamCTCDecoder(labels=labels,
                          lm_path=cfg.lm_path,
                          alpha=cfg.alpha,
                          beta=cfg.beta,
                          cutoff_top_n=cfg.cutoff_top_n,
                          cutoff_prob=cfg.cutoff_prob,
                          beam_width=cfg.beam_width,
                          num_processes=cfg.lm_workers)
def main():
    """Parse CLI arguments, load the model and decoder, and start the server.

    Populates the module-level globals ``model``, ``spect_parser``,
    ``decoder``, ``args`` and ``device`` that the request handlers use, then
    blocks in ``app.run``.
    """
    import argparse
    global model, spect_parser, decoder, args, device

    parser = argparse.ArgumentParser(
        description='DeepSpeech transcription server')
    parser.add_argument('--host',
                        type=str,
                        default='0.0.0.0',
                        help='Host to be used by the server')
    parser.add_argument('--port',
                        type=int,
                        default=8888,
                        help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()

    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')

    # Inference only: disable autograd globally for this process.
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        # Imported lazily so the beam decoder is only required when requested.
        from deepspeech_pytorch.decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(model.labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))

    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')

    # SECURITY: the default host is 0.0.0.0, so the server is reachable from
    # the network. debug=True would expose the interactive debugger, which
    # allows arbitrary code execution; keep it off. Diagnostics are still
    # available through the DEBUG log level set above.
    app.run(host=args.host, port=args.port, debug=False, use_reloader=False)
model = load_model(device, args.model_path) with open('labels.json') as label_file: labels = json.load(label_file) decoder = load_decoder(decoder_type=args.decoder, labels=labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta, cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob, beam_width=args.beam_width, lm_workers=args.lm_workers) target_decoder = GreedyDecoder(labels) test_dataset = SpectrogramDataset(audio_conf=model.audio_conf, manifest_filepath=args.test_manifest, labels=labels, normalize=True) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) wer, cer, output_data = evaluate(test_loader=test_loader, device=device, model=model, decoder=decoder, target_decoder=target_decoder,
def train(cfg):
    """Run the full training loop described by ``cfg``.

    Steps, in order: start a wandb run, seed the RNGs, optionally join a
    distributed process group (when ``LOCAL_RANK`` is set), pick a checkpoint
    handler, resume from or create the model/training state, build datasets,
    samplers and loaders, build the optimizer (SGD, AdamW or AdaHessian),
    then for each epoch train over all minibatches, evaluate, log, checkpoint
    and anneal the learning rate.

    Args:
        cfg: Training config. Reads the ``training``, ``data``, ``optim``,
            ``model``, ``checkpointing``, ``visualization`` and ``apex``
            sections.

    Raises:
        ValueError: If the checkpointing, model or optimizer config is not
            one of the recognised config types.
    """
    # Hyper-parameters recorded with the wandb run.
    config = dict(
        epochs=cfg.training.epochs,
        batch_size=cfg.data.batch_size,
        learning_rate=cfg.optim.learning_rate,
        rnn_type=cfg.model.rnn_type,
        hidden_size=cfg.model.hidden_size,
        hidden_layers=cfg.model.hidden_layers,
        optimizer=cfg.optim,
        # update_hessian=cfg.optim.update_each
    )
    # NOTE(review): wandb.init runs in every process and regardless of
    # cfg.visualization.wandb -- confirm this is intended.
    wandb.init(project="adahessian-deepspeech", config=config)

    # Set seeds for determinism
    torch.manual_seed(cfg.training.seed)
    torch.cuda.manual_seed_all(cfg.training.seed)
    np.random.seed(cfg.training.seed)
    random.seed(cfg.training.seed)

    # NOTE(review): torch.backends.cudnn.flags appears to be a context
    # manager; called without `with` this statement presumably has no
    # effect -- confirm whether cuDNN was meant to be disabled here.
    torch.backends.cudnn.flags(enabled=False)

    main_proc = True
    device = torch.device("cpu" if cfg.training.no_cuda else "cuda")

    is_distributed = os.environ.get(
        "LOCAL_RANK")  # If local rank exists, distributed env

    if is_distributed:
        # when using NCCL, on failures, surviving nodes will deadlock on NCCL ops
        # because NCCL uses a spin-lock on the device. Set this env var and
        # to enable a watchdog thread that will destroy stale NCCL communicators
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

        device_id = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(device_id)
        print(f"Setting CUDA Device to {device_id}")

        dist.init_process_group(backend=cfg.training.dist_backend.value)
        main_proc = device_id == 0  # Main process handles saving of models and reporting

    # Select the checkpoint backend from the config's dataclass type.
    if OmegaConf.get_type(cfg.checkpointing) == FileCheckpointConfig:
        checkpoint_handler = FileCheckpointHandler(cfg=cfg.checkpointing)
    elif OmegaConf.get_type(cfg.checkpointing) == GCSCheckpointConfig:
        checkpoint_handler = GCSCheckpointHandler(cfg=cfg.checkpointing)
    else:
        raise ValueError("Checkpoint Config has not been specified correctly.")

    # Loggers are only created on the main process.
    if main_proc and cfg.visualization.visdom:
        visdom_logger = VisdomLogger(id=cfg.visualization.id,
                                     num_epochs=cfg.training.epochs)
    if main_proc and cfg.visualization.tensorboard:
        tensorboard_logger = TensorBoardLogger(
            id=cfg.visualization.id,
            log_dir=to_absolute_path(cfg.visualization.log_dir),
            log_params=cfg.visualization.log_params)

    # An automatically discovered checkpoint overrides continue_from.
    if cfg.checkpointing.load_auto_checkpoint:
        latest_checkpoint = checkpoint_handler.find_latest_checkpoint()
        if latest_checkpoint:
            cfg.checkpointing.continue_from = latest_checkpoint

    if cfg.checkpointing.continue_from:  # Starting from previous model
        state = TrainingState.load_state(
            state_path=to_absolute_path(cfg.checkpointing.continue_from))
        model = state.model
        if cfg.training.finetune:
            state.init_finetune_states(cfg.training.epochs)

        if main_proc and cfg.visualization.visdom:  # Add previous scores to visdom graph
            visdom_logger.load_previous_values(state.epoch, state.results)
        if main_proc and cfg.visualization.tensorboard:  # Previous scores to tensorboard logs
            tensorboard_logger.load_previous_values(state.epoch, state.results)
    else:  # Initialise new model training
        with open(to_absolute_path(cfg.data.labels_path)) as label_file:
            labels = json.load(label_file)

        if OmegaConf.get_type(cfg.model) is BiDirectionalConfig:
            model = DeepSpeech(
                rnn_hidden_size=cfg.model.hidden_size,
                nb_layers=cfg.model.hidden_layers,
                labels=labels,
                rnn_type=supported_rnns[cfg.model.rnn_type.value],
                audio_conf=cfg.data.spect,
                bidirectional=True)
        elif OmegaConf.get_type(cfg.model) is UniDirectionalConfig:
            model = DeepSpeech(
                rnn_hidden_size=cfg.model.hidden_size,
                nb_layers=cfg.model.hidden_layers,
                labels=labels,
                rnn_type=supported_rnns[cfg.model.rnn_type.value],
                audio_conf=cfg.data.spect,
                bidirectional=False,
                context=cfg.model.lookahead_context)
        else:
            raise ValueError("Model Config has not been specified correctly.")

        state = TrainingState(model=model)
        state.init_results_tracking(epochs=cfg.training.epochs)

    # Data setup
    evaluation_decoder = GreedyDecoder(
        model.labels)  # Decoder used for validation
    train_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                       manifest_filepath=to_absolute_path(
                                           cfg.data.train_manifest),
                                       labels=model.labels,
                                       normalize=True,
                                       augmentation_conf=cfg.data.augmentation)
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=to_absolute_path(
                                          cfg.data.val_manifest),
                                      labels=model.labels,
                                      normalize=True)
    # Both samplers start at state.training_step so a resumed run continues
    # from the stored step.
    if not is_distributed:
        train_sampler = DSRandomSampler(dataset=train_dataset,
                                        batch_size=cfg.data.batch_size,
                                        start_index=state.training_step)
    else:
        train_sampler = DSElasticDistributedSampler(
            dataset=train_dataset,
            batch_size=cfg.data.batch_size,
            start_index=state.training_step)
    train_loader = AudioDataLoader(dataset=train_dataset,
                                   num_workers=cfg.data.num_workers,
                                   batch_sampler=train_sampler)
    test_loader = AudioDataLoader(dataset=test_dataset,
                                  num_workers=cfg.data.num_workers,
                                  batch_size=cfg.data.batch_size)

    model = model.to(device)
    parameters = model.parameters()
    # Select the optimizer from the config's dataclass type.
    if OmegaConf.get_type(cfg.optim) is SGDConfig:
        optimizer = torch.optim.SGD(parameters,
                                    lr=cfg.optim.learning_rate,
                                    momentum=cfg.optim.momentum,
                                    nesterov=True,
                                    weight_decay=cfg.optim.weight_decay)
    elif OmegaConf.get_type(cfg.optim) is AdamConfig:
        optimizer = torch.optim.AdamW(parameters,
                                      lr=cfg.optim.learning_rate,
                                      betas=cfg.optim.betas,
                                      eps=cfg.optim.eps,
                                      weight_decay=cfg.optim.weight_decay)
    elif OmegaConf.get_type(cfg.optim) is AdaHessianConfig:
        optimizer = AdaHessian(
            parameters,
            lr=cfg.optim.learning_rate,
            betas=cfg.optim.betas,
            eps=cfg.optim.eps,
            weight_decay=cfg.optim.weight_decay,
            update_each=cfg.optim.update_each,
            # average_conv_kernel=cfg.optim.average_conv_kernel,
            # hessian_power=cfg.optim.hessian_power
        )
        # cuDNN is switched off for the AdaHessian path only.
        torch.backends.cudnn.enabled = False
    else:
        raise ValueError("Optimizer has not been specified correctly.")

    # apex amp wraps the model/optimizer for every optimizer except AdaHessian.
    if OmegaConf.get_type(cfg.optim) is not AdaHessianConfig:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          enabled=not cfg.training.no_cuda,
                                          opt_level=cfg.apex.opt_level,
                                          loss_scale=cfg.apex.loss_scale)
    # Restore optimizer/amp state when resuming.
    if state.optim_state is not None:
        optimizer.load_state_dict(state.optim_state)
    if state.amp_state is not None:
        amp.load_state_dict(state.amp_state)

    # Track states for optimizer/amp
    state.track_optim_state(optimizer)
    if not cfg.training.no_cuda and OmegaConf.get_type(
            cfg.optim) is not AdaHessianConfig:
        state.track_amp_state(amp)

    if is_distributed:
        model = DistributedDataParallel(model, device_ids=[device_id])
    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    criterion = CTCLoss()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(state.epoch, cfg.training.epochs):
        model.train()
        end = time.time()
        start_epoch_time = time.time()
        state.set_epoch(epoch=epoch)
        train_sampler.set_epoch(epoch=epoch)
        train_sampler.reset_training_step(training_step=state.training_step)
        for i, (data) in enumerate(train_loader, start=state.training_step):
            state.set_training_step(training_step=i)
            inputs, targets, input_percentages, target_sizes = data
            # Scale the per-sample percentages by inputs.size(3); mul_ works
            # in place on input_percentages.
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = inputs.to(device)

            out, output_sizes = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH

            float_out = out.float()  # ensure float32 for loss
            loss = criterion(float_out, targets, output_sizes,
                             target_sizes).to(device)
            loss = loss / inputs.size(0)  # average the loss by minibatch
            loss_value = loss.item()

            # Check to ensure valid loss was calculated
            valid_loss, error = check_loss(loss, loss_value)
            if valid_loss:
                optimizer.zero_grad()

                # compute gradient
                if OmegaConf.get_type(cfg.optim) is AdaHessianConfig:
                    # create_graph=True keeps the graph of the backward pass.
                    loss.backward(create_graph=True)
                else:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               cfg.optim.max_norm)
                optimizer.step()
            else:
                print(error)
                print('Skipping grad update')
                loss_value = 0

            state.avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1),
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses))

            if main_proc and cfg.checkpointing.checkpoint_per_iteration:
                checkpoint_handler.save_iter_checkpoint_model(epoch=epoch,
                                                              i=i,
                                                              state=state)
            # Drop references to this step's tensors before the next batch.
            del loss, out, float_out

        # NOTE(review): the accumulated value is a sum of per-batch means but
        # is divided by the number of samples, not batches -- confirm.
        state.avg_loss /= len(train_dataset)

        epoch_time = time.time() - start_epoch_time
        print('Training Summary Epoch: [{0}]\t'
              'Time taken (s): {epoch_time:.0f}\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1,
                                                 epoch_time=epoch_time,
                                                 loss=state.avg_loss))

        with torch.no_grad():
            wer, cer, output_data = run_evaluation(
                test_loader=test_loader,
                device=device,
                model=model,
                decoder=evaluation_decoder,
                target_decoder=evaluation_decoder)

        state.add_results(epoch=epoch,
                          loss_result=state.avg_loss,
                          wer_result=wer,
                          cer_result=cer)

        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if main_proc and cfg.visualization.visdom:
            visdom_logger.update(epoch, state.result_state)
        if main_proc and cfg.visualization.tensorboard:
            tensorboard_logger.update(epoch, state.result_state,
                                      model.named_parameters())
        if main_proc and cfg.visualization.wandb:
            wandb.log({
                'epoch': epoch,
                'Average Loss': state.avg_loss,
                'Average WER': wer,
                'Average CER': cer
            })

        if main_proc and cfg.checkpointing.checkpoint:  # Save epoch checkpoint
            checkpoint_handler.save_checkpoint_model(epoch=epoch, state=state)
        # anneal lr
        for g in optimizer.param_groups:
            g['lr'] = g['lr'] / cfg.optim.learning_anneal
        # `g` is the last param group left over from the loop above.
        print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))
        # NOTE(review): unlike the wandb.log call above, this one is not
        # guarded by main_proc / cfg.visualization.wandb -- confirm.
        wandb.log({"lr": g['lr']})

        # Keep a separate copy of the best model by validation WER.
        if main_proc and (state.best_wer is None or state.best_wer > wer):
            checkpoint_handler.save_best_model(epoch=epoch, state=state)
            state.set_best_wer(wer)
            state.reset_avg_loss()
        state.reset_training_step()  # Reset training step for next epoch
def train(cfg):
    """Run the full training loop described by ``cfg``.

    Steps, in order: seed the RNGs, optionally join a distributed process
    group (when ``LOCAL_RANK`` is set), pick a checkpoint handler, resume
    from or create the model/training state, build datasets, samplers and
    loaders, build the optimizer (SGD or AdamW) and wrap it with apex amp,
    then for each epoch train over all minibatches, evaluate, log (stdout,
    ``/root/epoch.log`` and ``sendReport``), checkpoint and anneal the
    learning rate.

    Args:
        cfg: Training config. Reads the ``training``, ``data``, ``optim``,
            ``model``, ``checkpointing``, ``visualization`` and ``apex``
            sections.

    Raises:
        ValueError: If the checkpointing, model or optimizer config is not
            one of the recognised config types.
    """
    # Set seeds for determinism
    torch.manual_seed(cfg.training.seed)
    torch.cuda.manual_seed_all(cfg.training.seed)
    np.random.seed(cfg.training.seed)
    random.seed(cfg.training.seed)

    main_proc = True
    device = torch.device("cpu" if cfg.training.no_cuda else "cuda")

    is_distributed = os.environ.get(
        "LOCAL_RANK")  # If local rank exists, distributed env

    if is_distributed:
        # when using NCCL, on failures, surviving nodes will deadlock on NCCL ops
        # because NCCL uses a spin-lock on the device. Set this env var and
        # to enable a watchdog thread that will destroy stale NCCL communicators
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

        device_id = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(device_id)
        print(f"Setting CUDA Device to {device_id}")

        dist.init_process_group(backend=cfg.training.dist_backend.value)
        main_proc = device_id == 0  # Main process handles saving of models and reporting

    # Select the checkpoint backend from the config's dataclass type.
    if OmegaConf.get_type(cfg.checkpointing) == FileCheckpointConfig:
        checkpoint_handler = FileCheckpointHandler(cfg=cfg.checkpointing)
    elif OmegaConf.get_type(cfg.checkpointing) == GCSCheckpointConfig:
        checkpoint_handler = GCSCheckpointHandler(cfg=cfg.checkpointing)
    else:
        raise ValueError("Checkpoint Config has not been specified correctly.")

    # Loggers are only created on the main process.
    if main_proc and cfg.visualization.visdom:
        visdom_logger = VisdomLogger(id=cfg.visualization.id,
                                     num_epochs=cfg.training.epochs)
    if main_proc and cfg.visualization.tensorboard:
        tensorboard_logger = TensorBoardLogger(
            id=cfg.visualization.id,
            log_dir=to_absolute_path(cfg.visualization.log_dir),
            log_params=cfg.visualization.log_params)

    # An automatically discovered checkpoint overrides continue_from.
    if cfg.checkpointing.load_auto_checkpoint:
        latest_checkpoint = checkpoint_handler.find_latest_checkpoint()
        if latest_checkpoint:
            cfg.checkpointing.continue_from = latest_checkpoint

    if cfg.checkpointing.continue_from:  # Starting from previous model
        state = TrainingState.load_state(
            state_path=to_absolute_path(cfg.checkpointing.continue_from))
        model = state.model
        if cfg.training.finetune:
            state.init_finetune_states(cfg.training.epochs)

        if main_proc and cfg.visualization.visdom:  # Add previous scores to visdom graph
            visdom_logger.load_previous_values(state.epoch, state.results)
        if main_proc and cfg.visualization.tensorboard:  # Previous scores to tensorboard logs
            tensorboard_logger.load_previous_values(state.epoch, state.results)
    else:  # Initialise new model training
        with open(to_absolute_path(cfg.data.labels_path)) as label_file:
            labels = json.load(label_file)

        # Original author's note: the model configuration lives in
        # train_config.py (around line 51), e.g.
        #   @dataclass
        #   class BiDirectionalConfig:
        #       rnn_type: RNNType = RNNType.lstm  # Type of RNN to use in model
        #       hidden_size: int = 1024  # Hidden size of RNN Layer
        #       hidden_layers: int = 7  # Number of RNN layers
        if OmegaConf.get_type(cfg.model) is BiDirectionalConfig:
            model = DeepSpeech(
                rnn_hidden_size=cfg.model.hidden_size,
                nb_layers=cfg.model.hidden_layers,
                labels=labels,
                rnn_type=supported_rnns[cfg.model.rnn_type.value],
                audio_conf=cfg.data.spect,
                bidirectional=True)
        elif OmegaConf.get_type(cfg.model) is UniDirectionalConfig:
            model = DeepSpeech(
                rnn_hidden_size=cfg.model.hidden_size,
                nb_layers=cfg.model.hidden_layers,
                labels=labels,
                rnn_type=supported_rnns[cfg.model.rnn_type.value],
                audio_conf=cfg.data.spect,
                bidirectional=False,
                context=cfg.model.lookahead_context)
        else:
            raise ValueError("Model Config has not been specified correctly.")

        state = TrainingState(model=model)
        state.init_results_tracking(epochs=cfg.training.epochs)

    # Data setup
    evaluation_decoder = GreedyDecoder(
        model.labels)  # Decoder used for validation
    # Original author's note: the dataset holds the spectrogram config,
    # ids=[[row 1], [row 2], ...] and a labels dict.
    train_dataset = SpectrogramDataset(
        audio_conf=model.audio_conf,
        manifest_filepath=to_absolute_path(cfg.data.train_manifest),
        labels=model.labels,
        normalize=True,
        augmentation_conf=cfg.data.augmentation)
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=to_absolute_path(
                                          cfg.data.val_manifest),
                                      labels=model.labels,
                                      normalize=True)
    if not is_distributed:
        # Original author's note: DSRandomSampler picks minibatches in random
        # order and shuffles the contents of each minibatch.
        train_sampler = DSRandomSampler(dataset=train_dataset,
                                        batch_size=cfg.data.batch_size,
                                        start_index=state.training_step)
    else:
        train_sampler = DSElasticDistributedSampler(
            dataset=train_dataset,
            batch_size=cfg.data.batch_size,
            start_index=state.training_step)
    # Original author's note: AudioDataLoader's collate_fn processes each
    # selected minibatch; the loader yielded 835 minibatches for the FPT/VIVOS
    # data, each made of batch_size samples.
    train_loader = AudioDataLoader(dataset=train_dataset,
                                   num_workers=cfg.data.num_workers,
                                   batch_sampler=train_sampler)
    test_loader = AudioDataLoader(dataset=test_dataset,
                                  num_workers=cfg.data.num_workers,
                                  batch_size=cfg.data.batch_size)

    model = model.to(device)
    parameters = model.parameters()
    # Original author's note: the default optimizer config is near line 8 of
    # train_config.
    if OmegaConf.get_type(cfg.optim) is SGDConfig:
        optimizer = torch.optim.SGD(parameters,
                                    lr=cfg.optim.learning_rate,
                                    momentum=cfg.optim.momentum,
                                    nesterov=True,
                                    weight_decay=cfg.optim.weight_decay)
    elif OmegaConf.get_type(cfg.optim) is AdamConfig:
        optimizer = torch.optim.AdamW(parameters,
                                      lr=cfg.optim.learning_rate,
                                      betas=cfg.optim.betas,
                                      eps=cfg.optim.eps,
                                      weight_decay=cfg.optim.weight_decay)
    else:
        raise ValueError("Optimizer has not been specified correctly.")

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=not cfg.training.no_cuda,
                                      opt_level=cfg.apex.opt_level,
                                      loss_scale=cfg.apex.loss_scale)
    # Restore optimizer/amp state when resuming.
    if state.optim_state is not None:
        optimizer.load_state_dict(state.optim_state)
    if state.amp_state is not None:
        amp.load_state_dict(state.amp_state)

    # Track states for optimizer/amp
    state.track_optim_state(optimizer)
    if not cfg.training.no_cuda:
        state.track_amp_state(amp)

    if is_distributed:
        model = DistributedDataParallel(model, device_ids=[device_id])
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    criterion = CTCLoss()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    # One epoch is one full pass over the training manifest.
    for epoch in range(state.epoch, cfg.training.epochs):
        model.train()
        end = time.time()
        start_epoch_time = time.time()
        state.set_epoch(epoch=epoch)
        train_sampler.set_epoch(epoch=epoch)
        train_sampler.reset_training_step(training_step=state.training_step)
        # Iterate over the minibatches.
        for i, (data) in enumerate(train_loader, start=state.training_step):
            state.set_training_step(training_step=i)
            # Original author's note: inputs[x][0] is the x-th spectrogram of
            # the batch; input_percentages is each utterance's length divided
            # by the longest length in the minibatch; targets holds the label
            # codes.
            inputs, targets, input_percentages, target_sizes = data
            # Ratio times the padded length gives each utterance's real
            # length; mul_ works in place on input_percentages.
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = inputs.to(device)

            # Forward pass with the batch and the real per-utterance lengths.
            # Original author's note: out is 3-D, output_sizes is 1-D and
            # holds the output length for each utterance.
            out, output_sizes = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH (dims 0 and 1 swapped)

            float_out = out.float()  # ensure float32 for loss
            loss = criterion(float_out, targets, output_sizes,
                             target_sizes).to(device)
            # average the loss by minibatch: total loss divided by the number
            # of spectrograms in the batch
            loss = loss / inputs.size(0)
            loss_value = loss.item()

            # Check to ensure valid loss was calculated
            valid_loss, error = check_loss(loss, loss_value)
            if valid_loss:
                optimizer.zero_grad()

                # compute gradient, then let the optimizer update the weights
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               cfg.optim.max_norm)
                optimizer.step()
            else:
                print(error)
                print('Skipping grad update')
                loss_value = 0

            state.avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1),
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses))

            if main_proc and cfg.checkpointing.checkpoint_per_iteration:
                checkpoint_handler.save_iter_checkpoint_model(epoch=epoch,
                                                              i=i,
                                                              state=state)
            # Drop references to this step's tensors before the next batch.
            del loss, out, float_out

        # NOTE(review): the accumulated value is a sum of per-batch means but
        # is divided by the number of samples, not batches -- confirm.
        state.avg_loss /= len(train_dataset)

        epoch_time = time.time() - start_epoch_time
        print('Training Summary Epoch: [{0}]\t'
              'Time taken (s): {epoch_time:.0f}\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1,
                                                 epoch_time=epoch_time,
                                                 loss=state.avg_loss))
        mylogg2er.info('Training Summary Epoch: [{0}]\t'
                       'Time taken (s): {epoch_time:.0f}\t'
                       'Average Loss {loss:.3f}\n'.format(
                           epoch + 1,
                           epoch_time=epoch_time,
                           loss=state.avg_loss))
        # `with` guarantees the log file is closed even if formatting or the
        # write itself raises.
        # NOTE(review): every process appends to this file, not only
        # main_proc -- confirm this is intended for distributed runs.
        with open('/root/epoch.log', 'a') as file_object:
            file_object.write('Training Summary Epoch: [{0}]\t'
                              'Time taken (s): {epoch_time:.0f}\t'
                              'Average Loss {loss:.3f}\n'.format(
                                  epoch + 1,
                                  epoch_time=epoch_time,
                                  loss=state.avg_loss))

        with torch.no_grad():
            wer, cer, output_data, wer2, cer2 = run_evaluation(
                test_loader=test_loader,
                device=device,
                model=model,
                decoder=evaluation_decoder,
                target_decoder=evaluation_decoder)

        state.add_results(epoch=epoch,
                          loss_result=state.avg_loss,
                          wer_result=wer,
                          cer_result=cer)

        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))
        with open('/root/epoch.log', 'a') as file_object:
            file_object.write('Validation Summary Epoch: [{0}]\t'
                              'Average WER {wer:.3f}\t'
                              'Average CER {cer:.3f}\t'.format(epoch + 1,
                                                               wer=wer,
                                                               cer=cer))
            file_object.write('Validation Summary Epoch: [{0}]\t'
                              'Average WER2 {wer:.3f}\t'
                              'Average CER2 {cer:.3f}\n'.format(epoch + 1,
                                                                wer=wer2,
                                                                cer=cer2))

        if main_proc and cfg.visualization.visdom:
            visdom_logger.update(epoch, state.result_state)
        if main_proc and cfg.visualization.tensorboard:
            tensorboard_logger.update(epoch, state.result_state,
                                      model.named_parameters())

        if main_proc and cfg.checkpointing.checkpoint:  # Save epoch checkpoint
            checkpoint_handler.save_checkpoint_model(epoch=epoch, state=state)
        # anneal lr
        for g in optimizer.param_groups:
            g['lr'] = g['lr'] / cfg.optim.learning_anneal
        # `g` is the last param group left over from the loop above.
        print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))
        with open('/root/epoch.log', 'a') as file_object:
            file_object.write(
                'Learning rate annealed to: {lr:.6f}\n'.format(lr=g['lr']))

        # Best-effort external report: a failure here must not stop training.
        try:
            # NOTE(review): the original had a trailing comma after this
            # string, which makes `note` a 1-tuple rather than a str. That is
            # kept as is because sendReport is not visible here -- confirm
            # which type it expects.
            note = ("Đổi tham số train_config: type: rnn.gru epochs: int = 50, batch_size: int = 30, hidden_size: int = 1600, hidden_layers: int = 7, file train_manifest: vinfptunk_train.csv, vinfptunk_dev.csv",)
            sendReport(epoch + 1, '{:.3f}'.format(epoch_time),
                       '{:.3f}'.format(state.avg_loss), '{:.3f}'.format(wer),
                       '{:.3f}'.format(cer), "{:.6f}".format(g['lr']), note)
        except Exception as esss:
            print('Error :', esss)

        # Keep a separate copy of the best model by validation WER.
        if main_proc and (state.best_wer is None or state.best_wer > wer):
            checkpoint_handler.save_best_model(epoch=epoch, state=state)
            state.set_best_wer(wer)
            state.reset_avg_loss()
        state.reset_training_step()  # Reset training step for next epoch
def __init__(self,
             labels: List,
             model_cfg: Union[UniDirectionalConfig, BiDirectionalConfig],
             precision: int,
             optim_cfg: Union[AdamConfig, SGDConfig],
             spect_cfg: SpectConfig):
    """Build the conv/RNN stack, output head, and LWLRAP/RMSE objectives.

    Args:
        labels: Output labels; ``len(labels)`` is the size of the final
            linear layer.
        model_cfg: Architecture config (``hidden_size``, ``hidden_layers``,
            ``rnn_type`` and, when unidirectional, ``lookahead_context``).
        precision: Stored as ``self.precision`` and passed to ``LWLRAP``.
        optim_cfg: Stored as ``self.optim_cfg``.
        spect_cfg: Spectrogram config; ``sample_rate`` and ``window_size``
            determine the RNN input size.
    """
    super().__init__()
    # Must run before any other local is bound so it sees the init arguments.
    self.save_hyperparameters()

    self.model_cfg = model_cfg
    self.precision = precision
    self.optim_cfg = optim_cfg
    self.spect_cfg = spect_cfg
    self.bidirectional = OmegaConf.get_type(model_cfg) is BiDirectionalConfig
    print('OMEGA CONF \n', model_cfg)

    self.labels = labels
    num_classes = len(self.labels)

    front_end = nn.Sequential(
        nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
        nn.BatchNorm2d(32),
        nn.Hardtanh(0, 20, inplace=True),
        nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
        nn.BatchNorm2d(32),
        nn.Hardtanh(0, 20, inplace=True),
    )
    self.conv = MaskConv(front_end)

    # Feature size after the two convolutions, using the conv output
    # formula (W - F + 2P) / S + 1 along the frequency axis, then
    # multiplied by the 32 output channels.
    freq_bins = int(
        math.floor(
            (self.spect_cfg.sample_rate * self.spect_cfg.window_size) / 2)
        + 1)
    for padding, kernel in ((20, 41), (10, 21)):
        freq_bins = int(math.floor(freq_bins + 2 * padding - kernel) / 2 + 1)
    rnn_input_size = freq_bins * 32

    hidden = self.model_cfg.hidden_size
    # The first layer consumes the conv features and skips batch norm; the
    # remaining layers are hidden-to-hidden.
    recurrent_layers = [
        BatchRNN(input_size=rnn_input_size,
                 hidden_size=self.model_cfg.hidden_size,
                 rnn_type=self.model_cfg.rnn_type.value,
                 bidirectional=self.bidirectional,
                 batch_norm=False)
    ]
    for _ in range(self.model_cfg.hidden_layers - 1):
        recurrent_layers.append(
            BatchRNN(input_size=self.model_cfg.hidden_size,
                     hidden_size=self.model_cfg.hidden_size,
                     rnn_type=self.model_cfg.rnn_type.value,
                     bidirectional=self.bidirectional))
    self.rnns = nn.Sequential(*recurrent_layers)

    if self.bidirectional:
        self.lookahead = None
    else:
        # consider adding batch norm?
        self.lookahead = nn.Sequential(
            Lookahead(self.model_cfg.hidden_size,
                      context=self.model_cfg.lookahead_context),
            nn.Hardtanh(0, 20, inplace=True))

    fully_connected = nn.Sequential(
        nn.BatchNorm1d(hidden),
        nn.Linear(self.model_cfg.hidden_size, num_classes, bias=False))
    self.fc = nn.Sequential(SequenceWise(fully_connected))
    self.inference_softmax = InferenceBatchSoftmax()

    # LWLRAP is used here as the criterion (a CTC loss and WER/CER metrics
    # were previously wired in at this point).
    self.criterion = LWLRAP(precision=self.precision)
    self.evaluation_decoder = GreedyDecoder(self.labels)  # Decoder used for validation
    self.lwlrap = LWLRAP(precision=self.precision)
    self.loss = RMSELoss()
    self.softmax = nn.Softmax(dim=-1)
    self.sigmoid = nn.Sigmoid()
class DeepSpeech(pl.LightningModule):
    """Acoustic model made of masked 2-D convolutions, stacked RNNs and a linear classifier.

    Training uses CTC loss; validation logs word and character error rates
    computed with a greedy decoder.
    """

    def __init__(self,
                 labels: List,
                 model_cfg: Union[UniDirectionalConfig, BiDirectionalConfig],
                 precision: int,
                 optim_cfg: Union[AdamConfig, SGDConfig],
                 spect_cfg: SpectConfig):
        """Build all layers, the CTC criterion and the validation metrics.

        Args:
            labels: Output label set. It must contain ``'_'``, whose index is
                used as the CTC blank.
            model_cfg: Model configuration (``hidden_size``, ``hidden_layers``,
                ``rnn_type``, and ``lookahead_context`` for the
                non-bidirectional case).
            precision: Numeric precision; ``16`` enables autocast in
                ``validation_step``.
            optim_cfg: Optimizer configuration used by ``configure_optimizers``.
            spect_cfg: Spectrogram configuration; ``sample_rate`` and
                ``window_size`` determine the RNN input width.
        """
        super().__init__()
        self.save_hyperparameters()
        self.model_cfg = model_cfg
        self.precision = precision
        self.optim_cfg = optim_cfg
        self.spect_cfg = spect_cfg
        self.bidirectional = OmegaConf.get_type(model_cfg) is BiDirectionalConfig

        self.labels = labels
        num_classes = len(self.labels)

        self.conv = MaskConv(
            nn.Sequential(
                nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
                nn.BatchNorm2d(32),
                nn.Hardtanh(0, 20, inplace=True),
                nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
                nn.BatchNorm2d(32),
                nn.Hardtanh(0, 20, inplace=True),
            ))

        # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
        rnn_input_size = int(
            math.floor((self.spect_cfg.sample_rate * self.spect_cfg.window_size) / 2) + 1)
        rnn_input_size = int(math.floor(rnn_input_size + 2 * 20 - 41) / 2 + 1)
        rnn_input_size = int(math.floor(rnn_input_size + 2 * 10 - 21) / 2 + 1)
        rnn_input_size *= 32  # fold the 32 conv channels into the feature axis

        self.rnns = nn.Sequential(
            # First layer consumes the conv features and skips batch norm.
            BatchRNN(input_size=rnn_input_size,
                     hidden_size=self.model_cfg.hidden_size,
                     rnn_type=self.model_cfg.rnn_type.value,
                     bidirectional=self.bidirectional,
                     batch_norm=False),
            *(BatchRNN(input_size=self.model_cfg.hidden_size,
                       hidden_size=self.model_cfg.hidden_size,
                       rnn_type=self.model_cfg.rnn_type.value,
                       bidirectional=self.bidirectional)
              for _ in range(self.model_cfg.hidden_layers - 1)))

        self.lookahead = nn.Sequential(
            # consider adding batch norm?
            Lookahead(self.model_cfg.hidden_size,
                      context=self.model_cfg.lookahead_context),
            nn.Hardtanh(0, 20, inplace=True)) if not self.bidirectional else None

        fully_connected = nn.Sequential(
            nn.BatchNorm1d(self.model_cfg.hidden_size),
            nn.Linear(self.model_cfg.hidden_size, num_classes, bias=False))
        self.fc = nn.Sequential(SequenceWise(fully_connected), )
        self.inference_softmax = InferenceBatchSoftmax()
        self.criterion = CTCLoss(blank=self.labels.index('_'),
                                 reduction='sum',
                                 zero_infinity=True)
        self.evaluation_decoder = GreedyDecoder(self.labels)  # Decoder used for validation
        self.wer = WordErrorRate(decoder=self.evaluation_decoder,
                                 target_decoder=self.evaluation_decoder)
        self.cer = CharErrorRate(decoder=self.evaluation_decoder,
                                 target_decoder=self.evaluation_decoder)

    def forward(self, x, lengths):
        """Run the network on a batch of spectrograms.

        Args:
            x: Input batch for the convolution stack; the conv output is
                treated as a 4-D tensor (assumed N x 1 x freq x time on
                input -- TODO confirm against the data loader).
            lengths: Per-sample input lengths along the time axis.

        Returns:
            Tuple ``(output, output_lengths)``: the classifier output with the
            batch axis first, and the per-sample lengths after the
            convolutional striding.
        """
        lengths = lengths.cpu().int()
        output_lengths = self.get_seq_lens(lengths)
        x, _ = self.conv(x, output_lengths)

        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # Collapse feature dimension
        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # TxNxH

        for rnn in self.rnns:
            x = rnn(x, output_lengths)

        if not self.bidirectional:  # no need for lookahead layer in bidirectional
            x = self.lookahead(x)

        x = self.fc(x)
        x = x.transpose(0, 1)
        # identity in training mode, softmax in eval mode
        x = self.inference_softmax(x)
        return x, output_lengths

    def training_step(self, batch, batch_idx):
        """Compute the CTC loss for one training batch.

        Args:
            batch: Tuple ``(inputs, targets, input_percentages, target_sizes)``.
            batch_idx: Index of the batch (unused).

        Returns:
            The loss produced by ``self.criterion``.
        """
        inputs, targets, input_percentages, target_sizes = batch
        # In-place: turns the fractional lengths into absolute frame counts.
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        out, output_sizes = self(inputs, input_sizes)
        out = out.transpose(0, 1)  # TxNxH
        out = out.log_softmax(-1)

        loss = self.criterion(out, targets, output_sizes, target_sizes)
        return loss

    def validation_step(self, batch, batch_idx):
        """Update and log WER/CER for one validation batch.

        The metrics receive the raw network output; the separate greedy
        decode whose result was never used has been dropped.

        Args:
            batch: Tuple ``(inputs, targets, input_percentages, target_sizes)``.
            batch_idx: Index of the batch (unused).
        """
        inputs, targets, input_percentages, target_sizes = batch
        # In-place: turns the fractional lengths into absolute frame counts.
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        inputs = inputs.to(self.device)
        with autocast(enabled=self.precision == 16):
            out, output_sizes = self(inputs, input_sizes)
        self.wer(preds=out,
                 preds_sizes=output_sizes,
                 targets=targets,
                 target_sizes=target_sizes)
        self.cer(preds=out,
                 preds_sizes=output_sizes,
                 targets=targets,
                 target_sizes=target_sizes)
        self.log('wer', self.wer.compute(), prog_bar=True, on_epoch=True)
        self.log('cer', self.cer.compute(), prog_bar=True, on_epoch=True)

    def configure_optimizers(self):
        """Create the optimizer selected by ``optim_cfg`` and an exponential LR scheduler.

        Returns:
            ``([optimizer], [scheduler])``.

        Raises:
            ValueError: If ``optim_cfg`` is neither ``SGDConfig`` nor ``AdamConfig``.
        """
        optim_type = OmegaConf.get_type(self.optim_cfg)
        if optim_type is SGDConfig:
            optimizer = torch.optim.SGD(params=self.parameters(),
                                        lr=self.optim_cfg.learning_rate,
                                        momentum=self.optim_cfg.momentum,
                                        nesterov=True,
                                        weight_decay=self.optim_cfg.weight_decay)
        elif optim_type is AdamConfig:
            optimizer = torch.optim.AdamW(params=self.parameters(),
                                          lr=self.optim_cfg.learning_rate,
                                          betas=self.optim_cfg.betas,
                                          eps=self.optim_cfg.eps,
                                          weight_decay=self.optim_cfg.weight_decay)
        else:
            raise ValueError("Optimizer has not been specified correctly.")

        scheduler = torch.optim.lr_scheduler.ExponentialLR(
            optimizer=optimizer, gamma=self.optim_cfg.learning_anneal)
        return [optimizer], [scheduler]

    def get_seq_lens(self, input_length):
        """
        Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable
        containing the size sequences that will be output by the network.
        :param input_length: 1D Tensor
        :return: 1D Tensor scaled by model
        """
        seq_len = input_length
        for m in self.conv.modules():
            # isinstance also covers Conv2d subclasses, unlike an exact type comparison.
            if isinstance(m, nn.Conv2d):
                # Standard conv output-length formula applied to the second spatial axis.
                seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] *
                            (m.kernel_size[1] - 1) - 1) // m.stride[1] + 1)
        return seq_len.int()
def run_quantsim_evaluation(args):
    """Evaluate a DeepSpeech checkpoint under simulated quantization and print its WER.

    ``DeepSpeech.forward`` is patched so that ``lengths`` becomes optional,
    the model is wrapped in a ``QuantizationSimModel``, encodings are computed
    with the evaluation routine, and a full evaluation pass is run on the
    simulated model.

    Args:
        args: Options object; ``model_path``, ``test_manifest``, ``batch_size``,
            ``num_workers``, ``default_param_bw``, ``default_output_bw``,
            ``quantsim_config_file`` and ``encodings_iterations`` are read.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    import deepspeech_pytorch.model

    def wrapped_forward_function(self, x, lengths=None):
        # Without explicit lengths, use the first-dimension size of every item in x.
        if lengths is not None:
            return self.infer(x, lengths)
        inferred = torch.IntTensor([_x.shape[0] for _x in x])
        return self.infer(x, inferred)

    # Keep the original forward reachable as `infer`, then install the wrapper.
    ds_class = deepspeech_pytorch.model.DeepSpeech
    ds_class.infer = ds_class.forward
    ds_class.forward = wrapped_forward_function

    model = load_model(device=device, model_path=args.model_path, use_half=False)
    decoder = load_decoder(labels=model.labels, cfg=LMConfig)
    blank = model.labels.index('_')
    target_decoder = GreedyDecoder(model.labels, blank_index=blank)

    def eval_func(model, iterations=None, device=device):
        dataset = SpectrogramDataset(
            audio_conf=model.audio_conf,
            manifest_filepath=args.test_manifest,
            labels=model.labels,
            normalize=True,
        )
        # When an iteration budget is given, it overrides the dataset size.
        if iterations is not None:
            dataset.size = iterations

        loader = AudioDataLoader(
            dataset,
            batch_size=args.batch_size,
            num_workers=args.num_workers,
        )
        word_err, char_err, outputs = run_evaluation(
            test_loader=loader,
            device=device,
            model=model,
            decoder=decoder,
            target_decoder=target_decoder,
            save_output=False,
            verbose=True,
            use_half=False,
        )
        return word_err, char_err, outputs

    scheme = QuantScheme.post_training_tf_enhanced
    sim = QuantizationSimModel(
        model.cpu(),
        input_shapes=(1, 1, 161, 500),
        quant_scheme=scheme,
        default_param_bw=args.default_param_bw,
        default_output_bw=args.default_output_bw,
        config_file=args.quantsim_config_file,
    )
    manually_configure_quant_ops(sim)
    sim.model.to(device)

    sim.compute_encodings(eval_func, forward_pass_callback_args=args.encodings_iterations)

    final_wer, _final_cer, _final_outputs = eval_func(sim.model, None)
    print(f'Average WER {final_wer:.4f}')
def train(cfg):
    """Train a DeepSpeech model according to ``cfg`` and validate after every epoch.

    The function seeds the RNGs, joins a distributed process group when the
    ``LOCAL_RANK`` environment variable is set, builds a new model or resumes
    one from a checkpoint, and then runs the epoch loop: CTC training,
    evaluation, logging, checkpointing and learning-rate annealing.  The
    per-epoch training loss and validation CER are also appended to a
    hard-coded text file.

    Args:
        cfg: Configuration object read through attribute access.  The sections
            used here are ``training``, ``checkpointing``, ``visualization``,
            ``data``, ``augmentation``, ``model``, ``optimizer`` and ``apex``.
    """
    # Text file that collects the per-epoch results; truncated at the start of a run.
    # NOTE(review): every process writes this file, not only the main one -- confirm
    # that this is intended for distributed runs.
    result_path = "/home/jhjeong/jiho_deep/deepspeech.pytorch/jiho_result/result.txt"
    # Explicit UTF-8 so the non-ASCII text does not depend on the locale encoding.
    with open(result_path, "w", encoding="utf-8") as ff:
        ff.write("학습 시작! \n")

    # Set seeds for determinism
    torch.manual_seed(cfg.training.seed)
    torch.cuda.manual_seed_all(cfg.training.seed)
    np.random.seed(cfg.training.seed)
    random.seed(cfg.training.seed)

    main_proc = True
    device = torch.device("cpu" if cfg.training.no_cuda else "cuda")

    is_distributed = os.environ.get("LOCAL_RANK")  # If local rank exists, distributed env

    if is_distributed:
        # when using NCCL, on failures, surviving nodes will deadlock on NCCL ops
        # because NCCL uses a spin-lock on the device. Set this env var and
        # to enable a watchdog thread that will destroy stale NCCL communicators
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

        device_id = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(device_id)
        print(f"Setting CUDA Device to {device_id}")

        dist.init_process_group(backend=cfg.training.dist_backend)
        main_proc = device_id == 0  # Main process handles saving of models and reporting

    checkpoint_handler = CheckpointHandler(
        save_folder=to_absolute_path(cfg.checkpointing.save_folder),
        best_val_model_name=cfg.checkpointing.best_val_model_name,
        checkpoint_per_iteration=cfg.checkpointing.checkpoint_per_iteration,
        save_n_recent_models=cfg.checkpointing.save_n_recent_models)

    # Choose whether visdom and/or tensorboard logging is used.
    if main_proc and cfg.visualization.visdom:
        visdom_logger = VisdomLogger(id=cfg.visualization.id,
                                     num_epochs=cfg.training.epochs)
    if main_proc and cfg.visualization.tensorboard:
        tensorboard_logger = TensorBoardLogger(
            id=cfg.visualization.id,
            log_dir=to_absolute_path(cfg.visualization.log_dir),
            log_params=cfg.visualization.log_params)

    if cfg.checkpointing.load_auto_checkpoint:
        latest_checkpoint = checkpoint_handler.find_latest_checkpoint()
        if latest_checkpoint:
            cfg.checkpointing.continue_from = latest_checkpoint

    # Model / training-state setup starts here.
    if cfg.checkpointing.continue_from:  # Starting from previous model
        state = TrainingState.load_state(
            state_path=to_absolute_path(cfg.checkpointing.continue_from))
        model = state.model
        if cfg.training.finetune:
            state.init_finetune_states(cfg.training.epochs)

        if main_proc and cfg.visualization.visdom:  # Add previous scores to visdom graph
            visdom_logger.load_previous_values(state.epoch, state.results)
        if main_proc and cfg.visualization.tensorboard:  # Previous scores to tensorboard logs
            tensorboard_logger.load_previous_values(state.epoch, state.results)
    else:
        # Initialise new model training
        with open(to_absolute_path(cfg.data.labels_path)) as label_file:
            labels = json.load(label_file)  # label(a,b,c ...)

        audio_conf = dict(sample_rate=cfg.data.sample_rate,
                          window_size=cfg.data.window_size,
                          window_stride=cfg.data.window_stride,
                          window=cfg.data.window)
        if cfg.augmentation.noise_dir:
            # dict has no `+=`; merge the noise settings with update().
            audio_conf.update(
                noise_dir=to_absolute_path(cfg.augmentation.noise_dir),
                noise_prob=cfg.augmentation.noise_prob,
                noise_levels=(cfg.augmentation.noise_min,
                              cfg.augmentation.noise_max))

        rnn_type = cfg.model.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

        # Create the DeepSpeech model.
        model = DeepSpeech(rnn_hidden_size=cfg.model.hidden_size,
                           nb_layers=cfg.model.hidden_layers,
                           labels=labels,
                           rnn_type=supported_rnns[rnn_type],
                           audio_conf=audio_conf,
                           bidirectional=cfg.model.bidirectional)

        state = TrainingState(model=model)
        state.init_results_tracking(epochs=cfg.training.epochs)

    # Data setup
    evaluation_decoder = GreedyDecoder(model.labels)  # Decoder used for validation

    # Datasets built from the train / validation manifests.
    train_dataset = SpectrogramDataset(
        audio_conf=model.audio_conf,
        manifest_filepath=to_absolute_path(cfg.data.train_manifest),
        labels=model.labels,
        normalize=True,
        speed_volume_perturb=cfg.augmentation.speed_volume_perturb,
        spec_augment=cfg.augmentation.spec_augment)
    test_dataset = SpectrogramDataset(
        audio_conf=model.audio_conf,
        manifest_filepath=to_absolute_path(cfg.data.val_manifest),
        labels=model.labels,
        normalize=True,
        speed_volume_perturb=False,
        spec_augment=False)

    if not is_distributed:
        train_sampler = DSRandomSampler(dataset=train_dataset,
                                        batch_size=cfg.data.batch_size,
                                        start_index=state.training_step)
    else:
        train_sampler = DSElasticDistributedSampler(
            dataset=train_dataset,
            batch_size=cfg.data.batch_size,
            start_index=state.training_step)

    # Data loaders.
    train_loader = AudioDataLoader(dataset=train_dataset,
                                   num_workers=cfg.data.num_workers,
                                   batch_sampler=train_sampler)
    test_loader = AudioDataLoader(dataset=test_dataset,
                                  num_workers=cfg.data.num_workers,
                                  batch_size=cfg.data.batch_size)

    model = model.to(device)
    parameters = model.parameters()
    if cfg.optimizer.adam:
        optimizer = torch.optim.AdamW(parameters,
                                      lr=cfg.optimizer.learning_rate,
                                      betas=cfg.optimizer.betas,
                                      eps=cfg.optimizer.eps,
                                      weight_decay=cfg.optimizer.weight_decay)
    else:
        optimizer = torch.optim.SGD(parameters,
                                    lr=cfg.optimizer.learning_rate,
                                    momentum=cfg.optimizer.momentum,
                                    nesterov=True,
                                    weight_decay=cfg.optimizer.weight_decay)

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=cfg.apex.opt_level,
                                      loss_scale=cfg.apex.loss_scale)
    if state.optim_state is not None:
        optimizer.load_state_dict(state.optim_state)
        amp.load_state_dict(state.amp_state)

    # Track states for optimizer/amp
    state.track_optim_state(optimizer)
    state.track_amp_state(amp)

    if is_distributed:
        model = DistributedDataParallel(model, device_ids=[device_id])
    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    criterion = CTCLoss()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(state.epoch, cfg.training.epochs):
        model.train()
        end = time.time()
        start_epoch_time = time.time()
        state.set_epoch(epoch=epoch)
        train_sampler.set_epoch(epoch=epoch)
        train_sampler.reset_training_step(training_step=state.training_step)

        # Iterate over the training batches, resuming from the stored step.
        for i, data in enumerate(train_loader, start=state.training_step):
            state.set_training_step(training_step=i)
            inputs, targets, input_percentages, target_sizes = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = inputs.to(device)

            out, output_sizes = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH

            float_out = out.float()  # ensure float32 for loss
            loss = criterion(float_out, targets, output_sizes, target_sizes).to(device)
            loss = loss / inputs.size(0)  # average the loss by minibatch
            loss_value = loss.item()

            # Check to ensure valid loss was calculated
            valid_loss, error = check_loss(loss, loss_value)
            if valid_loss:
                optimizer.zero_grad()

                # compute gradient
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               cfg.optimizer.max_norm)
                optimizer.step()
            else:
                print(error)
                print('Skipping grad update')
                loss_value = 0

            state.avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1), len(train_loader),
                      batch_time=batch_time, data_time=data_time, loss=losses))

            if main_proc and cfg.checkpointing.checkpoint_per_iteration:
                checkpoint_handler.save_iter_checkpoint_model(epoch=epoch, i=i, state=state)
            # Drop references to this batch's tensors before the next iteration.
            del loss, out, float_out

        state.avg_loss /= len(train_dataset)

        epoch_time = time.time() - start_epoch_time
        print('Training Summary Epoch: [{0}]\t'
              'Time taken (s): {epoch_time:.0f}\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1,
                                                 epoch_time=epoch_time,
                                                 loss=state.avg_loss))

        with open(result_path, "a", encoding="utf-8") as ff:
            ff.write("\ntrain -> epoch : " + str(epoch + 1) +
                     " loss : " + str(state.avg_loss) + "\n")

        with torch.no_grad():
            wer, cer, output_data = evaluate(test_loader=test_loader,
                                             device=device,
                                             model=model,
                                             decoder=evaluation_decoder,
                                             target_decoder=evaluation_decoder)

        state.add_results(epoch=epoch,
                          loss_result=state.avg_loss,
                          wer_result=wer,
                          cer_result=cer)

        print('Validation Summary Epoch: [{0}]\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, cer=cer))

        with open(result_path, "a", encoding="utf-8") as ff:
            ff.write("\nval -> epoch : " + str(epoch + 1) +
                     " cer : " + str(cer) + "\n")

        # Push this epoch's results to the configured loggers.
        if main_proc and cfg.visualization.visdom:
            visdom_logger.update(epoch, state.result_state)

        if main_proc and cfg.visualization.tensorboard:
            tensorboard_logger.update(epoch, state.result_state, model.named_parameters())

        if main_proc and cfg.checkpointing.checkpoint:  # Save epoch checkpoint
            checkpoint_handler.save_checkpoint_model(epoch=epoch, state=state)

        # anneal lr
        for g in optimizer.param_groups:
            g['lr'] = g['lr'] / cfg.optimizer.learning_anneal
        print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))

        if main_proc and (state.best_wer is None or state.best_wer > wer):
            checkpoint_handler.save_best_model(epoch=epoch, state=state)
            state.set_best_wer(wer)

        # Reset unconditionally so the next epoch's average starts from zero.
        state.reset_avg_loss()
        state.reset_training_step()  # Reset training step for next epoch
if __name__ == '__main__':
    # Script entry point: load a model and a decoder, then evaluate on the test manifest.
    args = parser.parse_args()
    torch.set_grad_enabled(False)  # evaluation only, no gradients needed

    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    # Decoder used to produce the hypotheses.
    decoder_options = dict(
        decoder_type=args.decoder,
        labels=model.labels,
        lm_path=args.lm_path,
        alpha=args.alpha,
        beta=args.beta,
        cutoff_top_n=args.cutoff_top_n,
        cutoff_prob=args.cutoff_prob,
        beam_width=args.beam_width,
        lm_workers=args.lm_workers,
    )
    decoder = load_decoder(**decoder_options)

    # Greedy decoder used to turn the reference targets into strings.
    target_decoder = GreedyDecoder(
        model.labels,
        blank_index=model.labels.index('_'),
    )

    test_dataset = SpectrogramDataset(
        audio_conf=model.audio_conf,
        manifest_filepath=args.test_manifest,
        labels=model.labels,
        normalize=True,
    )
    test_loader = AudioDataLoader(
        test_dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
    )

    wer, cer, output_data = evaluate(
        test_loader=test_loader,
        device=device,
        model=model,
        decoder=decoder,
        target_decoder=target_decoder,
        save_output=args.save_output,
        verbose=args.verbose,
        half=args.half,
    )
def run_evaluation(test_loader,
                   device,
                   model,
                   decoder,
                   target_decoder,
                   save_output=None,
                   verbose=False,
                   use_half=False):
    """Evaluate ``model`` with ``decoder`` and, side by side, with a greedy decoder.

    Every batch is decoded twice from the same network output: once with the
    supplied ``decoder`` and once with a ``GreedyDecoder`` built from the
    model's labels.  Error counts are accumulated over the whole loader.

    Args:
        test_loader: Iterable of ``(inputs, targets, input_percentages,
            target_sizes)`` batches; must support ``len()``.
        device: Device the inputs are moved to.
        model: Model called as ``model(inputs, input_sizes)``; it must expose
            ``labels`` containing ``'_'`` (used as the blank index).
        decoder: Decoder providing ``decode``, ``wer`` and ``cer``.
        target_decoder: Decoder whose ``convert_to_strings`` turns the targets
            into reference strings.
        save_output: When not ``None``, the raw outputs of every batch are
            collected in the returned ``output_data``.
        verbose: Print references, hypotheses and per-utterance rates.
        use_half: Cast the inputs to half precision before the forward pass.

    Returns:
        Tuple ``(wer, cer, output_data, greedy_wer, greedy_cer)`` where the
        four rates are percentages.

    Raises:
        ZeroDivisionError: If the references contain no words or characters
            (for example with an empty loader).
    """
    model.eval()
    total_cer, total_wer = 0, 0
    total_cer2, total_wer2 = 0, 0
    # Both decoders are scored against the same references, so one pair of
    # counters serves as the denominator for both.
    num_tokens, num_chars = 0, 0
    output_data = []

    # Loop-invariant: built once instead of once per batch.
    decoder2 = GreedyDecoder(labels=model.labels,
                             blank_index=model.labels.index('_'))

    for data in tqdm(test_loader, total=len(test_loader)):
        inputs, targets, input_percentages, target_sizes = data
        # Absolute input lengths derived from the per-sample fractions (in-place).
        # Original note: length of one spectrogram row per sample; one entry per
        # sample in the batch (32 in the author's setup).
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        inputs = inputs.to(device)
        if use_half:
            inputs = inputs.half()  # original note: does not change results much

        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size

        out, output_sizes = model(inputs, input_sizes)

        decoded_output, _ = decoder.decode(out, output_sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)

        if save_output is not None:
            # add output to data array, and continue
            output_data.append((out.cpu(), output_sizes, target_strings))

        greedy_output, _ = decoder2.decode(out, output_sizes)

        for x, target in enumerate(target_strings):
            reference = target[0]
            ref_words = len(reference.split())
            ref_chars = len(reference.replace(' ', ''))
            num_tokens += ref_words
            num_chars += ref_chars

            # Hypothesis of the supplied decoder.
            transcript = decoded_output[x][0]
            wer_inst = decoder.wer(transcript, reference)
            cer_inst = decoder.cer(transcript, reference)
            total_wer += wer_inst
            total_cer += cer_inst
            if verbose:
                print("TRUTH :", reference.lower())
                print("Beam  :", transcript.lower())
                print("WER:", float(wer_inst) / ref_words,
                      "CER:", float(cer_inst) / ref_chars)

            # Hypothesis of the greedy decoder.
            transcript2 = greedy_output[x][0]
            wer_inst2 = decoder2.wer(transcript2, reference)
            cer_inst2 = decoder2.cer(transcript2, reference)
            total_wer2 += wer_inst2
            total_cer2 += cer_inst2
            if verbose:
                print("Greedy:", transcript2.lower())
                print("WER2:", float(wer_inst2) / ref_words,
                      "CER2:", float(cer_inst2) / ref_chars, "\n")

    wer = float(total_wer) / num_tokens
    cer = float(total_cer) / num_chars
    wer2 = float(total_wer2) / num_tokens
    cer2 = float(total_cer2) / num_chars

    # The fifth element is the greedy CER (cer2), not a repeat of cer.
    return wer * 100, cer * 100, output_data, wer2 * 100, cer2 * 100