def CTCLoss(opt):
    """Return a CTC (connectionist temporal classification) loss instance.

    Prefers the native ``torch.nn.CTCLoss``; on older PyTorch builds that
    lack it, falls back to the external ``warpctc_pytorch`` binding.

    Args:
        opt: accepted for call-site compatibility; not used here.

    Returns:
        A freshly constructed CTC loss module.
    """
    try:
        from torch.nn import CTCLoss
        return CTCLoss()
    except ImportError:
        # Narrowed from a bare `except:` — only a missing symbol should
        # trigger the warp-ctc fallback, not unrelated errors.
        from warpctc_pytorch import CTCLoss
        return CTCLoss()
def __init__(self, opt, dataset_name='iam', reset_log=False):
    """Initialise the training/eval session: options, loss, transforms,
    dataset roots, log directory, RNG seeds and CUDA/cuDNN settings.

    NOTE(review): reconstructed from a collapsed source line — the nesting
    of the dataset-root branches under `mode == 'train'` should be confirmed.
    """
    self.opt = opt
    self.mode = self.opt.mode
    self.dataset_name = dataset_name
    self.stn_nc = self.opt.stn_nc
    self.cnn_nc = self.opt.cnn_nc
    self.nheads = self.opt.nheads
    self.criterion = CTCLoss(blank=0, reduction='sum', zero_infinity=True)
    self.label_transform = self.init_label_transform()
    self.test_transforms = self.init_test_transforms()
    self.train_transforms = self.init_train_transforms()
    # How many train / validation batches get scored during validation.
    self.val1_iter = self.opt.val1_iter
    self.val2_iter = self.opt.val2_iter
    self.stn_attn = None
    self.val_metric = 'cer'
    self.use_loc_bn = False
    self.CNN = 'ResCRNN'
    self.loc_block = 'LocNet'
    # Flattened 2x3 identity affine, used as the STN's neutral transform.
    self.identity_matrix = torch.tensor([1, 0, 0, 0, 1, 0],
                                        dtype=torch.float).cuda()
    if self.mode == 'train':
        # Fall back to the cluster's scratch LMDB paths when none given.
        if len(self.opt.trainRoot) == 0:
            self.train_root = "/ssd_scratch/cvit/santhoshini/{}-train-lmdb".format(self.dataset_name)
        else:
            self.train_root = self.opt.trainRoot
        if len(self.opt.valRoot) == 0:
            self.test_root = "/ssd_scratch/cvit/santhoshini/{}-test-lmdb".format(self.dataset_name)
        else:
            self.test_root = self.opt.valRoot
    if not os.path.exists(self.opt.node_dir):
        os.makedirs(self.opt.node_dir)
    elif reset_log:
        # Wipe previous logs on request, then recreate the directory.
        shutil.rmtree(self.opt.node_dir)
        os.makedirs(self.opt.node_dir)
    # Seed every RNG source for reproducibility.
    random.seed(self.opt.manualSeed)
    np.random.seed(self.opt.manualSeed)
    torch.manual_seed(self.opt.manualSeed)
    # cudnn.benchmark = True
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = True
    # print('CudNN enabled', cudnn.enabled)
    if torch.cuda.is_available() and not self.opt.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        self.opt.gpu_id = list(map(int, self.opt.gpu_id.split(',')))
        torch.cuda.set_device(self.opt.gpu_id[0])
def main():
    """Evaluate a CRNN checkpoint on the Synth90k test split and print
    the resulting loss/accuracy."""
    eval_batch_size = config["eval_batch_size"]
    cpu_workers = config["cpu_workers"]
    reload_checkpoint = config["reload_checkpoint"]
    img_height = config["img_height"]
    img_width = config["img_width"]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"device: {device}")

    test_dataset = Synth90kDataset(root_dir=config["data_dir"], mode="test",
                                   img_height=img_height, img_width=img_width)
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=eval_batch_size,
                             shuffle=False,
                             num_workers=cpu_workers,
                             collate_fn=synth90k_collate_fn)

    # +1 reserves an index for the CTC blank symbol.
    num_class = len(Synth90kDataset.LABEL2CHAR) + 1
    crnn = CRNN(1, img_height, img_width, num_class,
                map_to_seq_hidden=config["map_to_seq_hidden"],
                rnn_hidden=config["rnn_hidden"],
                leaky_relu=config["leaky_relu"])
    crnn.load_state_dict(torch.load(reload_checkpoint, map_location=device))
    crnn.to(device)

    criterion = CTCLoss(reduction="sum")
    criterion.to(device)

    evaluation = evaluate(crnn, test_loader, criterion,
                          decode_method=config["decode_method"],
                          beam_size=config["beam_size"])
    print("test_evaluation: loss={loss}, acc={acc}".format(**evaluation))
def main():
    """Train a CRNN with CTC loss via an ignite supervised trainer,
    logging loss periodically and checkpointing/validating per epoch."""
    device = torch.device('cuda')
    model = CRNN(args.nc, args.nclass, args.nh)
    if args.pretrained:
        model.load_state_dict(torch.load(args.pretrained))
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr)
    criterion = CTCLoss(zero_infinity=True).cuda()
    # Build the LMDB store on first run.
    if not os.path.exists(lmdb_train_path):
        create_lmdb(args.root, args.trainroot, args.valroot)
    if not os.path.exists(args.expr_dir):
        os.makedirs(args.expr_dir)
    if torch.cuda.is_available():
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True
    train_loader, val_loader = get_data_loader(args)
    trainer = create_supervised_trainer(model, optimizer, criterion,
                                        device=device)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine: Engine):
        # Report the running loss every `log_interval` iterations.
        if engine.state.iteration % args.log_interval == 0:
            print("Epoch {} [{}/{}] :Loss {}".format(
                engine.state.epoch,
                engine.state.iteration % (len(train_loader)),
                len(train_loader),
                engine.state.output))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_test_acc(engine):
        # Validate each epoch; persist the model every `save_interval` epochs.
        valid(model, val_loader)
        if engine.state.epoch % args.save_interval == 0:
            torch.save(model, f'{args.expr_dir}/crnn_{engine.state.epoch}.pth')

    trainer.run(train_loader, max_epochs=args.epochs)
def evaluate_batch(self, batch_data, metric_names):
    """Evaluate one batch: score each sample's CTC loss on its un-padded
    prediction region and collect the requested metrics."""
    x = batch_data["imgs"].to(self.device)
    y = batch_data["labels"].to(self.device)
    y_len = batch_data["labels_len"]
    str_y = batch_data["raw_labels"]
    loss = 0
    loss_ctc = CTCLoss(blank=self.dataset.tokens["blank"], reduction="mean")
    with autocast(enabled=self.params["training_params"]["use_amp"]):
        x = self.models["encoder"](x)
        global_pred = self.models["decoder"](x)
        ind_x = list()
        b, c, h, w = global_pred.size()
        for i in range(b):
            # Crop away the padded rows/columns before flattening the 2-D
            # prediction map into a 1-D CTC sequence.
            x_h, x_w = batch_data["imgs_reduced_shape"][i][:2]
            pred = global_pred[i, :, :x_h, :x_w].reshape(1, c, x_h * x_w)
            loss += loss_ctc(pred.permute(2, 0, 1),
                             y[i].unsqueeze(0),
                             [x_h * x_w],
                             [y_len[i]])
            ind_x.append(torch.argmax(pred, dim=1).cpu().numpy()[0])
    metrics = self.compute_metrics(ind_x, str_y, loss=loss.item(),
                                   metric_names=metric_names)
    if "pred" in metric_names:
        metrics["pred"].extend(
            [batch_data["unchanged_labels"], batch_data["names"]])
    return metrics
def main():
    """Train CrnnSmall on generated text images with CTC loss for 10 epochs."""
    text_image = TextImage('abc', 32, 96, 5, 2)
    data_set = Generator(text_image)
    data_loader = DataLoader(data_set, batch_size=32, shuffle=True)
    # +1 reserves an output class for the CTC blank.
    model = CrnnSmall(len(text_image.alpha) + 1, num_base_filters=8)
    criterion = CTCLoss()
    optimizer = optim.Adadelta(model.parameters(), weight_decay=1e-4)
    num_epochs = 10
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        model.train()
        for image, target, input_len, target_len in tqdm(data_loader):
            # print(target, target_len, input_len)
            outputs = model(image.to(torch.float32))  # [B,N,C]
            m_outputs = outputs
            outputs = torch.log_softmax(outputs, dim=2).to(torch.float64)
            outputs = outputs.permute([1, 0, 2])  # [N,B,C]
            loss = criterion(outputs[:], target, input_len, target_len)
            # Gradient update
            model.zero_grad()
            loss.backward()
            optimizer.step()
            # Accumulate this epoch's loss (weighted by batch size).
            epoch_loss += loss.item() * image.size(0)
            if np.isnan(loss.item()):
                # Dump the offending batch for debugging NaN losses.
                print(target, m_outputs)
        epoch_loss = epoch_loss / len(data_loader.dataset)
        # Log progress.
        print('Epoch: {}/{} loss: {:03f}'.format(epoch + 1, num_epochs,
                                                 epoch_loss))
def evaluate_batch(self, params):
    """Score one validation batch: CTC loss plus edit-distance metrics.

    Returns:
        (losses, metrics): dicts with "loss_ctc" and "edit"/"diff_len".
    """
    with torch.no_grad():
        x, y, seq_len, seq_reduced_len, labels_len, _, _ = params
        x = torch.from_numpy(x).float().permute(0, 3, 1, 2).to(self.device)
        y = torch.from_numpy(y).long().to(self.device)
        for model_name in self.models.keys():
            self.models[model_name].eval()
        # Blank index is placed after all real labels.
        loss_ctc = CTCLoss(blank=len(self.all_labels))
        global_pred = self.models["end_to_end_model"](x)
        loss = loss_ctc(global_pred.permute(2, 0, 1), y,
                        seq_reduced_len.tolist(), labels_len.tolist())
        loss_val = loss.item()
        truth = [self.ctc_ind_to_str(ind) for ind in y]
        pred = [self.ctc_decode(sample)
                for sample in global_pred.permute(0, 2, 1)]
        edit = self.batch_edit(truth, pred)
        diff_len = self.batch_len(truth, pred)
        losses = {"loss_ctc": loss_val}
        metrics = {"edit": edit, "diff_len": diff_len}
        return losses, metrics
def run(args):
    """Load a CRNN snapshot and run it through Trainer.test on the word
    image test set."""
    test_dataset = TrainWordsDataset(data_set_dir=args.dataset_dir,
                                     transform=ToFloatTensor())
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=args.test_batch_size,
                             shuffle=False,
                             num_workers=4)
    model = CRNN(image_height=args.image_height,
                 num_of_channels=args.num_of_channels,
                 num_of_classes=args.num_of_classes,
                 num_of_lstm_hidden_units=args.num_of_lstm_hidden_units)
    model.load_state_dict(torch.load(args.snapshot))
    print(model)
    trainer = Trainer()
    criterion = CTCLoss(zero_infinity=True, reduction='mean')
    # Pre-allocated image buffer reused by the test loop.
    # NOTE(review): Variable is a deprecated no-op wrapper in modern PyTorch.
    test_image = torch.FloatTensor(args.test_batch_size, 3,
                                   args.image_height, 512)
    test_image = Variable(test_image)
    trainer.test(model=model, test_loader=test_loader,
                 criterion=criterion, test_image=test_image)
def __init__(self,
             vocab: Vocabulary,
             loss_ratio: float = 1.0,
             remove_sos: bool = True,
             remove_eos: bool = False,
             target_namespace: str = "tokens",
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    """Build the CTC layer: vocabulary indices, the CTC loss, and the
    WER/BLEU/latency metrics used during evaluation."""
    super(CTCLayer, self).__init__(vocab, regularizer)
    self.loss_ratio = loss_ratio
    self._remove_sos = remove_sos
    self._remove_eos = remove_eos
    self._target_namespace = target_namespace
    self._num_classes = self.vocab.get_vocab_size(target_namespace)
    # The padding index doubles as the CTC blank symbol.
    self._pad_index = self.vocab.get_token_index(DEFAULT_PADDING_TOKEN,
                                                 self._target_namespace)
    self._loss = CTCLoss(blank=self._pad_index)
    self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                   self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                 self._target_namespace)
    # Special tokens are excluded from metric computation.
    exclude_indices = {self._pad_index, self._end_index, self._start_index}
    self._wer: Metric = WER(exclude_indices=exclude_indices)
    self._bleu: Metric = BLEU(exclude_indices=exclude_indices)
    self._dal: Metric = Average()
    initializer(self)
def main(config):
    """Assemble loaders, label converter, criterion and model from the
    config, then launch training."""
    train_loader, eval_loader = get_dataloader(config['data_loader']['type'],
                                               config['data_loader']['args'])
    # The alphabet may be given as a path to a saved numpy string.
    if os.path.isfile(config['data_loader']['args']['alphabet']):
        config['data_loader']['args']['alphabet'] = str(
            np.load(config['data_loader']['args']['alphabet']))
    prediction_type = config['arch']['args']['prediction']['type']
    # Label converter setup
    if prediction_type == 'CTC':
        converter = CTCLabelConverter(config['data_loader']['args']['alphabet'])
    else:
        converter = AttnLabelConverter(config['data_loader']['args']['alphabet'])
    num_class = len(converter.character)
    # Loss setup
    if prediction_type == 'CTC':
        criterion = CTCLoss(zero_infinity=True).cuda()
    else:
        criterion = CrossEntropyLoss(ignore_index=0).cuda()  # ignore [GO] token = ignore index 0
    model = get_model(num_class, config)
    config['name'] = config['name'] + '_' + model.name
    trainer = Trainer(config=config,
                      model=model,
                      criterion=criterion,
                      train_loader=train_loader,
                      val_loader=eval_loader,
                      converter=converter,
                      weights_init=weights_init)
    trainer.train()
def train(net, optimizer, trainSet, valSet, use_gpu):
    """Train `net` with CTC loss, checkpointing and validating each epoch.

    Args:
        net: recognition network (possibly wrapped in DataParallel).
        optimizer: optimizer over `net`'s parameters.
        trainSet: training dataset.
        valSet: validation dataset, scored by `val` once per epoch.
        use_gpu: move batches to CUDA when True.
    """
    ctc_loss = CTCLoss(blank=0, reduction='mean', zero_infinity=True)
    net.train()
    epoch = 0
    print('Loading Dataset...')
    epoch_size = math.ceil(len(trainSet) / args.batch_size)
    max_iter = args.max_epoch * epoch_size
    start_iter = 0
    t_loss = 0.0
    print("Begin training...")
    for iteration in range(start_iter, max_iter):
        if iteration % epoch_size == 0:
            # New epoch: rebuild a shuffled loader, then checkpoint/validate.
            epoch += 1
            epochnum.append(epoch)
            batch_iterator = iter(DataLoader(trainSet, args.batch_size,
                                             shuffle=True,
                                             num_workers=args.num_workers,
                                             collate_fn=custom_collate_fn))
            if epoch % 1 == 0 and epoch > 0:
                if args.num_gpu > 1:
                    torch.save(net.module.state_dict(),
                               os.path.join(args.weights_save_folder,
                                            'epoch_' + str(epoch) + '.pth'))
                else:
                    torch.save(net.state_dict(),
                               os.path.join(args.weights_save_folder,
                                            'epoch_' + str(epoch) + '.pth'))
                val(net, valSet, ctc_loss)
        load_t0 = time.time()
        images, labels, target_lengths, input_lengths = next(batch_iterator)
        if use_gpu:
            images = images.cuda()
            labels = labels.cuda()
            target_lengths = target_lengths.cuda()
            input_lengths = input_lengths.cuda()
        out = net(images)
        optimizer.zero_grad()
        loss = ctc_loss(log_probs=out, targets=labels,
                        target_lengths=target_lengths,
                        input_lengths=input_lengths)
        loss.backward()
        optimizer.step()
        # BUGFIX: take the detached scalar once — the original accumulated
        # and printed the live tensor, retaining every iteration's autograd
        # graph in `t_loss`/`epochloss` (unbounded memory growth).
        loss_val = loss.item()
        load_t1 = time.time()
        batch_time = load_t1 - load_t0
        eta = int(batch_time * (max_iter - iteration))
        print('Epoch:{}/{} || Epochiter: {}/{} || Iter: {}/{} || Loss: {:.4f}|| Batchtime: {:.4f} s || ETA: {}'.format
              (epoch, args.max_epoch, (iteration % epoch_size) + 1,
               epoch_size, iteration + 1, max_iter, loss_val, batch_time,
               str(datetime.timedelta(seconds=eta))))
        t_loss = t_loss + loss_val
        if ((iteration % epoch_size) + 1 == epoch_size):
            epochloss.append(t_loss / epoch_size)
            t_loss = 0.0
    if args.num_gpu > 1:
        torch.save(net.module.state_dict(),
                   os.path.join(args.weights_save_folder, 'Final-crnn.pth'))
    else:
        torch.save(net.state_dict(),
                   os.path.join(args.weights_save_folder, 'Final-crnn.pth'))
    print('Finished Training')
def __init__(self, labels: List,
             model_cfg: Union[UniDirectionalConfig, BiDirectionalConfig,
                              ConvolutionConfig],
             precision: int,
             optim_cfg: Union[AdamConfig, SGDConfig],
             spect_cfg: SpectConfig):
    """Build the DeepSpeech Lightning module: conv front-end, RNN or
    fully-convolutional body (per `model_cfg`), CTC loss and WER/CER
    evaluation decoders."""
    super().__init__()
    self.save_hyperparameters()
    self.model_cfg = model_cfg
    self.precision = precision
    self.optim_cfg = optim_cfg
    self.spect_cfg = spect_cfg
    # The config's concrete type selects the architecture variant.
    self.convolutional = OmegaConf.get_type(model_cfg) is ConvolutionConfig
    self.bidirectional = OmegaConf.get_type(model_cfg) is BiDirectionalConfig
    self.labels = labels
    self.conv = MaskConv(
        nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2),
                      padding=(20, 5)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1),
                      padding=(10, 5)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True)))
    # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
    rnn_input_size = int(
        math.floor((self.spect_cfg.sample_rate * self.spect_cfg.window_size) / 2) + 1)
    rnn_input_size = int(math.floor(rnn_input_size + 2 * 20 - 41) / 2 + 1)
    rnn_input_size = int(math.floor(rnn_input_size + 2 * 10 - 21) / 2 + 1)
    rnn_input_size *= 32
    if self.convolutional is False:
        self.rnns, self.lookahead, self.fc = self._rnn_construct(
            rnn_input_size)
    else:
        self.deep_conv, self.fc = self._conv_construct(rnn_input_size)
    self.inference_softmax = InferenceBatchSoftmax()
    # '_' marks the CTC blank in the label set.
    self.criterion = CTCLoss(blank=self.labels.index('_'),
                             reduction='sum', zero_infinity=True)
    self.evaluation_decoder = GreedyDecoder(
        self.labels)  # Decoder used for validation
    self.wer = WordErrorRate(decoder=self.evaluation_decoder,
                             target_decoder=self.evaluation_decoder)
    self.cer = CharErrorRate(decoder=self.evaluation_decoder,
                             target_decoder=self.evaluation_decoder)
def __init__(self, params: configargparse.Namespace):
    """Calculates Loss, Accuracy, Perplexity Statistics.

    :param configargparse.Namespace params: The training options
    """
    super(StatsCalculator, self).__init__()
    # Padding label is excluded from the cross-entropy.
    self.ignore_label = params.text_pad
    self.char_list = params.char_list
    self.criterion = CrossEntropyLoss(ignore_index=self.ignore_label,
                                      reduction="mean")
    self.ctc = CTCLoss(zero_infinity=True)
def __init__(self, freq_dim, output_dim, config):
    """Build the CTC head on top of the encoder.

    The blank token takes the extra index `output_dim`, so the final
    linear layer emits `output_dim + 1` classes.
    """
    super().__init__(freq_dim, config)
    # include the blank token
    self.blank = output_dim
    fc_inp_dim = self.encoder_dim
    if config['encoder']['rnn']['bidirectional']:
        # Bidirectional encoders concatenate both directions' states.
        fc_inp_dim *= 2
    self.fc = nn.Linear(fc_inp_dim, output_dim + 1)
    self.loss_func = CTCLoss(blank=self.blank)
def main():
    """Evaluate a trained CRNN on the Synth90k test split and report
    loss and accuracy."""
    eval_batch_size = config['eval_batch_size']
    cpu_workers = config['cpu_workers']
    reload_checkpoint = config['reload_checkpoint']
    img_height = config['img_height']
    img_width = config['img_width']
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'device: {device}')
    test_dataset = Synth90kDataset(root_dir=config['data_dir'],
                                   mode='test',
                                   img_height=img_height,
                                   img_width=img_width)
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=eval_batch_size,
                             shuffle=False,
                             num_workers=cpu_workers,
                             collate_fn=synth90k_collate_fn)
    # One extra class for the CTC blank.
    num_class = len(Synth90kDataset.LABEL2CHAR) + 1
    crnn = CRNN(1, img_height, img_width, num_class,
                map_to_seq_hidden=config['map_to_seq_hidden'],
                rnn_hidden=config['rnn_hidden'],
                leaky_relu=config['leaky_relu'])
    crnn.load_state_dict(torch.load(reload_checkpoint, map_location=device))
    crnn.to(device)
    criterion = CTCLoss(reduction='sum')
    criterion.to(device)
    evaluation = evaluate(crnn, test_loader, criterion,
                          decode_method=config['decode_method'],
                          beam_size=config['beam_size'])
    print('test_evaluation: loss={loss}, acc={acc}'.format(**evaluation))
def __init__(self, hparams, decoder=None, sample_rate=16000):
    """Construct the DeepSpeech network: 2-layer masked conv front-end,
    a stack of (bidirectional) BatchRNNs, and a per-timestep classifier.

    Args:
        hparams: hyper-parameters (hidden_size, hidden_layers,
            window_size, num_classes).
        decoder: decoder used by the WER/CER metrics.
        sample_rate: audio sample rate used to size the RNN input.
    """
    super(DeepSpeech, self).__init__()
    self.hparams = hparams
    self.decoder = decoder
    self.criterion = CTCLoss(reduction='sum', zero_infinity=True)
    self.wer = WordErrorRate(decoder=self.decoder,
                             target_decoder=self.decoder)
    self.cer = CharErrorRate(decoder=self.decoder,
                             target_decoder=self.decoder)
    self.hidden_size = hparams.hidden_size
    self.hidden_layers = hparams.hidden_layers
    self.conv = MaskConv(
        nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2),
                      padding=(20, 5)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1),
                      padding=(10, 5)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True)))
    # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
    rnn_input_size = int(
        math.floor((sample_rate * hparams.window_size) / 2) + 1)
    rnn_input_size = int(math.floor(rnn_input_size + 2 * 20 - 41) / 2 + 1)
    rnn_input_size = int(math.floor(rnn_input_size + 2 * 10 - 21) / 2 + 1)
    rnn_input_size *= 32
    rnns = []
    # First RNN takes the conv features; the rest chain hidden-to-hidden.
    rnn = BatchRNN(input_size=rnn_input_size,
                   hidden_size=self.hidden_size,
                   bidirectional=True,
                   batch_norm=False)
    rnns.append(('0', rnn))
    for x in range(self.hidden_layers - 1):
        rnn = BatchRNN(input_size=self.hidden_size,
                       hidden_size=self.hidden_size,
                       bidirectional=True)
        rnns.append(('%d' % (x + 1), rnn))
    self.rnns = nn.Sequential(OrderedDict(rnns))
    fully_connected = nn.Sequential(
        nn.BatchNorm1d(self.hidden_size),
        nn.Linear(self.hidden_size, hparams.num_classes, bias=False))
    self.fc = nn.Sequential(SequenceWise(fully_connected), )
    self.inference_softmax = InferenceBatchSoftmax()
def __init__(self, paths: Paths) -> None:
    """Set up TensorBoard logging, CTC loss, and the cached longest
    sample used when rendering plots."""
    self.paths = paths
    self.writer = SummaryWriter(log_dir=paths.checkpoint_dir / 'tensorboard')
    self.ctc_loss = CTCLoss()
    # Used for generating plots
    longest_id = get_longest_mel_id(dataset_path=self.paths.data_dir / 'dataset.pkl')
    self.longest_mel = np.load(str(paths.mel_dir / f'{longest_id}.npy'),
                               allow_pickle=False)
    self.longest_tokens = np.load(str(paths.token_dir / f'{longest_id}.npy'),
                                  allow_pickle=False)
def train(num_epochs, model, device, train_loader, val_loader, images,
          texts, lengths, converter, optimizer, lr_scheduler,
          prediction_dir, print_iter):
    """Train `model` with CTC loss using pre-allocated image/text/length
    buffers, validating and checkpointing at the end of every epoch."""
    criterion = CTCLoss()
    criterion.to(device)
    images = images.to(device)
    model.to(device)
    for epoch in range(num_epochs):
        print(epoch)
        count = 0
        model.train()
        for i, datas in enumerate(train_loader):
            datas, targets = datas
            batch_size = datas.size(0)
            count += batch_size
            # Copy the batch into the shared buffers the model reads from.
            dataloader.loadData(images, datas)
            t, l = converter.encode(targets)
            dataloader.loadData(texts, t)
            dataloader.loadData(lengths, l)
            preds = model(images)
            preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
            cost = criterion(preds, texts, preds_size, lengths) / batch_size
            model.zero_grad()
            cost.backward()
            optimizer.step()
            if count % print_iter < train_loader.batch_size:
                print('epoch {} [{}/{}]loss : {}'.format(
                    epoch, count, len(train_loader.dataset), cost))
        validation(model, device, val_loader, images, texts, lengths,
                   converter, prediction_dir)
        save_model('{}'.format(epoch), model, optimizer, lr_scheduler)
        lr_scheduler.step()
def __init__(self, args):
    """Set up CRNN + UNet-preprocessor training state from CLI args.

    Builds datasets/loaders, the recognition and preprocessing models,
    CTC + MSE losses, and one Adam optimizer per model.
    """
    self.batch_size = 1
    self.lr_crnn = args.lr_crnn
    self.lr_prep = args.lr_prep
    self.max_epochs = args.epoch
    self.inner_limit = args.inner_limit
    self.crnn_model_path = args.crnn_model
    self.sec_loss_scalar = args.scalar
    self.ocr_name = args.ocr
    self.std = args.std
    self.is_random_std = args.random_std
    torch.manual_seed(42)
    self.train_set = properties.pos_text_dataset_train
    self.validation_set = properties.pos_text_dataset_dev
    self.input_size = properties.input_size
    self.ocr = get_ocr_helper(self.ocr_name)
    self.char_to_index, self.index_to_char, self.vocab_size = get_char_maps(
        properties.char_set)
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")
    if self.crnn_model_path == '':
        self.crnn_model = CRNN(self.vocab_size, False).to(self.device)
    else:
        # BUGFIX: load the checkpoint given on the command line; the
        # original ignored it and loaded the hard-coded
        # properties.crnn_model_path instead.
        self.crnn_model = torch.load(
            self.crnn_model_path).to(self.device)
    self.crnn_model.register_backward_hook(self.crnn_model.backward_hook)
    self.prep_model = UNet().to(self.device)
    self.dataset = PatchDataset(
        properties.patch_dataset_train, pad=True, include_name=True)
    self.validation_set = PatchDataset(
        properties.patch_dataset_dev, pad=True)
    self.loader_train = torch.utils.data.DataLoader(
        self.dataset, batch_size=self.batch_size, shuffle=True,
        drop_last=True, collate_fn=PatchDataset.collate)
    self.train_set_size = len(self.dataset)
    self.val_set_size = len(self.validation_set)
    self.primary_loss_fn = CTCLoss().to(self.device)
    self.secondary_loss_fn = MSELoss().to(self.device)
    self.optimizer_crnn = optim.Adam(
        self.crnn_model.parameters(), lr=self.lr_crnn, weight_decay=0)
    self.optimizer_prep = optim.Adam(
        self.prep_model.parameters(), lr=self.lr_prep, weight_decay=0)
def main(config):
    """Wire up data loaders, label converter, CTC criterion and model,
    then start training."""
    train_loader, eval_loader = get_dataloader(config['data_loader']['type'],
                                               config['data_loader']['args'])
    converter = strLabelConverter(config['data_loader']['args']['alphabet'])
    criterion = CTCLoss(zero_infinity=True)
    model = get_model(config)
    trainer = Trainer(config=config,
                      model=model,
                      criterion=criterion,
                      train_loader=train_loader,
                      val_loader=eval_loader,
                      converter=converter)
    trainer.train()
def __init__(self, model: PhonemeDetector,
             corpus: CorpusClass,
             val_corpus: CorpusClass = None,
             pretraining=False,
             kl_ratio=0.10,
             output_directory: str = "models",
             batch_size: int = 20,
             lr: float = 3e-5,
             accumulate_steps: int = 1,
             total_steps: int = 30000,
             thaw_after: int = 10000,
             output_model_every: int = 1000,
             checkpoint=None,
             device: str = 'cpu'):
    """Initialise the phoneme-detector trainer: schedules, corpora,
    forced aligner, CTC loss and optimizer; optionally resumes epoch
    and optimizer state from `checkpoint`.
    """
    self.device = device
    self.total_steps = total_steps
    self.pretraining = pretraining
    self.output_model_every = output_model_every
    self.output_directory = output_directory
    # FIX: the original assigned self.accumulate_steps twice; once is enough.
    self.accumulate_steps = accumulate_steps
    self.batch_size = batch_size
    self.lr = lr
    self.kl_ratio = kl_ratio
    self.thaw_after = thaw_after
    os.makedirs(output_directory, exist_ok=True)
    self.model = model
    self.model.train()
    self.freeze()
    self.forced_aligner = ForcedAligner(self.model, n_beams=10)
    self.corpus = corpus
    self.val_corpus = val_corpus
    self.loss_fn = CTCLoss()
    self.optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Samples longer than this are skipped to bound GPU memory use.
    self.memory_max_length = 300000
    self.epoch = 0
    if checkpoint is not None:
        self.epoch = checkpoint["epoch"]
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
def evaluate_batch(self, batch_data, metric_names):
    """Evaluate one batch with a summed CTC loss over the reduced-width
    predictions and return the requested metrics."""
    x = batch_data["imgs"].to(self.device)
    y = batch_data["labels"].to(self.device)
    # Widths after the encoder's downsampling, per sample.
    x_reduced_len = [s[1] for s in batch_data["imgs_reduced_shape"]]
    y_len = batch_data["labels_len"]
    loss_ctc = CTCLoss(blank=self.dataset.tokens["blank"], reduction="sum")
    with autocast(enabled=self.params["training_params"]["use_amp"]):
        x = self.models["encoder"](x)
        global_pred = self.models["decoder"](x)
    loss = loss_ctc(global_pred.permute(2, 0, 1), y, x_reduced_len, y_len)
    pred = torch.argmax(global_pred, dim=1).cpu().numpy()
    metrics = self.compute_metrics(pred, y.cpu().numpy(), x_reduced_len,
                                   y_len, loss=loss.item(),
                                   metric_names=metric_names)
    if "pred" in metric_names:
        metrics["pred"].extend(
            [batch_data["unchanged_labels"], batch_data["names"]])
    return metrics
def main(config):
    """Build converter, criterion, model and loaders from `config`, infer
    the max label length from a dummy forward pass, then train."""
    import torch
    from torch.nn import CTCLoss
    from models import get_model
    from data_loader import get_dataloader
    from trainer import Trainer
    from utils import CTCLabelConverter, AttnLabelConverter, load
    # The alphabet may be supplied as a file of characters.
    if os.path.isfile(config['dataset']['alphabet']):
        config['dataset']['alphabet'] = ''.join(load(config['dataset']['alphabet']))
    prediction_type = config['arch']['args']['prediction']['type']
    # Loss setup
    if prediction_type == 'CTC':
        criterion = CTCLoss(blank=0, zero_infinity=True)
        converter = CTCLabelConverter(config['dataset']['alphabet'])
    elif prediction_type == 'Attn':
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
        converter = AttnLabelConverter(config['dataset']['alphabet'])
    else:
        raise NotImplementedError
    img_channel = 3 if config['dataset']['train']['dataset']['args']['img_mode'] != 'GRAY' else 1
    model = get_model(img_channel, len(converter.character),
                      config['arch']['args'])
    # Default input size, overridden by any Resize pre-process.
    img_h, img_w = 32, 100
    for process in config['dataset']['train']['dataset']['args']['pre_processes']:
        if process['type'] == "Resize":
            img_h = process['args']['img_h']
            img_w = process['args']['img_w']
            break
    sample_input = torch.zeros((2, img_channel, img_h, img_w))
    # A dummy forward pass determines the maximum label length.
    num_label = model.get_batch_max_length(sample_input)
    train_loader = get_dataloader(config['dataset']['train'], num_label)
    assert train_loader is not None
    if 'validate' in config['dataset'] and config['dataset']['validate']['dataset']['args']['data_path'][0] is not None:
        validate_loader = get_dataloader(config['dataset']['validate'], num_label)
    else:
        validate_loader = None
    trainer = Trainer(config=config,
                      model=model,
                      criterion=criterion,
                      train_loader=train_loader,
                      validate_loader=validate_loader,
                      sample_input=sample_input,
                      converter=converter)
    trainer.train()
def __init__(self, args):
    """Initialise UNet-preprocessor training over the VGG text dataset:
    datasets, loaders, CTC + MSE losses and the Adam optimizer."""
    self.ocr_name = args.ocr
    self.batch_size = args.batch_size
    self.lr = args.lr
    self.epochs = args.epoch
    self.std = args.std
    self.ocr = args.ocr
    self.p_samples = args.p
    self.sec_loss_scalar = args.scalar
    self.train_set = properties.vgg_text_dataset_train
    self.validation_set = properties.vgg_text_dataset_dev
    self.input_size = properties.input_size
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")
    self.prep_model = UNet().to(self.device)
    self.ocr = get_ocr_helper(self.ocr)
    self.char_to_index, self.index_to_char, self.vocab_size = get_char_maps(
        properties.char_set)
    # reduction='none': per-sample losses are needed downstream.
    self.loss_fn = CTCLoss(reduction='none').to(self.device)
    transform = transforms.Compose([
        PadWhite(self.input_size),
        transforms.ToTensor(),
    ])
    self.dataset = ImgDataset(
        self.train_set, transform=transform, include_name=True)
    self.validation_set = ImgDataset(
        self.validation_set, transform=transform, include_name=True)
    self.loader_train = torch.utils.data.DataLoader(
        self.dataset, batch_size=self.batch_size, shuffle=True,
        drop_last=True)
    self.loader_validation = torch.utils.data.DataLoader(
        self.validation_set, batch_size=self.batch_size, drop_last=True)
    self.val_set_size = len(self.validation_set)
    self.train_set_size = len(self.dataset)
    self.optimizer = optim.Adam(
        self.prep_model.parameters(), lr=self.lr, weight_decay=0)
    self.secondary_loss_fn = MSELoss().to(self.device)
def train_batch(self, batch_data, metric_names):
    """Run one optimisation step with a summed CTC loss over the
    reduced-width predictions; return batch metrics."""
    x = batch_data["imgs"].to(self.device)
    y = batch_data["labels"].to(self.device)
    # Per-sample sequence lengths after the encoder's downsampling.
    x_reduced_len = [s[1] for s in batch_data["imgs_reduced_shape"]]
    y_len = batch_data["labels_len"]
    loss_ctc = CTCLoss(blank=self.dataset.tokens["blank"], reduction="sum")
    self.optimizer.zero_grad()
    with autocast(enabled=self.params["training_params"]["use_amp"]):
        x = self.models["encoder"](x)
        global_pred = self.models["decoder"](x)
    loss = loss_ctc(global_pred.permute(2, 0, 1), y, x_reduced_len, y_len)
    self.backward_loss(loss)
    self.step_optimizer()
    pred = torch.argmax(global_pred, dim=1).cpu().numpy()
    metrics = self.compute_metrics(pred, y.cpu().numpy(), x_reduced_len,
                                   y_len, loss=loss.item(),
                                   metric_names=metric_names)
    return metrics
def __init__(self, *args, **kwargs):
    """Initialise the CRNN model, label converter, CTC criterion,
    pretrained weights (if any), DataParallel wrapper and optimizer.

    NOTE(review): structure reconstructed from a collapsed source line —
    confirm which statements belong inside the model_type branch.
    """
    super(ModelIinit, self).__init__(*args, **kwargs)
    if self.model_params["model_type"] == "crnn_big_size":
        self.model = crnn_big_size.CRNN(
            nc=self.model_params["num_input_channels"],
            nclass=self.nclass,
            nh=self.model_params["hid_layer_size"])
        self.converter = crnn_utils.strLabelConverter(
            self.model_params["alphabet"])
        self.criterion = CTCLoss(zero_infinity=True).to(
            self.general_params["device"])
        self.model.apply(self.weights_init)
    # Load pretrained weights.
    path_to_pretrained_model = self.model_params[
        self.model_params["model_type"]]["path_pretrained"]
    if path_to_pretrained_model and os.path.isfile(
            path_to_pretrained_model):
        print('loading pretrained model')
        # BUGFIX: load_state_dict expects a state dict, not a file path —
        # the original passed the path string and would raise at runtime.
        self.model.load_state_dict(
            torch.load(path_to_pretrained_model,
                       map_location=self.general_params["device"]))
    self.model.to(self.general_params["device"])
    self.model = torch.nn.DataParallel(self.model,
                                       device_ids=range(
                                           self.general_params["num_gpu"]))
    # Optimizer initialisation.
    if self.model_params["optimizer"] == "Adam":
        # NOTE(review): betas=(lr, 0.999) looks like a bug (beta1 should
        # usually be ~0.9, not the learning rate) — kept as-is, confirm.
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.model_params["adam"]["lr"],
                                    betas=(self.model_params["adam"]["lr"],
                                           0.999))
    elif self.model_params["optimizer"] == "adadelta":
        self.optimizer = optim.Adadelta(self.model.parameters(),
                                        lr=self.model_params["adam"]["lr"])
    else:
        self.optimizer = optim.RMSprop(self.model.parameters(),
                                       lr=self.model_params["adam"]["lr"])
def predict_batch(self, params, metrics_name):
    """Decode one batch and compute only the metrics listed in
    `metrics_name` (cer/wer/pred/ground_truth/diff_len/proba/loss_ctc)."""
    with torch.no_grad():
        x, y, seq_len, seq_reduced_len, labels_len, _, img_name = params
        x = torch.from_numpy(x).float().permute(0, 3, 1, 2).to(self.device)
        y = torch.from_numpy(y).long().to(self.device)
        for model_name in self.models.keys():
            self.models[model_name].eval()
        global_pred = self.models["end_to_end_model"](x)
        truth = [self.ctc_ind_to_str(ind) for ind in y]
        pred = [self.ctc_decode(sample)
                for sample in global_pred.permute(0, 2, 1)]
        metrics = {}
        for key in metrics_name:
            if key == "cer":
                metrics[key] = edit_cer_from_list(truth, pred)
            if key == "wer":
                metrics[key] = edit_wer_from_list(truth, pred)
                metrics["nb_words"] = sum(
                    [len(t.split(" ")) for t in truth])
            elif key == "pred":
                metrics[key] = pred
            elif key == "ground_truth":
                metrics[key] = truth
            elif key == "diff_len":
                metrics[key] = self.batch_len(truth, pred)
            elif key == "proba":
                metrics[key] = self.batch_probas_to_str(
                    global_pred.cpu().detach().numpy(), img_name)
            elif key == "loss_ctc":
                # Blank index sits after all real labels.
                ctc_loss = CTCLoss(blank=len(self.all_labels))
                metrics[key] = ctc_loss(global_pred.permute(2, 0, 1), y,
                                        seq_reduced_len.tolist(),
                                        labels_len.tolist()).item()
        return metrics
def ctc_loss(outputs, targets, mask):
    """Compute CTC loss for seq2seq outputs, treating PAD as the blank.

    Mutates `targets` in place (EOS replaced by PAD) before trimming the
    final column. Returns (loss tensor, loss value).
    """
    USE_CUDA = torch.cuda.is_available()
    device = torch.device("cuda:0" if USE_CUDA else "cpu")
    target_lengths = torch.sum(mask, dim=0).to(device)
    # We need to change targets, PAD_token = 0 = blank
    # EOS token -> PAD_token
    targets[targets == EOS_token] = PAD_token
    outputs = outputs.log_softmax(2)
    # Every sequence in the batch uses the full time dimension.
    input_lengths = outputs.size()[0] * torch.ones(outputs.size()[1],
                                                   dtype=torch.int)
    loss_fn = CTCLoss(blank=PAD_token, zero_infinity=True)
    targets = targets.transpose(1, 0)
    # target_lengths include the EOS token, so subtract one.
    target_lengths = target_lengths - 1
    targets = targets[:, :-1]
    # print(input_lengths, target_lengths)
    # cuDNN's CTC path is disabled around the call — presumably to avoid
    # its stricter length constraints; confirm before removing.
    torch.backends.cudnn.enabled = False
    # TODO: NAN when target_length > input_length, we can increase size or use zero infinity
    loss = loss_fn(outputs, targets, input_lengths, target_lengths)
    torch.backends.cudnn.enabled = True
    return loss, loss.item()
def train_batch(self, batch_data, metric_names):
    """One optimisation step: per-sample CTC loss over each image's
    un-padded prediction region, then backward + optimizer step."""
    x = batch_data["imgs"].to(self.device)
    y = batch_data["labels"].to(self.device)
    y_len = batch_data["labels_len"]
    str_y = batch_data["raw_labels"]
    loss = 0
    loss_ctc = CTCLoss(blank=self.dataset.tokens["blank"], reduction="mean")
    self.optimizer.zero_grad()
    with autocast(enabled=self.params["training_params"]["use_amp"]):
        global_pred = self.models["decoder"](self.models["encoder"](x))
        ind_x = list()
        b, c, h, w = global_pred.size()
        for i in range(b):
            # Crop padding before flattening the 2-D map to a sequence.
            x_h, x_w = batch_data["imgs_reduced_shape"][i][:2]
            pred = global_pred[i, :, :x_h, :x_w]
            pred = pred.reshape(1, c, x_h * x_w)
            # cuDNN is toggled off around the CTC call — presumably to
            # dodge its CTC kernel's constraints; confirm before changing.
            torch.backends.cudnn.enabled = False
            loss += loss_ctc(pred.permute(2, 0, 1),
                             y[i].unsqueeze(0),
                             [x_h * x_w],
                             [y_len[i]])
            torch.backends.cudnn.enabled = True
            ind_x.append(torch.argmax(pred, dim=1).cpu().numpy()[0])
        del global_pred
    self.backward_loss(loss)
    self.step_optimizer()
    metrics = self.compute_metrics(ind_x, str_y, loss=loss.item(),
                                   metric_names=metric_names)
    return metrics
def __init__(self, opt, log_root='./'):
    """Construct the adversarial handwriting model: lexicon, generator,
    style encoder, writer identifier, discriminator, recognizer, and the
    CTC / classification losses."""
    super(AdversarialModel, self).__init__(opt, log_root)
    device = self.device
    self.lexicon = get_lexicon(self.opt.training.lexicon,
                               get_true_alphabet(opt.dataset),
                               max_length=self.opt.training.max_word_len)
    self.max_valid_image_width = (self.opt.char_width
                                  * self.opt.training.max_word_len)
    # Noise fills the style dims the encoder does not produce.
    self.noise_dim = (self.opt.GenModel.style_dim
                      - self.opt.EncModel.style_dim)
    generator = Generator(**opt.GenModel).to(device)
    style_encoder = StyleEncoder(**opt.EncModel).to(device)
    writer_identifier = WriterIdentifier(**opt.WidModel).to(device)
    discriminator = Discriminator(**opt.DiscModel).to(device)
    recognizer = Recognizer(**opt.OcrModel).to(device)
    self.models = Munch(G=generator,
                        D=discriminator,
                        R=recognizer,
                        E=style_encoder,
                        W=writer_identifier)
    self.ctc_loss = CTCLoss(zero_infinity=True, reduction='mean')
    self.classify_loss = CrossEntropyLoss()