def val(crnn, data_loader, criterion, converter, device, max_iter=100):
    """Validate a CTC-based CRNN model on `data_loader`.

    Runs at most `max_iter` batches, accumulates the CTC loss, greedily
    decodes predictions and counts exact (case-insensitive) matches.
    Prints a sample of raw/decoded predictions and the final loss/accuracy.

    Args:
        crnn: the CRNN network (called as crnn(image)).
        data_loader: yields (images, encoded_text, lengths, raw_texts) batches.
        criterion: CTC loss taking (preds, text, preds_size, length).
        converter: label converter with a .decode(preds, sizes, raw=...) API.
        device: torch device the images are moved to.
        max_iter: maximum number of validation batches.
    """
    print('Start val')
    crnn.eval()
    val_iter = iter(data_loader)
    n_correct = 0
    loss_avg = utils.averager()
    max_iter = min(max_iter, len(data_loader))
    for _ in range(max_iter):
        # FIX: `val_iter.next()` is Python 2 only; use the builtin next().
        data = next(val_iter)
        cpu_images, text, length, cpu_texts = data
        image = cpu_images.to(device)
        batch_size = cpu_images.size(0)
        # Validation needs no autograd graph; wrap the whole forward/loss pass.
        with torch.no_grad():
            preds = crnn(image)
            # CTC needs the (identical) time-dimension length for each sample.
            preds_size = torch.IntTensor([preds.size(0)] * batch_size)
            cost = criterion(preds, text, preds_size, length)
            loss_avg.add(cost)
            # Greedy decoding: argmax over the class dimension, then flatten
            # to (T*B,) in the order converter.decode expects.
            _, preds = preds.max(2)
            preds = preds.transpose(1, 0).contiguous().view(-1)
            sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        for pred, target in zip(sim_preds, cpu_texts):
            if pred == target.lower():
                n_correct += 1
    # Show a few raw (undeduplicated) vs. decoded predictions from the last batch.
    raw_preds = converter.decode(preds.data, preds_size.data, raw=True)[:opt.n_test_disp]
    for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts):
        print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))
    # NOTE(review): assumes every batch is full-size; the last partial batch
    # makes this a slight under-estimate. Uses opt.batchsize (lowercase) while
    # other blocks use opt.batchSize — confirm which attribute exists.
    accuracy = n_correct / float(max_iter * opt.batchsize)
    print('Test loss: %f, accuray: %.2f%%' % (loss_avg.val(), accuracy * 100))
def val(net, dataset, criterion, max_iter=100):
    """Validate a CTC-based CRNN (legacy Variable-era API) on `dataset`.

    Builds its own DataLoader, encodes labels into the module-level `text`
    and `length` buffers via utils.loadData, and reports average loss and
    exact-match accuracy over at most `max_iter` batches.

    Args:
        net: the CRNN network to evaluate.
        dataset: torch Dataset of (image, text) pairs.
        criterion: CTC loss taking (preds, text, preds_size, length).
        max_iter: maximum number of validation batches.
    """
    print('Start val')
    # FIX: the original iterated/called the global `crnn`, silently ignoring
    # the `net` parameter; use the parameter throughout.
    for p in net.parameters():
        p.requires_grad = False
    net.eval()
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=True, batch_size=opt.batchSize, num_workers=int(opt.workers))
    val_iter = iter(data_loader)
    n_correct = 0
    loss_avg = utils.averager()
    max_iter = min(max_iter, len(data_loader))
    for _ in range(max_iter):
        # FIX: `val_iter.next()` is Python 2 only; use the builtin next().
        data = next(val_iter)
        cpu_images, cpu_texts = data
        batch_size = cpu_images.size(0)
        # Copy the batch into the pre-allocated module-level buffers.
        utils.loadData(image, cpu_images)
        t, l = converter.encode(cpu_texts)
        utils.loadData(text, t)
        utils.loadData(length, l)
        preds = net(image)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
        cost = criterion(preds, text, preds_size, length) / batch_size
        loss_avg.add(cost)
        _, preds = preds.max(2)
        # NOTE(review): squeeze(2) assumes the old (<=0.3) torch.max keepdim
        # behavior; on modern PyTorch max(2) already drops the dim — confirm
        # the torch version this file targets before removing.
        preds = preds.squeeze(2)
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        for pred, target in zip(sim_preds, cpu_texts):
            if pred == target.lower():
                n_correct += 1
    # Display raw vs. decoded predictions from the last batch.
    raw_preds = converter.decode(preds.data, preds_size.data, raw=True)[:opt.n_test_disp]
    for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts):
        print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))
    # NOTE(review): assumes full batches; the trailing partial batch skews this.
    accuracy = n_correct / float(max_iter * opt.batchSize)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
def main(opts):
    """Train the E2E-MLT text detection + recognition network.

    Sets up the model (optionally warm-started from a released E2E-MLT
    checkpoint), then runs the combined EAST-style detection loss plus the
    CTC recognition loss for `opts.max_iters` steps, periodically logging
    and checkpointing.
    """
    ## 1. Initialize the model
    nclass = len(alphabet) + 1  # +1 for the CTC blank symbol
    # Training on ICDAR2015
    model_name = 'E2E-MLT'
    net = ModelResNetSep2(attention=True, nclass=nclass)
    print("Using {0}".format(model_name))
    learning_rate = opts.base_lr
    # optimizer = torch.optim.Adam(net.parameters(), lr=opts.base_lr, weight_decay=weight_decay)
    optimizer = optim.Adam(net.parameters(), lr=opts.base_lr, betas=(0.5, 0.999))
    step_start = 0
    ### Pretrained-model initialization, option 1: keep everything except the
    ### layers whose shape depends on the charset (conv11 / rnn).
    model_dict = net.state_dict()
    if os.path.exists(opts.model):
        print('loading pretrained model from %s' % opts.model)
        # pretrained model from: https://github.com/MichalBusta/E2E-MLT
        pretrained_model = ModelResNetSep2(attention=True, nclass=7500)
        pretrained_model.load_state_dict(torch.load(opts.model)['state_dict'])
        pretrained_dict = pretrained_model.state_dict()
        # Drop charset-dependent weights; keep only keys present in our model.
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict and 'conv11' not in k and 'rnn' not in k}
        model_dict.update(pretrained_dict)
        net.load_state_dict(model_dict)
    ### Option 2: resume training directly from a previous checkpoint.
    # if os.path.exists(opts.model):
    #     print('loading model from %s' % args.model)
    #     step_start, learning_rate = net_utils.load_net(args.model, net, optimizer)
    ###
    if opts.cuda:
        net.cuda()
    net.train()
    ## 2. Set up data and losses
    converter = strLabelConverter(alphabet)
    ctc_loss = CTCLoss()
    data_generator = data_gen.get_batch(num_workers=opts.num_readers,
                                        input_size=opts.input_size,
                                        batch_size=opts.batch_size,
                                        train_list=opts.train_list,
                                        geo_type=opts.geo_type)
    # dg_ocr = ocr_gen.get_batch(num_workers=2,
    #                            batch_size=opts.ocr_batch_size,
    #                            train_list=opts.ocr_feed_list, in_train=True, norm_height=norm_height, rgb=True)
    # (dataset for training the OCR recognizer)
    ## 3. Running-average trackers for each loss component
    bbox_loss = averager(); seg_loss = averager(); angle_loss = averager()
    loss_ctc = averager(); train_loss = averager()
    ## 4. Training loop
    for step in range(step_start, opts.max_iters):
        # Fetch one batch from the background data generator.
        images, image_fns, score_maps, geo_maps, training_masks, gtso, lbso, gt_idxs = next(data_generator)
        # NHWC numpy -> NCHW tensor (optionally on GPU).
        im_data = net_utils.np_to_variable(images.transpose(0, 3, 1, 2), is_cuda=opts.cuda)
        start = time.time()
        try:
            seg_pred, roi_pred, angle_pred, features = net(im_data)
        except:  # NOTE(review): bare except deliberately skips bad batches; consider `except Exception`.
            import sys, traceback
            traceback.print_exc(file=sys.stdout)
            continue
        # Ground truth for the EAST-style detection loss.
        smaps_var = net_utils.np_to_variable(score_maps, is_cuda=opts.cuda)
        training_mask_var = net_utils.np_to_variable(training_masks, is_cuda=opts.cuda)
        # geo_maps channel 4 is the angle; channels 0..3 are the box geometry.
        angle_gt = net_utils.np_to_variable(geo_maps[:, :, :, 4], is_cuda=opts.cuda)
        geo_gt = net_utils.np_to_variable(geo_maps[:, :, :, [0, 1, 2, 3]], is_cuda=opts.cuda)
        try:
            loss = net.loss(seg_pred, smaps_var, training_mask_var, angle_pred, angle_gt, roi_pred, geo_gt)
        except:  # NOTE(review): bare except — same skip-on-failure policy as above.
            import sys, traceback
            traceback.print_exc(file=sys.stdout)
            continue
        bbox_loss.add(net.box_loss_value.item()); seg_loss.add(net.segm_loss_value.item()); angle_loss.add(net.angle_loss_value.item())
        # OCR (recognition) branch of the training step.
        try:
            # E2E-MLT originally trained detection only for the first 10000
            # steps; the `or True` disables that warm-up (comment from author:
            # "this is just extra augumentation step ... in early stage just
            # slows down training").
            if step > 10000 or True:
                ctcl, gt_target , gt_proc = process_boxes(images, im_data, seg_pred[0], roi_pred[0], angle_pred[0], score_maps, gt_idxs, gtso, lbso, features, net, ctc_loss, opts, converter, debug=opts.debug)
                loss_ctc.add(ctcl)
                loss = loss + ctcl.cuda()
            train_loss.add(loss.item())
            net.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        except:  # NOTE(review): bare except also swallows optimizer errors.
            import sys, traceback
            traceback.print_exc(file=sys.stdout)
            pass
        if step % opts.disp_interval == 0:
            end = time.time()  # elapsed time for this step
            ctc_loss_val2 = 0.0  # NOTE(review): unused; kept as-is.
            # First %d is a rough "epoch" derived from step count and batch size.
            print('epoch %d[%d], loss: %.3f, bbox_loss: %.3f, seg_loss: %.3f, ang_loss: %.3f, ctc_loss: %.3f, time %.3f' % (
                step / 1000 * opts.batch_size, step, train_loss.val(), bbox_loss.val(), seg_loss.val(), angle_loss.val(), loss_ctc.val(), end-start))
            # Checkpoint roughly every 20 "epochs" worth of steps.
            if step > step_start and (step % ((1000 / opts.batch_size)*20) == 0):
                save_name = os.path.join(opts.save_path, '{}_{}.h5'.format(model_name, step))
                state = {'step': step,
                         'learning_rate': learning_rate,
                         'state_dict': net.state_dict(),
                         'optimizer': optimizer.state_dict()}
                torch.save(state, save_name)
                print('save model: {}'.format(save_name))
            # Reset the running averages so they don't grow without bound.
            train_loss.reset(); bbox_loss.reset(); seg_loss.reset(); angle_loss.reset(); loss_ctc.reset()
# --- Script-level setup: load a pretrained decoder and build optimizers. ---
decoder_path = 'model/decoder_%d.pth' % opt.loadModelEpoch
print('loading pretrained decoder model from %s' % decoder_path)
decoder.load_state_dict(torch.load(decoder_path))
# Pre-allocated label buffers; `* 5` presumably bounds the max label length
# per image — TODO confirm against the dataset's longest label.
text = torch.LongTensor(opt.batchSize * 5)
length = torch.IntTensor(opt.batchSize)
if opt.cuda:
    encoder.cuda()
    decoder.cuda()
    image = image.cuda()
    text = text.cuda()
    criterion = criterion.cuda()
# loss averager
loss_avg = utils.averager()
# setup optimizer
encoder_optimizer = optim.Adam(encoder.parameters(), lr=opt.lr, betas=(0.5, 0.999))
decoder_optimizer = optim.Adam(decoder.parameters(), lr=opt.lr, betas=(0.5, 0.999))

def trainBatch(encoder, decoder, criterion, encoder_optimizer, decoder_optimizer):
    """Run one training step on the next batch from the global `train_iter`.

    NOTE(review): this body looks truncated (it only fetches a batch and
    reads its size; no forward/backward pass follows) — confirm against
    the original source.
    """
    # NOTE(review): `.next()` is Python 2 only; should be next(train_iter).
    data = train_iter.next()
    cpu_images, cpu_texts = data
    b = cpu_images.size(0)
def val(encoder, decoder, criterion, batchsize, dataset, teach_forcing=False, max_iter=100):
    """Validate an attention encoder-decoder OCR model (greedy decoding).

    Counts per-character matches including the EOS position and returns
    (average loss, accuracy).

    Args:
        encoder: CNN+BiLSTM feature extractor, called as encoder(image).
        decoder: attention decoder with initHidden(batch) and
            __call__(input, hidden, encoder_outputs).
        criterion: per-step classification loss.
        batchsize: DataLoader batch size (decoding logic assumes 1).
        dataset: torch Dataset of (image, text) pairs.
        teach_forcing: if True, the free-running decode loop is skipped.
        max_iter: maximum number of validation batches.

    Returns:
        Tuple of (loss_avg.val(), accuracy).
    """
    # Freeze both networks for evaluation.
    for e, d in zip(encoder.parameters(), decoder.parameters()):
        e.requires_grad = False
        d.requires_grad = False
    encoder.eval()
    decoder.eval()
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=batchsize, num_workers=int(opt.workers))
    val_iter = iter(data_loader)
    n_correct = 0
    n_total = 0
    loss_avg = utils.averager()
    max_iter = min(max_iter, len(data_loader))
    # max_iter = len(data_loader) - 1
    for i in range(max_iter):
        # FIX: `val_iter.next()` is Python 2 only; use the builtin next().
        data = next(val_iter)
        i += 1  # kept: shifts the `i % 100` display check by one
        cpu_images, cpu_texts = data
        b = cpu_images.size(0)
        if opt.cuda:
            image = cpu_images.to('cuda')
        else:
            image = cpu_images
        target_variable = converter.encode(cpu_texts)
        # Each sample contributes its label length plus the EOS stop position.
        n_total += len(cpu_texts[0]) + 1
        decoded_words = []
        decoded_label = []
        encoder_outputs = encoder(image)  # CNN + BiLSTM feature extraction
        if opt.cuda:
            target_variable = target_variable.cuda()
            decoder_input = target_variable[0].cuda()  # SOS token starts decoding
            decoder_hidden = decoder.initHidden(b).cuda()
        else:
            decoder_input = target_variable[0]  # SOS token starts decoding
            decoder_hidden = decoder.initHidden(b)
        loss = 0.0
        if not teach_forcing:
            # Free-running decode: feed back each prediction until EOS.
            for di in range(1, target_variable.shape[0]):  # max label length
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                loss += criterion(decoder_output, target_variable[di])  # one char per step
                # NOTE(review): adds the cumulative loss every step, so early
                # steps are counted repeatedly — kept to preserve reported numbers.
                loss_avg.add(loss)
                topv, topi = decoder_output.data.topk(1)
                ni = topi.squeeze(1)
                decoder_input = ni
                if ni == EOS_TOKEN:
                    decoded_words.append('<EOS>')
                    decoded_label.append(EOS_TOKEN)
                    break
                else:
                    decoded_words.append(converter.decode(ni))
                    decoded_label.append(ni)
        # Count correct characters against the target (skipping the SOS row).
        for pred, target in zip(decoded_label, target_variable[1:, :]):
            if pred == target:
                n_correct += 1
        if i % 100 == 0:  # show one sample every 100 batches
            texts = cpu_texts[0]
            print('pred:%-20s, gt: %-20s' % (decoded_words, texts))
    accuracy = n_correct / float(n_total)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
    return (loss_avg.val(), accuracy)
# --- Script-level setup: buffers, CUDA placement, and optimizers. ---
# NOTE(review): opt.imgH is used for both height and width — looks like the
# width should be opt.imgW; confirm before changing.
image = torch.FloatTensor(opt.batchSize, 3, opt.imgH, opt.imgH)
# `* 5` presumably bounds the max label length per image — TODO confirm.
text = torch.LongTensor(opt.batchSize * 5)
length = torch.IntTensor(opt.batchSize)
if opt.cuda:
    encoder.cuda()
    decoder.cuda()
    # encoder = torch.nn.DataParallel(encoder, device_ids=range(opt.ngpu))
    # decoder = torch.nn.DataParallel(decoder, device_ids=range(opt.ngpu))
    image = image.cuda()
    text = text.cuda()
    criterion = criterion.cuda()
# loss averager
loss_avg = utils.averager()
loss_epoch = utils.averager()
# setup optimizer
if opt.adam:
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
elif opt.adadelta:
    # FIX: the original built a single Adadelta over the encoder only and
    # bound it to an unused `optimizer` name, leaving encoder_optimizer /
    # decoder_optimizer undefined (NameError at step time) and the decoder
    # untrained. Build both, matching the other branches.
    encoder_optimizer = optim.Adadelta(encoder.parameters(), lr=opt.lr)
    decoder_optimizer = optim.Adadelta(decoder.parameters(), lr=opt.lr)
else:
    encoder_optimizer = optim.RMSprop(encoder.parameters(), lr=opt.lr)
    decoder_optimizer = optim.RMSprop(decoder.parameters(), lr=opt.lr)
def val(cfg, encoder, decoder, criterion, batchsize, dataset, teach_forcing=False, max_iter=100, use_beam_search=False, mode='1D'):
    """Validate an attention encoder-decoder OCR model, optionally with beam search.

    Greedy mode feeds each prediction back as the next input until EOS;
    beam-search mode delegates to beam_search_sevice.beam_search and keeps
    the top sequence. Prints average loss and per-character accuracy.
    """
    print('Start val')
    # Freeze both networks for evaluation.
    for e, d in zip(encoder.parameters(), decoder.parameters()):
        e.requires_grad = False
        d.requires_grad = False
    encoder.eval()
    decoder.eval()
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=batchsize, num_workers=int(opt.workers))
    val_iter = iter(data_loader)
    n_correct = 0
    n_total = 0
    loss_avg = utils.averager()
    max_iter = min(max_iter, len(data_loader))
    # max_iter = len(data_loader) - 1
    for i in range(max_iter):
        # NOTE(review): `.next()` is Python 2 only; should be next(val_iter).
        data = val_iter.next()
        i += 1
        cpu_images, cpu_texts = data
        b = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        target_variable = converter.encode(cpu_texts)
        # Because batch_size is 1, n_total is the single text's length plus
        # the EOS stop position that must also be predicted correctly.
        n_total += len(cpu_texts[0]) + 1
        decoded_words = []
        decoded_label = []
        #decoder_attentions = torch.zeros(len(cpu_texts[0]) + 1, opt.max_width)
        #encoder_outputs = encoder(image)
        target_variable = target_variable.cuda()
        #decoder_input = target_variable[0].cuda()
        #decoder_hidden = decoder.initHidden(b).cuda()
        loss = 0.0
        # CNN + BiLSTM feature extraction; shape presumably
        # 70 (width 280 with 4x downsampling) x batchsize x 256 (hidden units)
        # — TODO confirm against the encoder definition.
        encoder_outputs = encoder(image)
        if mode=='1D':
            # SOS token (index 0) for each batch element starts the decode.
            decoder_input = target_variable[0].cuda()
            decoder_hidden = decoder.initHidden(b).cuda()  # (1, batch_size, hidden_size)
        else:
            # 2D mode: build a [B, 1] BOS tensor and a zero hidden state.
            bos_onehot = np.zeros((encoder_outputs.size(1), 1), dtype=np.int32)  # [B, 1]
            bos_onehot[:, 0] = cfg.SEQUENCE.BOS_TOKEN  # usually 0
            decoder_input = torch.tensor(bos_onehot.tolist(), device=gpu_device)  # [B, 1] of BOS
            decoder_hidden = torch.zeros((encoder_outputs.size(1), 256), device=gpu_device)  # [B, 256]
        # At inference `decoded_words` holds the batch's words (max length 32);
        # `char_scores` holds the per-character confidences for each word;
        # `detailed_char_scores` is only filled by beam search, storing the
        # top-k confidences considered for each predicted character.
        # E.g. with 10 predicted boxes (batch of 10), there are 10 words and
        # 10 corresponding per-character confidence lists.
        char_scores = []
        detailed_char_scores = []
        #if not teach_forcing:
        if not use_beam_search:
            # Free-running decode: feed back each prediction until EOS.
            for di in range(1, target_variable.shape[0]):  # max label length
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                loss += criterion(decoder_output, target_variable[di])  # one char per step
                loss_avg.add(loss)
                #decoder_attentions[di-1] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                # Confidence (post-softmax) of the top-1 predicted character.
                char_scores.append(topv.item())
                ni = topi.squeeze(1)
                decoder_input = ni
                #if ni.item() == EOS_TOKEN:
                if ni == EOS_TOKEN:
                    decoded_words.append('<EOS>')
                    decoded_label.append(EOS_TOKEN)
                    break
                else:
                    #decoded_words.append(converter.decode(ni.item()))
                    decoded_words.append(converter.decode(ni))
                    decoded_label.append(ni)
        else:
            #top_seqs = decoder.beam_search(encoder_outputs, decoder_hidden, beam_size=6, max_len=cfg.SEQUENCE.MAX_LENGTH)
            top_seqs = beam_search_sevice.beam_search(cfg, mode, decoder, encoder_outputs, decoder_hidden, beam_size=6, max_len=cfg.SEQUENCE.MAX_LENGTH )
            # Keep only the best-scoring beam; entries are
            # (char_index, confidence, detailed top-k confidences).
            top_seq = top_seqs[0]
            for character in top_seq[1:]:
                character_index = character[0]
                if character_index == EOS_TOKEN:
                    char_scores.append(character[1])
                    detailed_char_scores.append(character[2])
                    #decoded_words.append('<EOS>')
                    decoded_label.append(EOS_TOKEN)
                    break
                else:
                    if character_index == 0:
                        # Index 0 is the padding/blank symbol; render as '~'.
                        decoded_words.append('~')
                        char_scores.append(0.)
                        decoded_label.append(0)
                    else:
                        decoded_words.append(converter.decode(character_index))
                        decoded_label.append(character_index)
                        char_scores.append(character[1])
                        detailed_char_scores.append(character[2])
        # Count correct characters against the target (skipping the SOS row).
        for pred, target in zip(decoded_label, target_variable[1:,:]):
            if pred == target:
                n_correct += 1
        #if i % 100 == 0:  # show one sample every 100 batches
        if i % 2 == 0:  # show one sample every 2 batches
            texts = cpu_texts[0]
            print('pred:%-20s, gt: %-20s' % (decoded_words, texts))
    accuracy = n_correct / float(n_total)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
def val(encoder, decoder, criterion, batchsize, dataset, teach_forcing=False, max_iter=100):
    """Validate an attention encoder-decoder OCR model (greedy decoding).

    Loads each batch into the module-level `image` buffer, decodes
    free-running until EOS, records per-step attention, and prints average
    loss and per-character accuracy (EOS position included).

    Args:
        encoder: CNN+BiLSTM feature extractor, called as encoder(image).
        decoder: attention decoder with initHidden(batch) and
            __call__(input, hidden, encoder_outputs).
        criterion: per-step classification loss.
        batchsize: DataLoader batch size (decoding logic assumes 1).
        dataset: torch Dataset of (image, text) pairs.
        teach_forcing: if True, the free-running decode loop is skipped.
        max_iter: maximum number of validation batches.
    """
    print('Start val')
    # Freeze both networks for evaluation.
    for e, d in zip(encoder.parameters(), decoder.parameters()):
        e.requires_grad = False
        d.requires_grad = False
    encoder.eval()
    decoder.eval()
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=batchsize, num_workers=int(opt.workers))
    val_iter = iter(data_loader)
    n_correct = 0
    n_total = 0
    loss_avg = utils.averager()
    max_iter = min(max_iter, len(data_loader))
    # max_iter = len(data_loader) - 1
    for i in range(max_iter):
        # FIX: `val_iter.next()` is Python 2 only; use the builtin next().
        data = next(val_iter)
        i += 1  # kept: shifts the `i % 100` display check by one
        cpu_images, cpu_texts = data
        b = cpu_images.size(0)
        # Copy the batch into the pre-allocated module-level image buffer.
        utils.loadData(image, cpu_images)
        target_variable = converter.encode(cpu_texts)
        # Each sample contributes its label length plus the EOS stop position.
        n_total += len(cpu_texts[0]) + 1
        decoded_words = []
        decoded_label = []
        decoder_attentions = torch.zeros(len(cpu_texts[0]) + 1, opt.max_width)
        encoder_outputs = encoder(image)  # CNN + BiLSTM feature extraction
        target_variable = target_variable.cuda()
        decoder_input = target_variable[0].cuda()  # SOS token starts decoding
        decoder_hidden = decoder.initHidden(b).cuda()
        loss = 0.0
        if not teach_forcing:
            # Free-running decode: feed back each prediction until EOS.
            for di in range(1, target_variable.shape[0]):  # max label length
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                loss += criterion(decoder_output, target_variable[di])  # one char per step
                # NOTE(review): adds the cumulative loss every step — kept
                # to preserve the reported numbers.
                loss_avg.add(loss)
                decoder_attentions[di-1] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                ni = topi.squeeze(1)
                decoder_input = ni
                if ni == EOS_TOKEN:
                    decoded_words.append('<EOS>')
                    decoded_label.append(EOS_TOKEN)
                    break
                else:
                    decoded_words.append(converter.decode(ni))
                    decoded_label.append(ni)
        # Count correct characters against the target (skipping the SOS row).
        for pred, target in zip(decoded_label, target_variable[1:,:]):
            if pred == target:
                n_correct += 1
        if i % 100 == 0:  # show one sample every 100 batches
            texts = cpu_texts[0]
            print('pred:%-20s, gt: %-20s' % (decoded_words, texts))
    accuracy = n_correct / float(n_total)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
def predict(encoder, decoder, criterion, batchsize, dataset, workers=2):
    """Run batched greedy prediction over the whole dataset and report accuracy.

    Decodes every sample in `dataset` with the attention decoder, tracking
    per-sample completion via a flag list so samples that already emitted
    EOS stop accumulating characters. Prints each prediction vs. ground
    truth and the final loss/accuracy.

    Args:
        encoder: CNN+BiLSTM feature extractor, called as encoder(image).
        decoder: attention decoder with initHidden(batch) and
            __call__(input, hidden, encoder_outputs).
        criterion: per-step classification loss.
        batchsize: DataLoader batch size.
        dataset: torch Dataset of (image, text) pairs.
        workers: DataLoader worker count.
    """
    # Freeze both networks for evaluation.
    for e, d in zip(encoder.parameters(), decoder.parameters()):
        e.requires_grad = False
        d.requires_grad = False
    encoder.eval()
    decoder.eval()
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=batchsize, num_workers=workers)
    iterator = iter(data_loader)
    n_correct = 0  # correct characters (including EOS)
    n_total = 0    # total characters (including EOS)
    n_current = 0  # running sample index for display
    loss_avg = utils.averager()
    EOS_TOKEN = 1  # end of sequence
    # FIX (perf, behavior-identical): allocate the image buffer once instead
    # of per batch; utils.loadData resizes it to each batch anyway.
    image = torch.FloatTensor(batchsize, 3, 1, 1)
    image = image.cuda()
    for _ in range(len(data_loader)):
        # FIX: `iterator.next()` is Python 2 only; use the builtin next().
        data = next(iterator)
        cpu_images, cpu_texts = data
        b = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        target_variable = converter(alphabet).encode(cpu_texts)
        target_variable = target_variable.cuda()
        encoder_outputs = encoder(image)  # CNN + BiLSTM feature extraction
        decoder_input = target_variable[0].cuda()  # SOS token starts decoding
        decoder_hidden = decoder.initHidden(b).cuda()
        loss = 0.0
        # One word/label accumulator per sample in the batch.
        decoded_words = [[] for _ in range(batchsize)]
        decoded_labels = [[] for _ in range(batchsize)]
        # flag[k] stays True until sample k emits EOS.
        flag = [True] * batchsize
        for di in range(1, target_variable.shape[0]):  # max label length
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])  # one char per step
            topv, topi = decoder_output.data.topk(1)
            ni = topi.squeeze()
            decoder_input = ni
            for count in range(batchsize):
                if flag[count]:
                    if ni[count] == EOS_TOKEN:
                        decoded_words[count].append('<EOS>')
                        decoded_labels[count].append(EOS_TOKEN)
                        flag[count] = False
                    else:
                        decoded_words[count].append(
                            converter(alphabet).decode(ni[count]))
                        decoded_labels[count].append(ni[count])
        loss_avg.add(loss)
        # Score every sample in the batch against its target column.
        for count in range(batchsize):
            n_total += len(cpu_texts[count]) + 1  # EOS included
            for pred, target in zip(decoded_labels[count], target_variable[1:, count]):
                if pred == target:
                    n_correct += 1
            texts = cpu_texts[count]
            print('%d Pred:%-20s, GT: %-20s' % (n_current, decoded_words[count], texts))
            n_current += 1
    accuracy = n_correct / float(n_total)
    print('Loss: %f, Accuracy: %f' % (loss_avg.val(), accuracy))