def trainBatch(encoder, decoder, criterion, encoder_optimizer, decoder_optimizer):
    """Run one optimisation step of the attention encoder/decoder pair.

    Pulls the next batch from the module-level ``train_iter``, extracts
    features with the encoder (CNN + BiLSTM), then decodes one character
    per step, always feeding the previous argmax prediction back in.
    Returns the summed per-character cross-entropy loss for the batch.
    """
    cpu_images, cpu_texts = train_iter.next()
    batch_size = cpu_images.size(0)
    targets = converter.encode(cpu_texts)
    utils.loadData(image, cpu_images)
    encoder_outputs = encoder(image)  # CNN + BiLSTM feature extraction
    targets = targets.cuda()
    decoder_input = targets[0].cuda()  # SOS row (label 0) starts the decode
    decoder_hidden = decoder.initHidden(batch_size).cuda()

    loss = 0.0
    # Greedy decoding: the argmax of each step becomes the next input.
    for step in range(1, targets.shape[0]):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, targets[step])  # one char per step
        _, top_index = decoder_output.data.topk(1)
        decoder_input = top_index.squeeze()

    encoder.zero_grad()
    decoder.zero_grad()
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss
def trainBatch(encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, teach_forcing_prob=1):
    """One training step with optional teacher forcing.

    Target labels are encoded and aligned up front (post-processing) so
    the whole batch trains at once.  When ``random.random()`` exceeds
    ``teach_forcing_prob`` the ground-truth character is fed as the next
    decoder input (teacher forcing); otherwise the decoder's own argmax
    prediction is fed back.  Returns the summed per-character loss.
    """
    cpu_images, cpu_texts = train_iter.next()
    batch_size = cpu_images.size(0)
    targets = converter.encode(cpu_texts)
    utils.loadData(image, cpu_images)
    encoder_outputs = encoder(image)  # CNN + BiLSTM feature extraction
    targets = targets.cuda()
    decoder_input = targets[0].cuda()  # SOS row (label 0) starts the decode
    decoder_hidden = decoder.initHidden(batch_size).cuda()

    loss = 0.0
    # NOTE(review): with the default teach_forcing_prob=1 this is always
    # False, i.e. teacher forcing is never used -- confirm the intent.
    use_teacher_forcing = random.random() > teach_forcing_prob
    if use_teacher_forcing:
        # Teacher forcing: feed the target label as the next input.
        for step in range(1, targets.shape[0]):  # max target length
            decoder_output, decoder_hidden, _ = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, targets[step])  # one char per step
            decoder_input = targets[step]
    else:
        # Free running: feed the previous argmax prediction back in.
        for step in range(1, targets.shape[0]):
            decoder_output, decoder_hidden, _ = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, targets[step])  # one char per step
            _, top_index = decoder_output.data.topk(1)
            decoder_input = top_index.squeeze()

    encoder.zero_grad()
    decoder.zero_grad()
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss
def trainBatch(encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, teach_forcing_prob=1):
    """Single batch of attention-decoder training.

    target_label: post-processing is used to encode and align labels so
    a whole batch trains together.  Depending on a random draw against
    ``teach_forcing_prob``, either the ground truth or the model's own
    prediction is used as the next decoder input.
    """
    batch = train_iter.next()
    cpu_images, cpu_texts = batch
    num_images = cpu_images.size(0)
    label_tensor = converter.encode(cpu_texts)
    utils.loadData(image, cpu_images)
    # CNN + BiLSTM for feature extraction.
    feature_seq = encoder(image)
    label_tensor = label_tensor.cuda()
    # Initialise the decoder: row 0 of the labels is the start symbol.
    step_input = label_tensor[0].cuda()
    step_hidden = decoder.initHidden(num_images).cuda()

    loss = 0.0
    force = random.random() > teach_forcing_prob
    for pos in range(1, label_tensor.shape[0]):  # maximum string length
        step_output, step_hidden, _ = decoder(step_input, step_hidden, feature_seq)
        loss += criterion(step_output, label_tensor[pos])  # predict one character at a time
        if force:
            # Teacher forcing: ground truth becomes the next input.
            step_input = label_tensor[pos]
        else:
            # Otherwise feed the argmax of this step back in.
            _, best = step_output.data.topk(1)
            step_input = best.squeeze()

    encoder.zero_grad()
    decoder.zero_grad()
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss
def val(net, dataset, criterion, max_iter=100):
    """Evaluate the CTC model on up to ``max_iter`` batches of ``dataset``.

    Prints up to ``opt.n_test_disp`` raw/decoded predictions from the last
    batch, then the average loss and character-sequence accuracy.

    Fixes vs. the original:
    - removed the dead ``i += 1`` (the loop variable was reassigned by
      ``for`` each iteration and never read afterwards);
    - accuracy denominator now counts the samples actually evaluated
      instead of ``max_iter * opt.batchSize``, which over-counted when
      the final batch was smaller than ``opt.batchSize``.
    """
    print('Start val')
    # NOTE(review): gradients are frozen on the module-level `crnn` and
    # inference below also calls `crnn`; the `net` argument is only used
    # for .eval().  Callers appear to pass the same object for both --
    # confirm before relying on `net` being independent.
    for p in crnn.parameters():
        p.requires_grad = False
    net.eval()

    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=True, batch_size=opt.batchSize,
        num_workers=int(opt.workers))
    val_iter = iter(data_loader)

    n_correct = 0
    n_samples = 0  # images actually evaluated (last batch may be short)
    loss_avg = utils.averager()
    max_iter = min(max_iter, len(data_loader))

    for i in range(max_iter):
        data = val_iter.next()
        cpu_images, cpu_texts = data
        batch_size = cpu_images.size(0)
        n_samples += batch_size
        utils.loadData(image, cpu_images)
        t, l = converter.encode(cpu_texts)
        utils.loadData(text, t)
        utils.loadData(length, l)

        preds = crnn(image)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
        cost = criterion(preds, text, preds_size, length) / batch_size
        loss_avg.add(cost)

        _, preds = preds.max(2)
        # assumes an older PyTorch where max() keeps the reduced dim -- TODO confirm
        preds = preds.squeeze(2)
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        for pred, target in zip(sim_preds, cpu_texts):
            if pred == target.lower():
                n_correct += 1

    # Show a few raw/decoded predictions from the last batch.
    raw_preds = converter.decode(preds.data, preds_size.data, raw=True)[:opt.n_test_disp]
    for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts):
        print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))

    accuracy = n_correct / float(n_samples)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
def trainBatch(net, criterion, optimizer):
    """Single CTC training step.

    Loads the next batch from the module-level ``train_iter`` into the
    shared ``image``/``text``/``length`` buffers, runs the module-level
    ``crnn``, and returns the per-sample-normalised CTC cost.
    """
    cpu_images, cpu_texts = train_iter.next()
    batch_size = cpu_images.size(0)
    utils.loadData(image, cpu_images)
    encoded, lengths = converter.encode(cpu_texts)
    utils.loadData(text, encoded)
    utils.loadData(length, lengths)

    preds = crnn(image)
    preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
    cost = criterion(preds, text, preds_size, length) / batch_size

    crnn.zero_grad()
    cost.backward()
    optimizer.step()
    return cost
def trainBatch(encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, teach_forcing_prob=1, mode='1D'):
    '''
    One training step for the attention encoder/decoder.

    target_label: encoded and aligned as a post-processing step so that a
    whole batch can be trained at once.

    mode='1D' initialises the decoder from row 0 of the targets (SOS);
    any other mode builds a BOS one-hot per sample (2D / Mask-TextSpotter
    style) and a zero hidden state on ``gpu_device``.
    Returns the summed per-character loss for the batch.
    '''
    data = train_iter.next()
    cpu_images, cpu_texts = data
    b = cpu_images.size(0)  # batch size
    target_variable = converter.encode(cpu_texts)  # max_length x batch_size
    target_variable = target_variable.cuda()
    utils.loadData(image, cpu_images)
    # CNN + BiLSTM feature extraction.  Per the original author's note:
    # shape 70 (width 280, 4x downsampled) x batch_size x 256 (hidden units)
    # -- TODO confirm against the encoder definition.
    encoder_outputs = encoder(image)
    if mode == '1D':
        # Row 0 of the labels is the SOS token for every sample; shape: batch.
        decoder_input = target_variable[0].cuda()
        # Shape: (1, batch_size, hidden_size) per the original comment.
        decoder_hidden = decoder.initHidden(b).cuda()
    else:
        # BOS one-hot per sample, shape [B, 1], filled with BOS_TOKEN (0).
        bos_onehot = np.zeros((encoder_outputs.size(1), 1), dtype=np.int32)
        bos_onehot[:, 0] = cfg.SEQUENCE.BOS_TOKEN
        decoder_input = torch.tensor(bos_onehot.tolist(), device=gpu_device)
        # Zero initial hidden state, shape [B, 256].
        decoder_hidden = torch.zeros((encoder_outputs.size(1), 256), device=gpu_device)
    # NOTE: the original file kept a long commented-out reference
    # implementation here (a Mask-TextSpotter-style seq_decoder loop with
    # teacher-forcing and greedy variants, loss summed over target_length
    # and scaled by 0.2); condensed to this note for readability.
    loss = 0.0
    teach_forcing = True if random.random() > teach_forcing_prob else False
    if teach_forcing:
        # Teacher forcing: feed the target label as the next input.
        for di in range(1, target_variable.shape[0]):  # maximum string length
            # Per the original comments:
            #   decoder_output:    (batch_size, output_size)
            #   decoder_hidden:    (1, batch_size, hidden_size)
            #   decoder_attention: (batch_size, 1, 70) attention weights
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])  # one character per step
            decoder_input = target_variable[di]  # teacher forcing
    else:
        # Free running: feed the previous argmax prediction back in.
        for di in range(1, target_variable.shape[0]):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])  # one character per step
            topv, topi = decoder_output.data.topk(1)
            ni = topi.squeeze()
            decoder_input = ni
    encoder.zero_grad()
    decoder.zero_grad()
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss
def val(cfg, encoder, decoder, criterion, batchsize, dataset, teach_forcing=False, max_iter=100, use_beam_search=False, mode='1D'):
    """Validate the attention encoder/decoder with greedy or beam-search decoding.

    For each batch (the accuracy accounting assumes batch size 1: it adds
    ``len(cpu_texts[0]) + 1`` per batch, counting the EOS stop symbol),
    decode until EOS and compare against the target labels.  With
    ``use_beam_search`` the module-level ``beam_search_sevice`` is used;
    ``char_scores`` collects per-character confidences and
    ``detailed_char_scores`` the top-k confidences (beam search only).
    Prints running predictions and a final loss/accuracy summary.
    """
    print('Start val')
    # Freeze both models for evaluation.
    for e, d in zip(encoder.parameters(), decoder.parameters()):
        e.requires_grad = False
        d.requires_grad = False
    encoder.eval()
    decoder.eval()
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=batchsize,
        num_workers=int(opt.workers))
    val_iter = iter(data_loader)
    n_correct = 0
    n_total = 0
    loss_avg = utils.averager()
    max_iter = min(max_iter, len(data_loader))
    for i in range(max_iter):
        data = val_iter.next()
        i += 1  # shifts i so the `i % 2` print below fires on odd loop indices
        cpu_images, cpu_texts = data
        b = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        target_variable = converter.encode(cpu_texts)
        # Assumes batch_size 1: length of the single text plus its EOS stop bit.
        n_total += len(cpu_texts[0]) + 1
        decoded_words = []
        decoded_label = []
        target_variable = target_variable.cuda()
        loss = 0.0
        # CNN + BiLSTM feature extraction; per the original author's note the
        # shape is 70 (width 280, 4x downsampled) x batch_size x 256 hidden.
        encoder_outputs = encoder(image)
        if mode == '1D':
            # Row 0 of the labels is the SOS token; shape: batch.
            decoder_input = target_variable[0].cuda()
            # Shape: (1, batch_size, hidden_size) per the original comment.
            decoder_hidden = decoder.initHidden(b).cuda()
        else:
            # BOS one-hot per sample, shape [B, 1], filled with BOS_TOKEN (0).
            bos_onehot = np.zeros((encoder_outputs.size(1), 1), dtype=np.int32)
            bos_onehot[:, 0] = cfg.SEQUENCE.BOS_TOKEN
            decoder_input = torch.tensor(bos_onehot.tolist(), device=gpu_device)
            # Zero initial hidden state, shape [B, 256].
            decoder_hidden = torch.zeros((encoder_outputs.size(1), 256), device=gpu_device)
        char_scores = []
        detailed_char_scores = []
        if not use_beam_search:
            # Greedy decoding: feed the previous output back in until EOS.
            for di in range(1, target_variable.shape[0]):  # maximum string length
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                loss += criterion(decoder_output, target_variable[di])  # one character per step
                # NOTE(review): adds the *cumulative* loss every step, so early
                # steps are counted repeatedly in the average -- confirm intent.
                loss_avg.add(loss)
                topv, topi = decoder_output.data.topk(1)
                # Softmax confidence of the top-1 predicted character.
                char_scores.append(topv.item())
                ni = topi.squeeze(1)
                decoder_input = ni
                if ni == EOS_TOKEN:
                    decoded_words.append('<EOS>')
                    decoded_label.append(EOS_TOKEN)
                    break
                else:
                    decoded_words.append(converter.decode(ni))
                    decoded_label.append(ni)
        else:
            # Beam search: each element of top_seq is
            # (char_index, confidence, detailed top-k confidences).
            top_seqs = beam_search_sevice.beam_search(cfg, mode, decoder, encoder_outputs, decoder_hidden, beam_size=6, max_len=cfg.SEQUENCE.MAX_LENGTH)
            top_seq = top_seqs[0]
            for character in top_seq[1:]:
                character_index = character[0]
                if character_index == EOS_TOKEN:
                    char_scores.append(character[1])
                    detailed_char_scores.append(character[2])
                    # NOTE: unlike the greedy branch, '<EOS>' is not appended
                    # to decoded_words here (the original kept it commented out).
                    decoded_label.append(EOS_TOKEN)
                    break
                else:
                    if character_index == 0:
                        # Index 0 is rendered as a placeholder with zero score.
                        decoded_words.append('~')
                        char_scores.append(0.)
                        decoded_label.append(0)
                    else:
                        decoded_words.append(converter.decode(character_index))
                        decoded_label.append(character_index)
                        char_scores.append(character[1])
                        detailed_char_scores.append(character[2])
        # Count correctly predicted characters (EOS included).
        for pred, target in zip(decoded_label, target_variable[1:,:]):
            if pred == target:
                n_correct += 1
        if i % 2 == 0:  # print every other batch
            texts = cpu_texts[0]
            print('pred:%-20s, gt: %-20s' % (decoded_words, texts))
    accuracy = n_correct / float(n_total)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
def val(encoder, decoder, criterion, batchsize, dataset, teach_forcing=False, max_iter=100):
    """Greedy-decoding validation of the attention encoder/decoder.

    Decodes each batch character by character until EOS, counts correct
    characters (EOS included — the accounting assumes batch size 1), and
    prints a running sample every 100 batches plus a final summary.
    """
    print('Start val')
    # Freeze both models for evaluation.
    for enc_param, dec_param in zip(encoder.parameters(), decoder.parameters()):
        enc_param.requires_grad = False
        dec_param.requires_grad = False
    encoder.eval()
    decoder.eval()

    loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=batchsize,
        num_workers=int(opt.workers))
    val_iter = iter(loader)

    n_correct = 0
    n_total = 0
    loss_avg = utils.averager()
    max_iter = min(max_iter, len(loader))

    for idx in range(max_iter):
        cpu_images, cpu_texts = val_iter.next()
        step = idx + 1  # 1-based counter driving the periodic print below
        batch_size = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        target_variable = converter.encode(cpu_texts)
        n_total += len(cpu_texts[0]) + 1  # the EOS stop bit must be predicted too

        decoded_words = []
        decoded_label = []
        decoder_attentions = torch.zeros(len(cpu_texts[0]) + 1, opt.max_width)
        encoder_outputs = encoder(image)  # CNN + BiLSTM feature extraction
        target_variable = target_variable.cuda()
        decoder_input = target_variable[0].cuda()  # SOS row starts the decode
        decoder_hidden = decoder.initHidden(batch_size).cuda()
        loss = 0.0

        if not teach_forcing:
            # Free-running decode: previous output becomes the next input,
            # stopping as soon as EOS_TOKEN is produced.
            for di in range(1, target_variable.shape[0]):  # maximum string length
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                loss += criterion(decoder_output, target_variable[di])
                loss_avg.add(loss)
                decoder_attentions[di - 1] = decoder_attention.data
                _, top_index = decoder_output.data.topk(1)
                ni = top_index.squeeze(1)
                decoder_input = ni
                if ni == EOS_TOKEN:
                    decoded_words.append('<EOS>')
                    decoded_label.append(EOS_TOKEN)
                    break
                decoded_words.append(converter.decode(ni))
                decoded_label.append(ni)

        # Count correctly predicted characters (EOS included).
        for pred, target in zip(decoded_label, target_variable[1:, :]):
            if pred == target:
                n_correct += 1

        if step % 100 == 0:  # show one sample every 100 batches
            print('pred:%-20s, gt: %-20s' % (decoded_words, cpu_texts[0]))

    accuracy = n_correct / float(n_total)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
def val(encoder, decoder, criterion, batchsize, dataset, teach_forcing=False, max_iter=100):
    """Validate by free-running greedy decode until the EOS stop bit.

    Character-level accuracy is accumulated against the target sequence
    (the per-batch total assumes batch size 1); a sample prediction is
    printed every 100 batches and a loss/accuracy summary at the end.
    """
    print('Start val')
    for e_param, d_param in zip(encoder.parameters(), decoder.parameters()):
        e_param.requires_grad = False
        d_param.requires_grad = False
    encoder.eval()
    decoder.eval()

    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=batchsize,
        num_workers=int(opt.workers))
    batches = iter(data_loader)

    n_correct = 0
    n_total = 0
    loss_avg = utils.averager()
    max_iter = min(max_iter, len(data_loader))

    for loop_idx in range(max_iter):
        cpu_images, cpu_texts = batches.next()
        shown_idx = loop_idx + 1  # matches the original's post-increment of i
        num_images = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        target_variable = converter.encode(cpu_texts)
        # Also accurately predict the EOS stop bit.
        n_total += len(cpu_texts[0]) + 1

        decoded_words = []
        decoded_label = []
        decoder_attentions = torch.zeros(len(cpu_texts[0]) + 1, opt.max_width)
        # CNN + BiLSTM for feature extraction.
        encoder_outputs = encoder(image)
        target_variable = target_variable.cuda()
        # Initialize the beginning of the decoder, start output from 0.
        decoder_input = target_variable[0].cuda()
        decoder_hidden = decoder.initHidden(num_images).cuda()
        loss = 0.0

        if not teach_forcing:
            # Non-forced decoding: the previous output is the next input,
            # until the label is EOS_TOKEN.
            max_len = target_variable.shape[0]  # maximum string length
            di = 1
            while di < max_len:
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                loss += criterion(decoder_output, target_variable[di])  # predict one character at a time
                loss_avg.add(loss)
                decoder_attentions[di - 1] = decoder_attention.data
                _, best = decoder_output.data.topk(1)
                ni = best.squeeze(1)
                decoder_input = ni
                if ni == EOS_TOKEN:
                    decoded_words.append('<EOS>')
                    decoded_label.append(EOS_TOKEN)
                    break
                decoded_words.append(converter.decode(ni))
                decoded_label.append(ni)
                di += 1

        # Calculate the correct number.
        for pred, target in zip(decoded_label, target_variable[1:,:]):
            if pred == target:
                n_correct += 1

        if shown_idx % 100 == 0:  # output once every 100 times
            print('pred:%-20s, gt: %-20s' % (decoded_words, cpu_texts[0]))

    accuracy = n_correct / float(n_total)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
def predict(encoder, decoder, criterion, batchsize, dataset, workers=2):
    """Run batched greedy decoding over the whole dataset.

    Each sample is decoded until its EOS token (tracked per sample via a
    still-active flag), every prediction is printed next to its ground
    truth, and a final loss/accuracy summary is printed.
    """
    for enc_param, dec_param in zip(encoder.parameters(), decoder.parameters()):
        enc_param.requires_grad = False
        dec_param.requires_grad = False
    encoder.eval()
    decoder.eval()

    loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=batchsize, num_workers=workers)
    batches = iter(loader)

    n_correct = 0   # correctly predicted characters (EOS included)
    n_total = 0     # total characters (EOS included)
    n_current = 0   # running sample counter used in the printout
    loss_avg = utils.averager()
    EOS_TOKEN = 1   # end-of-sequence label

    for _ in range(len(loader)):
        cpu_images, cpu_texts = batches.next()
        num_images = cpu_images.size(0)
        # Fresh GPU buffer each batch; utils.loadData copies the images in.
        image = torch.FloatTensor(batchsize, 3, 1, 1).cuda()
        utils.loadData(image, cpu_images)
        target_variable = converter(alphabet).encode(cpu_texts).cuda()

        encoder_outputs = encoder(image)  # CNN + BiLSTM feature extraction
        decoder_input = target_variable[0].cuda()  # SOS row starts the decode
        decoder_hidden = decoder.initHidden(num_images).cuda()
        loss = 0.0

        decoded_words = [[] for _ in range(batchsize)]
        decoded_labels = [[] for _ in range(batchsize)]
        still_active = [True] * batchsize

        for di in range(1, target_variable.shape[0]):  # maximum string length
            decoder_output, decoder_hidden, _ = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])  # one char per step
            _, top_index = decoder_output.data.topk(1)
            ni = top_index.squeeze()
            decoder_input = ni
            for count in range(batchsize):
                if not still_active[count]:
                    continue  # this sample already emitted EOS
                if ni[count] == EOS_TOKEN:
                    decoded_words[count].append('<EOS>')
                    decoded_labels[count].append(EOS_TOKEN)
                    still_active[count] = False
                else:
                    decoded_words[count].append(converter(alphabet).decode(ni[count]))
                    decoded_labels[count].append(ni[count])

        loss_avg.add(loss)
        for count in range(batchsize):
            n_total += len(cpu_texts[count]) + 1  # EOS included
            for pred, target in zip(decoded_labels[count], target_variable[1:, count]):
                if pred == target:
                    n_correct += 1
            print('%d Pred:%-20s, GT: %-20s'
                  % (n_current, decoded_words[count], cpu_texts[count]))
            n_current += 1

    accuracy = n_correct / float(n_total)
    print('Loss: %f, Accuracy: %f' % (loss_avg.val(), accuracy))