def save(no, dataset_name, output, image, character_map, affinity_map, character_weight, affinity_weight):

    """Dump intermediate predictions, targets and weight maps for one batch to Temporary/<no>/<i>/."""

    os.makedirs('Temporary/' + str(no), exist_ok=True)

    for i in range(len(dataset_name)):

        base = 'Temporary/' + str(no) + '/' + str(i)
        os.makedirs(base, exist_ok=True)

        generated = generate_word_bbox(
            output[i, 0].data.cpu().numpy(),
            output[i, 1].data.cpu().numpy(),
            config.threshold_character,
            config.threshold_affinity,
            config.threshold_word,
            config.threshold_character_upper,
            config.threshold_affinity_upper,
            config.scale_character,
            config.scale_affinity)

        # Word boxes drawn on the de-normalized input image
        output_image = denormalize_mean_variance(image[i].data.cpu().numpy().transpose(1, 2, 0))
        cv2.drawContours(output_image, generated['word_bbox'], -1, (0, 255, 0), 2)
        plt.imsave(base + '/image_.png', output_image)

        # Raw predicted heatmaps (channel 0: character, channel 1: affinity)
        cv2.imwrite(base + '/char_map.png', np.uint8(output[i, 0].data.cpu().numpy() * 255))
        cv2.imwrite(base + '/aff_map.png', np.uint8(output[i, 1].data.cpu().numpy() * 255))

        # Heatmaps binarized at the upper and lower thresholds
        cv2.imwrite(
            base + '/char_map_threshold_upper.png',
            np.uint8(np.float32(output[i, 0].data.cpu().numpy() > config.threshold_character_upper) * 255))
        cv2.imwrite(
            base + '/aff_map_threshold_upper.png',
            np.uint8(np.float32(output[i, 1].data.cpu().numpy() > config.threshold_affinity_upper) * 255))
        cv2.imwrite(
            base + '/char_map_threshold_lower.png',
            np.uint8(np.float32(output[i, 0].data.cpu().numpy() > config.threshold_character) * 255))
        cv2.imwrite(
            base + '/aff_map_threshold_lower.png',
            np.uint8(np.float32(output[i, 1].data.cpu().numpy() > config.threshold_affinity) * 255))

        # Training targets and their pixel-weight maps
        cv2.imwrite(base + '/target_char_map.png', np.uint8(character_map[i].data.cpu().numpy() * 255))
        cv2.imwrite(base + '/target_affinity_map.png', np.uint8(affinity_map[i].data.cpu().numpy() * 255))
        cv2.imwrite(base + '/weight_char_map.png', np.uint8(character_weight[i].data.cpu().numpy() * 255))
        cv2.imwrite(base + '/weight_affinity_map.png', np.uint8(affinity_weight[i].data.cpu().numpy() * 255))
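# The functions in this file are shown without their module header. The
# third-party imports below are inferred from the calls they make; the
# repo-local names (config, generate_word_bbox, denormalize_mean_variance,
# generate_target_others, get_weighted_character_target, calculate_fscore,
# calculate_batch_fscore, DataLoaderMIX, DataLoaderEvalICDAR2013,
# DataParallelCriterion, Criterian) come from project modules whose import
# paths are not visible here, so they are omitted rather than guessed.
import json
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm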
def synthesize(dataloader, model, base_path_affinity, base_path_character, base_path_bbox):

    """
    Given a dataloader over a set of images and a pre-trained model, generate the character heatmap and affinity
    heatmap
    :param dataloader: A PyTorch dataloader for loading and resizing the images of the folder
    :param model: A pre-trained model
    :param base_path_affinity: Path where to store the predicted affinity heatmap
    :param base_path_character: Path where to store the predicted character heatmap
    :param base_path_bbox: Path where to store the word_bbox overlapped on images
    :return: None
    """

    with torch.no_grad():

        model.eval()
        iterator = tqdm(dataloader)

        for no, (image, image_name, original_dim) in enumerate(iterator):

            if config.use_cuda:
                image = image.cuda()

            output = model(image)

            if type(output) == list:
                # If using custom DataParallelModel this is necessary to convert the list to tensor
                output = torch.cat(output, dim=0)

            output = output.data.cpu().numpy()
            original_dim = original_dim.cpu().numpy()

            for i in range(output.shape[0]):

                # --------- Resizing it back to the original image size and saving it ----------- #

                image_i = (image[i].data.cpu().numpy() * 255).astype(np.uint8).transpose(1, 2, 0)

                max_dim = original_dim[i].max()
                resizing_factor = 768 / max_dim
                before_pad_dim = [int(original_dim[i][0] * resizing_factor), int(original_dim[i][1] * resizing_factor)]

                output[i, :, :, :] = np.uint8(output[i, :, :, :] * 255)

                height_pad = (768 - before_pad_dim[0]) // 2
                width_pad = (768 - before_pad_dim[1]) // 2

                image_i = cv2.resize(
                    image_i[height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])
                )

                character_bbox = cv2.resize(
                    output[i, 0, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])
                ) / 255

                affinity_bbox = cv2.resize(
                    output[i, 1, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])
                ) / 255

                predicted_bbox = generate_word_bbox(
                    character_bbox,
                    affinity_bbox,
                    character_threshold=config.threshold_character,
                    affinity_threshold=config.threshold_affinity)['word_bbox']

                predicted_bbox = [np.array(predicted_bbox_i) for predicted_bbox_i in predicted_bbox]

                cv2.drawContours(image_i, predicted_bbox, -1, (0, 255, 0), 2)

                plt.imsave(
                    base_path_bbox + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    image_i)

                plt.imsave(
                    base_path_character + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    np.float32(character_bbox > config.threshold_character),
                    cmap='gray')

                plt.imsave(
                    base_path_affinity + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    np.float32(affinity_bbox > config.threshold_affinity),
                    cmap='gray')
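# The unpad arithmetic in synthesize() recurs throughout this file: inputs are
# resized so the longer side is 768 px, then symmetrically padded to 768x768,
# so recovering the original geometry means slicing the pad off and resizing
# back. A worked example (the 720x1280 input is illustrative, not from the repo):
def _example_unpad_arithmetic():
    original_dim = (720, 1280)                                 # (height, width)
    resizing_factor = 768 / max(original_dim)                  # 768 / 1280 = 0.6
    before_pad_dim = [int(original_dim[0] * resizing_factor),  # 432
                      int(original_dim[1] * resizing_factor)]  # 768
    height_pad = (768 - before_pad_dim[0]) // 2                # (768 - 432) // 2 = 168
    width_pad = (768 - before_pad_dim[1]) // 2                 # 0
    # The 768x768 network output is sliced [168:600, 0:768] and resized back
    # with cv2.resize(..., (1280, 720)); note cv2.resize takes (width, height).
    return height_pad, width_pad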
def synthesize_with_score(dataloader, model, base_target_path):

    """
    Given a dataloader over a set of images (ICDAR 2013 dataset) and a pre-trained model, generate the character
    heatmap and affinity heatmap and a json of all the annotations
    :param dataloader: dataloader for the ICDAR 2013 dataset
    :param model: pre-trained model
    :param base_target_path: path where to store the predictions
    :return: None
    """

    with torch.no_grad():

        model.eval()
        iterator = tqdm(dataloader)

        for no, (image, image_name, original_dim, item) in enumerate(iterator):

            annots = []

            for i in item:
                annot = dataloader.dataset.gt['annots'][dataloader.dataset.imnames[i]]
                annots.append(annot)

            if config.use_cuda:
                image = image.cuda()

            output = model(image)

            if type(output) == list:
                output = torch.cat(output, dim=0)

            output = output.data.cpu().numpy()
            original_dim = original_dim.cpu().numpy()

            for i in range(output.shape[0]):

                # --------- Resizing it back to the original image size and saving it ----------- #

                image_i = (image[i].data.cpu().numpy() * 255).astype(np.uint8).transpose(1, 2, 0)

                max_dim = original_dim[i].max()
                resizing_factor = 768 / max_dim
                before_pad_dim = [int(original_dim[i][0] * resizing_factor), int(original_dim[i][1] * resizing_factor)]

                plt.imsave(
                    base_target_path + '_affinity/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    np.float32(output[i, 1, :, :] > config.threshold_affinity),
                    cmap='gray')

                plt.imsave(
                    base_target_path + '_character/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    np.float32(output[i, 0, :, :] > config.threshold_character),
                    cmap='gray')

                output[i, :, :, :] = np.uint8(output[i, :, :, :] * 255)

                height_pad = (768 - before_pad_dim[0]) // 2
                width_pad = (768 - before_pad_dim[1]) // 2

                image_i = cv2.resize(
                    image_i[height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])
                )

                character_bbox = cv2.resize(
                    output[i, 0, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])) / 255

                affinity_bbox = cv2.resize(
                    output[i, 1, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])) / 255

                generated_targets = generate_word_bbox(
                    character_bbox, affinity_bbox,
                    character_threshold=config.threshold_character,
                    affinity_threshold=config.threshold_affinity)

                if 'error_message' in generated_targets.keys():
                    print('There was an error while generating the target of', image_name[i])
                    print('Error:', generated_targets['error_message'])
                    continue

                generated_targets = get_weighted_character_target(
                    generated_targets,
                    {'bbox': annots[i]['bbox'], 'text': annots[i]['text']},
                    dataloader.dataset.unknown,
                    config.threshold_fscore)

                cv2.drawContours(
                    image_i,
                    [np.array(word_bbox) for word_bbox in generated_targets['word_bbox']],
                    -1, (0, 255, 0), 2)

                plt.imsave(
                    base_target_path + '_word_bbox/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    image_i)

                with open(base_target_path + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.json', 'w') as f:
                    json.dump(generated_targets, f)
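# synthesize_with_score() reads ground truth from dataloader.dataset.gt. From
# the lookups above, its expected shape is roughly the dict below; the field
# values are illustrative, not taken from any dataset.
_EXAMPLE_GT = {
    'annots': {
        'img_1.jpg': {
            'bbox': [[[10, 10], [90, 10], [90, 40], [10, 40]]],  # one 4-point box per word
            'text': ['SAMPLE'],                                  # one transcription per word
        },
    },
    'unknown': '###',  # marker for unreadable words (cf. dataloader.dataset.unknown)
}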
def synthesize_with_score(dataloader, model, base_target_path):

    """
    Given a dataloader over a set of images (ICDAR 2013 dataset) and a pre-trained model, generate the character
    heatmap and affinity heatmap and a json of all the annotations
    :param dataloader: dataloader for the ICDAR 2013 dataset
    :param model: pre-trained model
    :param base_target_path: path where to store the predictions
    :return: None
    """

    with torch.no_grad():

        model.eval()
        iterator = tqdm(dataloader)

        for no, (image, image_name, original_dim, item) in enumerate(iterator):

            annots = []

            for i in item:
                annot = dataloader.dataset.gt['annots'][dataloader.dataset.imnames[i]]
                annots.append(annot)

            if config.use_cuda:
                image = image.cuda()

            output = model(image)

            if type(output) == list:
                output = torch.cat(output, dim=0)

            output = output.data.cpu().numpy()
            original_dim = original_dim.cpu().numpy()

            for i in range(output.shape[0]):

                # --------- Resizing it back to the original image size and saving it ----------- #

                max_dim = original_dim[i].max()
                resizing_factor = 768 / max_dim
                before_pad_dim = [int(original_dim[i][0] * resizing_factor), int(original_dim[i][1] * resizing_factor)]

                output[i, :, :, :] = np.uint8(output[i, :, :, :] * 255)

                height_pad = (768 - before_pad_dim[0]) // 2
                width_pad = (768 - before_pad_dim[1]) // 2

                character_bbox = cv2.resize(
                    output[i, 0, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])) / 255

                affinity_bbox = cv2.resize(
                    output[i, 1, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])) / 255

                image_i = (image[i].data.cpu().numpy() * 255).astype(np.uint8).transpose(1, 2, 0)

                image_i = cv2.resize(
                    image_i[height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0]))

                image_i_backup = image_i.copy()

                # Generating word-bbox given character and affinity heatmap
                generated_targets = generate_word_bbox(
                    character_bbox, affinity_bbox,
                    character_threshold=config.threshold_character,
                    affinity_threshold=config.threshold_affinity)

                if 'error_message' in generated_targets.keys():
                    print('There was an error while generating the target of', image_name[i])
                    print('Error:', generated_targets['error_message'])
                    continue

                if config.visualize_generated:

                    # Saving affinity heat map
                    plt.imsave(
                        base_target_path + '_predicted/affinity/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                        np.float32(affinity_bbox > config.threshold_affinity),
                        cmap='gray')

                    # Saving character heat map
                    plt.imsave(
                        base_target_path + '_predicted/character/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                        np.float32(character_bbox > config.threshold_character),
                        cmap='gray')

                    cv2.drawContours(image_i, generated_targets['word_bbox'], -1, (0, 255, 0), 2)

                    # Saving word bbox drawn on the original image
                    plt.imsave(
                        base_target_path + '_predicted/word_bbox/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                        image_i)

                # --------------- PostProcessing for creating the targets for the next iteration ---------------- #

                generated_targets = get_weighted_character_target(
                    generated_targets,
                    {'bbox': annots[i]['bbox'], 'text': annots[i]['text']},
                    dataloader.dataset.unknown,
                    config.threshold_fscore)

                if config.visualize_generated:

                    # Restore the clean image (the backup taken before drawing the predicted boxes)
                    image_i = image_i_backup.copy()

                    # Generated word_bbox after postprocessing
                    cv2.drawContours(image_i, generated_targets['word_bbox'], -1, (0, 255, 0), 2)

                    # Saving word bbox after postprocessing
                    plt.imsave(
                        base_target_path + '_next_target/word_bbox/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                        image_i)

                    # Generate affinity heatmap after postprocessing
                    affinity_target, affinity_weight_map = generate_target_others(
                        (image_i.shape[0], image_i.shape[1]),
                        generated_targets['affinity'].copy(),
                        generated_targets['weights'].copy())

                    # Generate character heatmap after postprocessing
                    character_target, characters_weight_map = generate_target_others(
                        (image_i.shape[0], image_i.shape[1]),
                        generated_targets['characters'].copy(),
                        generated_targets['weights'].copy())

                    # Saving the affinity heatmap
                    plt.imsave(
                        base_target_path + '_next_target/affinity/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                        affinity_target,
                        cmap='gray')

                    # Saving the character heatmap
                    plt.imsave(
                        base_target_path + '_next_target/character/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                        character_target,
                        cmap='gray')

                    # Saving the affinity weight map
                    plt.imsave(
                        base_target_path + '_next_target/affinity_weight/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                        affinity_weight_map,
                        cmap='gray')

                    # Saving the character weight map
                    plt.imsave(
                        base_target_path + '_next_target/character_weight/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                        characters_weight_map,
                        cmap='gray')

                # Saving the target for next iteration in json format
                generated_targets['word_bbox'] = generated_targets['word_bbox'].tolist()
                generated_targets['characters'] = [word_i.tolist() for word_i in generated_targets['characters']]
                generated_targets['affinity'] = [word_i.tolist() for word_i in generated_targets['affinity']]

                with open(base_target_path + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.json', 'w') as f:
                    json.dump(generated_targets, f)
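# synthesize_with_score() writes into several sibling directories derived from
# base_target_path, so a driver has to create them first. A minimal setup
# sketch covering the paths used by both variants above; the helper itself is
# an assumption, the directory names are taken from the plt.imsave calls:
def _make_target_dirs(base_target_path):
    for sub in ('', '_affinity', '_character', '_word_bbox',
                '_predicted/affinity', '_predicted/character', '_predicted/word_bbox',
                '_next_target/affinity', '_next_target/character',
                '_next_target/affinity_weight', '_next_target/character_weight',
                '_next_target/word_bbox'):
        os.makedirs(base_target_path + sub, exist_ok=True)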
def synthesize(
        dataloader, model, base_path_affinity, base_path_character, base_path_bbox,
        base_path_char, base_path_aff, base_path_json):

    """
    Given a dataloader over a set of images and a pre-trained model, generate the character heatmap and affinity
    heatmap
    :param dataloader: A PyTorch dataloader for loading and resizing the images of the folder
    :param model: A pre-trained model
    :param base_path_affinity: Path where to store the predicted affinity heatmap
    :param base_path_character: Path where to store the predicted character heatmap
    :param base_path_bbox: Path where to store the word_bbox overlapped on images
    :param base_path_aff: Path where to store the predicted affinity bbox
    :param base_path_char: Path where to store the predicted character bbox
    :param base_path_json: Path where to store the predicted bbox in json format
    :return: None
    """

    with torch.no_grad():

        model.eval()
        iterator = tqdm(dataloader)

        for no, (image, image_name, original_dim) in enumerate(iterator):

            if config.use_cuda:
                image = image.cuda()

            output = model(image)

            if type(output) == list:
                # If using custom DataParallelModel this is necessary to convert the list to tensor
                output = torch.cat(output, dim=0)

            output = output.data.cpu().numpy()

            # Clip the heatmaps to the valid [0, 1] range before scaling to uint8
            output[output < 0] = 0
            output[output > 1] = 1

            original_dim = original_dim.cpu().numpy()

            for i in range(output.shape[0]):

                # --------- Resizing it back to the original image size and saving it ----------- #

                image_i = denormalize_mean_variance(image[i].data.cpu().numpy().transpose(1, 2, 0))

                max_dim = original_dim[i].max()
                resizing_factor = 768 / max_dim
                before_pad_dim = [int(original_dim[i][0] * resizing_factor), int(original_dim[i][1] * resizing_factor)]

                output[i, :, :, :] = np.uint8(output[i, :, :, :] * 255)

                height_pad = (768 - before_pad_dim[0]) // 2
                width_pad = (768 - before_pad_dim[1]) // 2

                image_i = cv2.resize(
                    image_i[height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0]))

                character_bbox = cv2.resize(
                    output[i, 0, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])) / 255

                affinity_bbox = cv2.resize(
                    output[i, 1, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])) / 255

                predicted_bbox = generate_word_bbox(
                    character_bbox, affinity_bbox,
                    character_threshold=config.threshold_character,
                    affinity_threshold=config.threshold_affinity,
                    word_threshold=config.threshold_word,
                    character_threshold_upper=config.threshold_character_upper,
                    affinity_threshold_upper=config.threshold_affinity_upper,
                    scaling_character=config.scale_character,
                    scaling_affinity=config.scale_affinity)

                word_bbox = predicted_bbox['word_bbox']
                char_bbox = np.concatenate(predicted_bbox['characters'], axis=0)
                aff_bbox = np.concatenate(predicted_bbox['affinity'], axis=0)

                word_image = image_i.copy()
                char_image = image_i.copy()
                aff_image = image_i.copy()

                cv2.drawContours(word_image, word_bbox, -1, (0, 255, 0), 2)
                cv2.drawContours(char_image, char_bbox, -1, (0, 255, 0), 2)
                cv2.drawContours(aff_image, aff_bbox, -1, (0, 255, 0), 2)

                plt.imsave(
                    base_path_char + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    char_image)
                plt.imsave(
                    base_path_aff + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    aff_image)
                plt.imsave(
                    base_path_bbox + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    word_image)

                plt.imsave(
                    base_path_character + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    np.float32(character_bbox > config.threshold_character),
                    cmap='gray')
                plt.imsave(
                    base_path_affinity + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    np.float32(affinity_bbox > config.threshold_affinity),
                    cmap='gray')

                # JSON-serialize the predicted boxes
                predicted_bbox['word_bbox'] = predicted_bbox['word_bbox'].tolist()
                predicted_bbox['characters'] = [_.tolist() for _ in predicted_bbox['characters']]
                predicted_bbox['affinity'] = [_.tolist() for _ in predicted_bbox['affinity']]

                with open(base_path_json + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.json', 'w') as f:
                    json.dump(predicted_bbox, f)
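# A hypothetical driver for this synthesize() variant; the folder_dataset
# argument, batch size and output paths below are placeholders, not part of
# the repo.
def _run_synthesize(model, folder_dataset):
    paths = {name: 'output/' + name for name in
             ('affinity', 'character', 'word_bbox', 'char_bbox', 'aff_bbox', 'json')}
    for path in paths.values():
        os.makedirs(path, exist_ok=True)
    dataloader = DataLoader(folder_dataset, batch_size=4, num_workers=8, shuffle=False)
    synthesize(
        dataloader, model,
        paths['affinity'], paths['character'], paths['word_bbox'],
        paths['char_bbox'], paths['aff_bbox'], paths['json'])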
def generate_next_targets(original_dim, output, image, base_target_path, image_name, annots, dataloader, no):

    if 'datapile' in config.dataset_name:
        image_name = image_name.split('/')[-1]

    # visualize = config.visualize_generated and no % config.visualize_freq == 0 and no != 0
    visualize = config.visualize_generated  # Just for debugging

    max_dim = original_dim.max()
    resizing_factor = 768 / max_dim
    before_pad_dim = [int(original_dim[0] * resizing_factor), int(original_dim[1] * resizing_factor)]

    output = np.uint8(output * 255)

    height_pad = (768 - before_pad_dim[0]) // 2
    width_pad = (768 - before_pad_dim[1]) // 2

    # Heatmaps are analysed at half the original resolution; the resulting
    # boxes are scaled back up by 2 below
    character_bbox = cv2.resize(
        output[0, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
        (original_dim[1] // 2, original_dim[0] // 2)) / 255

    affinity_bbox = cv2.resize(
        output[1, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
        (original_dim[1] // 2, original_dim[0] // 2)) / 255

    # Generating word-bbox given character and affinity heatmap
    generated_targets = generate_word_bbox(
        character_bbox, affinity_bbox,
        character_threshold=config.threshold_character,
        affinity_threshold=config.threshold_affinity,
        word_threshold=config.threshold_word,
        character_threshold_upper=config.threshold_character_upper,
        affinity_threshold_upper=config.threshold_affinity_upper,
        scaling_character=config.scale_character,
        scaling_affinity=config.scale_affinity)

    generated_targets['word_bbox'] = generated_targets['word_bbox'] * 2
    generated_targets['characters'] = [i * 2 for i in generated_targets['characters']]
    generated_targets['affinity'] = [i * 2 for i in generated_targets['affinity']]

    if visualize:

        character_bbox = cv2.resize((character_bbox * 255).astype(np.uint8), (original_dim[1], original_dim[0])) / 255
        affinity_bbox = cv2.resize((affinity_bbox * 255).astype(np.uint8), (original_dim[1], original_dim[0])) / 255

        image_i = denormalize_mean_variance(image.data.cpu().numpy().transpose(1, 2, 0))
        image_i = cv2.resize(
            image_i[height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
            (original_dim[1], original_dim[0]))

        # Saving affinity heat map
        plt.imsave(
            base_target_path + '_predicted/affinity/' + '.'.join(image_name.split('.')[:-1]) + '.png',
            np.float32(affinity_bbox > config.threshold_affinity_upper),
            cmap='gray')

        # Saving character heat map
        plt.imsave(
            base_target_path + '_predicted/character/' + '.'.join(image_name.split('.')[:-1]) + '.png',
            np.float32(character_bbox > config.threshold_character_upper),
            cmap='gray')

        cv2.drawContours(image_i, generated_targets['word_bbox'], -1, (0, 255, 0), 2)

        # Saving word bbox drawn on the original image
        plt.imsave(
            base_target_path + '_predicted/word_bbox/' + '.'.join(image_name.split('.')[:-1]) + '.png',
            image_i)

    predicted_word_bbox = generated_targets['word_bbox'].copy()

    # --------------- PostProcessing for creating the targets for the next iteration ---------------- #

    generated_targets = get_weighted_character_target(
        generated_targets,
        {'bbox': annots['bbox'], 'text': annots['text']},
        dataloader.dataset.unknown,
        config.threshold_fscore,
        config.weight_threshold)

    target_word_bbox = generated_targets['word_bbox'].copy()

    f_score = calculate_fscore(
        predicted_word_bbox[:, :, 0, :],
        target_word_bbox[:, :, 0, :],
        text_target=annots['text'],
        unknown=dataloader.dataset.gt['unknown'])['f_score']

    if visualize:

        image_i = denormalize_mean_variance(image.data.cpu().numpy().transpose(1, 2, 0))
        image_i = cv2.resize(
            image_i[height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
            (original_dim[1], original_dim[0]))

        # Generated word_bbox after postprocessing
        cv2.drawContours(image_i, generated_targets['word_bbox'], -1, (0, 255, 0), 2)

        # Saving word bbox after postprocessing
        plt.imsave(
            base_target_path + '_next_target/word_bbox/' + '.'.join(image_name.split('.')[:-1]) + '.png',
            image_i)

        # Generate affinity heatmap after postprocessing
        affinity_target, affinity_weight_map = generate_target_others(
            (image_i.shape[0], image_i.shape[1]),
            generated_targets['affinity'].copy(),
            np.array(generated_targets['weights'])[:, 1])

        # Generate character heatmap after postprocessing
        character_target, characters_weight_map = generate_target_others(
            (image_i.shape[0], image_i.shape[1]),
            generated_targets['characters'].copy(),
            np.array(generated_targets['weights'])[:, 0])

        # Saving the affinity heatmap
        plt.imsave(
            base_target_path + '_next_target/affinity/' + '.'.join(image_name.split('.')[:-1]) + '.png',
            affinity_target,
            cmap='gray')

        # Saving the character heatmap
        plt.imsave(
            base_target_path + '_next_target/character/' + '.'.join(image_name.split('.')[:-1]) + '.png',
            character_target,
            cmap='gray')

        # Saving the affinity weight map
        plt.imsave(
            base_target_path + '_next_target/affinity_weight/' + '.'.join(image_name.split('.')[:-1]) + '.png',
            affinity_weight_map,
            cmap='gray')

        # Saving the character weight map
        plt.imsave(
            base_target_path + '_next_target/character_weight/' + '.'.join(image_name.split('.')[:-1]) + '.png',
            characters_weight_map,
            cmap='gray')

    # Saving the target for next iteration in json format
    generated_targets['word_bbox'] = generated_targets['word_bbox'].tolist()
    generated_targets['characters'] = [word_i.tolist() for word_i in generated_targets['characters']]
    generated_targets['affinity'] = [word_i.tolist() for word_i in generated_targets['affinity']]

    with open(base_target_path + '/' + image_name + '.json', 'w') as f:
        json.dump(generated_targets, f)

    return f_score
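# Unlike the other functions in this file, generate_next_targets() runs the
# connected-component analysis on heatmaps resized to half the original
# resolution, then doubles every coordinate. A sanity check of that round-trip
# (the numbers are illustrative):
def _example_half_resolution_round_trip():
    original_dim = np.array([1000, 1500])                           # (height, width)
    half_res_target = (original_dim[1] // 2, original_dim[0] // 2)  # (750, 500) for cv2.resize
    box_at_half_res = np.array([[5, 10], [30, 10], [30, 20], [5, 20]])
    box_at_full_res = box_at_half_res * 2  # what generated_targets['word_bbox'] * 2 does
    # For odd dimensions the integer division drops a pixel, so scaled boxes
    # can be off by one pixel at the right/bottom edge.
    return half_res_target, box_at_full_res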
def synthesize(
        dataloader, model, base_path_affinity, base_path_character, base_path_bbox,
        base_path_char, base_path_aff, base_path_json):

    """
    Given a dataloader over a set of images and a pre-trained model, generate the character heatmap and affinity
    heatmap
    :param dataloader: A PyTorch dataloader for loading and resizing the images of the folder
    :param model: A pre-trained model
    :param base_path_affinity: Path where to store the predicted affinity heatmap
    :param base_path_character: Path where to store the predicted character heatmap
    :param base_path_bbox: Path where to store the word_bbox overlapped on images
    :param base_path_aff: Path where to store the predicted affinity bbox
    :param base_path_char: Path where to store the predicted character bbox
    :param base_path_json: Path where to store the predicted bbox in json format
    :return: None
    """

    with torch.no_grad():

        model.eval()
        iterator = tqdm(dataloader)

        for no, (image, image_name, original_dim) in enumerate(iterator):

            if config.use_cuda:
                image = image.cuda()

            output = model(image)

            if type(output) == list:
                # If using custom DataParallelModel this is necessary to convert the list to tensor
                output = torch.cat(output, dim=0)

            output = output.data.cpu().numpy()
            output[output < 0] = 0
            output[output > 1] = 1
            original_dim = original_dim.cpu().numpy()

            for i in range(output.shape[0]):

                # --------- Resizing it back to the original image size and saving it ----------- #

                image_i = denormalize_mean_variance(image[i].data.cpu().numpy().transpose(1, 2, 0))

                max_dim = original_dim[i].max()
                resizing_factor = 768 / max_dim
                before_pad_dim = [int(original_dim[i][0] * resizing_factor), int(original_dim[i][1] * resizing_factor)]

                output[i, :, :, :] = np.uint8(output[i, :, :, :] * 255)

                height_pad = (768 - before_pad_dim[0]) // 2
                width_pad = (768 - before_pad_dim[1]) // 2

                image_i = cv2.resize(
                    image_i[height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0]))

                character_bbox = cv2.resize(
                    output[i, 0, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])) / 255

                affinity_bbox = cv2.resize(
                    output[i, 1, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])) / 255

                predicted_bbox = generate_word_bbox(
                    character_bbox, affinity_bbox,
                    character_threshold=config.threshold_character,
                    affinity_threshold=config.threshold_affinity,
                    word_threshold=config.threshold_word,
                    character_threshold_upper=config.threshold_character_upper,
                    affinity_threshold_upper=config.threshold_affinity_upper,
                    scaling_character=config.scale_character,
                    scaling_affinity=config.scale_affinity)

                word_bbox = predicted_bbox['word_bbox']
                char_bbox = np.concatenate(predicted_bbox['characters'], axis=0)
                aff_bbox = np.concatenate(predicted_bbox['affinity'], axis=0)

                word_image = image_i.copy()
                char_image = image_i.copy()
                aff_image = image_i.copy()

                cv2.drawContours(word_image, word_bbox, -1, (0, 255, 0), 2)
                cv2.drawContours(char_image, char_bbox, -1, (0, 255, 0), 2)
                cv2.drawContours(aff_image, aff_bbox, -1, (0, 255, 0), 2)

                # plt.imsave(
                #     base_path_char + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                #     char_image)
                # plt.imsave(
                #     base_path_aff + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                #     aff_image)

                plt.imsave(
                    base_path_bbox + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                    word_image)

                # plt.imsave(
                #     base_path_character + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                #     np.float32(character_bbox > config.threshold_character),
                #     cmap='gray')
                # plt.imsave(
                #     base_path_affinity + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.png',
                #     np.float32(affinity_bbox > config.threshold_affinity),
                #     cmap='gray')

                predicted_bbox['word_bbox'] = predicted_bbox['word_bbox'].tolist()
                predicted_bbox['characters'] = [_.tolist() for _ in predicted_bbox['characters']]
                predicted_bbox['affinity'] = [_.tolist() for _ in predicted_bbox['affinity']]

                with open(base_path_json + '/' + '.'.join(image_name[i].split('.')[:-1]) + '.json', 'w') as f:
                    json.dump(predicted_bbox, f)

                # ---- Crop each predicted word box and OCR it with pytesseract ---- #

                txt_path = './text_folder/' + '.'.join(image_name[i].split('.')[:-1]) + '.txt'
                boxes_printed = 0

                # Each serialized word box is a list of 4 points of shape [1, 2];
                # drop the inner singleton dimension to get plain [x, y] pairs
                word_boxes = [[point[0] for point in box] for box in predicted_bbox['word_bbox']]

                with open(txt_path, 'a') as f:
                    for box in word_boxes:
                        xs = [p[0] for p in box]
                        ys = [p[1] for p in box]
                        # Crop the axis-aligned bounding rectangle of the (possibly rotated) word box
                        crop_img = image_i[min(ys):max(ys), min(xs):max(xs)]
                        text = pytesseract.image_to_string(crop_img).rstrip()
                        boxes_printed += 1
                        f.write(text + '\n')
                    f.write('Total number of boxes : ' + str(len(word_boxes)) + '\n')
                    f.write('Total number of boxes printed: ' + str(boxes_printed))
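# The OCR pass above assumes pytesseract is importable, the tesseract binary
# is installed, and ./text_folder exists. A hypothetical preflight check:
def _check_ocr_prerequisites():
    import pytesseract
    os.makedirs('./text_folder', exist_ok=True)
    # Raises if the tesseract binary is not on PATH
    print(pytesseract.get_tesseract_version())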
def train(model, optimizer, iteration):

    """
    Train the weakly-supervised model iteratively
    :param model: Pre-trained model on SynthText
    :param optimizer: Pre-trained model's optimizer
    :param iteration: current iteration of weak-supervision
    :return: model, optimizer, all_loss, all_accuracy
    """

    def change_lr():
        # Change learning rate while training
        for param_group in optimizer.param_groups:
            param_group['lr'] = config.lr[iteration]
        print('Learning Rate Changed to ', config.lr[iteration])

    change_lr()

    dataloader = DataLoader(
        DataLoaderMIX('train', iteration), batch_size=config.batch_size['train'], num_workers=8, shuffle=True)
    loss_criterian = DataParallelCriterion(Criterian())

    model.train()
    optimizer.zero_grad()

    iterator = tqdm(dataloader)

    all_loss = []
    all_accuracy = []
    all_count = []

    ground_truth = iterator.iterable.dataset.gt

    for no, (image, character_map, affinity_map, character_weight, affinity_weight, word_bbox, original_dim) in \
            enumerate(iterator):

        if config.use_cuda:
            image, character_map, affinity_map = image.cuda(), character_map.cuda(), affinity_map.cuda()
            character_weight, affinity_weight = character_weight.cuda(), affinity_weight.cuda()

        output = model(image)

        # Gradients are accumulated over 4 mini-batches, so scale the loss accordingly
        loss = loss_criterian(output, character_map, affinity_map, character_weight, affinity_weight).mean() / 4

        all_loss.append(loss.item() * 4)

        loss.backward()

        if (no + 1) % 4 == 0:
            optimizer.step()
            optimizer.zero_grad()

        # ---------- Calculating the F-score ------------ #

        if type(output) == list:
            output = torch.cat(output, dim=0)

        output = output.data.cpu().numpy()
        # image = image.data.cpu().numpy()
        original_dim = original_dim.cpu().numpy()

        target_bbox = []
        predicted_ic13 = []
        current_count = 0

        word_bbox = word_bbox.numpy()

        for idx, entry in enumerate(word_bbox):

            if entry[1] == 1:

                # ToDo - Understand why model.train() gives poor results but
                #  model.eval() with torch.no_grad() gives better results

                max_dim = original_dim[idx].max()
                resizing_factor = 768 / max_dim
                before_pad_dim = [int(original_dim[idx][0] * resizing_factor), int(original_dim[idx][1] * resizing_factor)]

                output[idx, :, :, :] = np.uint8(output[idx, :, :, :] * 255)

                height_pad = (768 - before_pad_dim[0]) // 2
                width_pad = (768 - before_pad_dim[1]) // 2

                character_bbox = cv2.resize(
                    output[idx, 0, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[idx][1], original_dim[idx][0])) / 255

                affinity_bbox = cv2.resize(
                    output[idx, 1, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[idx][1], original_dim[idx][0])) / 255

                predicted_bbox = generate_word_bbox(
                    character_bbox, affinity_bbox,
                    character_threshold=config.threshold_character,
                    affinity_threshold=config.threshold_affinity,
                    word_threshold=config.threshold_word)['word_bbox']

                predicted_ic13.append(predicted_bbox)
                target_bbox.append(np.array(ground_truth[entry[0] % len(ground_truth)][1]['word_bbox'], dtype=np.int64))

                current_count += 1

        all_accuracy.append(
            calculate_batch_fscore(predicted_ic13, target_bbox, threshold=config.threshold_fscore) * current_count)
        all_count.append(current_count)

        # ------------- Setting Description ---------------- #

        if np.array(all_count)[-min(1000, len(all_count)):].sum() != 0:
            f_score = int(
                np.array(all_accuracy)[-min(1000, len(all_accuracy)):].sum() * 100000000 /
                np.array(all_count)[-min(1000, len(all_count)):].sum()) / 100000000
        else:
            f_score = 0

        iterator.set_description(
            'Loss:' + str(int(loss.item() * 4 * 100000) / 100000) +
            ' Iterations:[' + str(no) + '/' + str(len(iterator)) + ']' +
            ' Average Loss:' + str(int(np.array(all_loss)[-min(1000, len(all_loss)):].mean() * 100000) / 100000) +
            '| Average F-Score: ' + str(f_score))

    # Flush any remaining accumulated gradients
    if len(iterator) % 4 != 0:
        optimizer.step()
        optimizer.zero_grad()

    torch.cuda.empty_cache()

    return model, optimizer, all_loss, all_accuracy
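# train() accumulates gradients over 4 mini-batches (loss divided by 4, one
# optimizer.step() every 4th batch), simulating an effective batch size of
# 4 * config.batch_size['train']. The pattern in isolation; compute_loss and
# batches below are stand-ins for the criterion and dataloader:
def _example_gradient_accumulation(model, optimizer, batches, compute_loss, accum_steps=4):
    optimizer.zero_grad()
    for step, batch in enumerate(batches):
        loss = compute_loss(model, batch) / accum_steps  # scale so gradients average, not sum
        loss.backward()                                  # gradients add up across backward calls
        if (step + 1) % accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()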
def test(model):

    """
    Test the weakly-supervised model
    :param model: Pre-trained model on SynthText
    :return: Average F-score over the test set
    """

    dataloader = DataLoader(
        DataLoaderEvalICDAR2013('test'), batch_size=config.batch_size['train'], num_workers=8, shuffle=False)

    with torch.no_grad():

        model.eval()
        iterator = tqdm(dataloader)
        all_accuracy = []

        ground_truth = dataloader.dataset.gt

        for no, (image, image_name, original_dim, item) in enumerate(iterator):

            annots = []

            for i in item:
                annot = ground_truth['annots'][dataloader.dataset.imnames[i]]
                annots.append(annot)

            if config.use_cuda:
                image = image.cuda()

            output = model(image)

            if type(output) == list:
                output = torch.cat(output, dim=0)

            output = output.data.cpu().numpy()
            original_dim = original_dim.cpu().numpy()

            f_score = []

            for i in range(output.shape[0]):

                # --------- Resizing it back to the original image size ----------- #

                max_dim = original_dim[i].max()
                resizing_factor = 768 / max_dim
                before_pad_dim = [int(original_dim[i][0] * resizing_factor), int(original_dim[i][1] * resizing_factor)]

                output[i, :, :, :] = np.uint8(output[i, :, :, :] * 255)

                height_pad = (768 - before_pad_dim[0]) // 2
                width_pad = (768 - before_pad_dim[1]) // 2

                character_bbox = cv2.resize(
                    output[i, 0, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])) / 255

                affinity_bbox = cv2.resize(
                    output[i, 1, height_pad:height_pad + before_pad_dim[0], width_pad:width_pad + before_pad_dim[1]],
                    (original_dim[i][1], original_dim[i][0])) / 255

                generated_targets = generate_word_bbox(
                    character_bbox, affinity_bbox,
                    character_threshold=config.threshold_character,
                    affinity_threshold=config.threshold_affinity,
                    word_threshold=config.threshold_word)

                predicted_word_bbox = generated_targets['word_bbox'].copy()

                f_score.append(calculate_fscore(predicted_word_bbox[:, :, 0, :], np.array(annots[i]['bbox'])))

            all_accuracy.append(np.mean(f_score))

            iterator.set_description('F-score: ' + str(np.mean(all_accuracy)))

        torch.cuda.empty_cache()

    return np.mean(all_accuracy)
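# train() and test() together support the iterative weak-supervision scheme:
# fine-tune on the current pseudo-targets, evaluate, regenerate, repeat. A
# hypothetical outer loop; the checkpoint path is made up, and iterating over
# len(config.lr) only reflects that train() indexes config.lr by iteration:
def _weak_supervision_loop(model, optimizer):
    for iteration in range(len(config.lr)):
        model, optimizer, all_loss, all_accuracy = train(model, optimizer, iteration)
        f_score = test(model)
        print('Iteration', iteration, 'test F-score:', f_score)
        torch.save(
            {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()},
            'model_iteration_' + str(iteration) + '.pkl')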